From 966a5b9b0734dd4ef370c5f11d353c67a4ecb528 Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Thu, 16 Oct 2025 16:35:43 -0700
Subject: [PATCH 01/59] Changed VERSION to 2.9.0

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
---
 build_tools/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/VERSION.txt b/build_tools/VERSION.txt
index 8bfb1cae85..c8e38b6140 100644
--- a/build_tools/VERSION.txt
+++ b/build_tools/VERSION.txt
@@ -1 +1 @@
-2.9.0.dev0
+2.9.0

From 739c6565b10f8c70f9e0c6e86e50f027384999f5 Mon Sep 17 00:00:00 2001
From: Kshitij Lakhani <33047503+KshitijLakhani@users.noreply.github.com>
Date: Thu, 16 Oct 2025 20:45:47 -0700
Subject: [PATCH 02/59] [JAX] Fix imports in test for deprecated
 jax.experimental.pjit (#2274)

* Fix imports in test for deprecated jax.experimental.pjit

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix: Pass NamedSharding instead of PartitionSpec to compare_ops() so that when the in and out sharding is used to create a jitted function, it has the mesh info

Signed-off-by: Kshitij  Janardan Lakhani <klakhani@login-eos01.eos.clusters.nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>
Signed-off-by: Kshitij  Janardan Lakhani <klakhani@login-eos01.eos.clusters.nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kshitij  Janardan Lakhani <klakhani@login-eos01.eos.clusters.nvidia.com>
---
 tests/jax/distributed_test_base.py      | 14 +++++++------
 tests/jax/test_distributed_layernorm.py | 26 ++++++++++++++++---------
 tests/jax/test_distributed_softmax.py   | 10 ++++++----
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/tests/jax/distributed_test_base.py b/tests/jax/distributed_test_base.py
index 4693086b83..137fa480dd 100644
--- a/tests/jax/distributed_test_base.py
+++ b/tests/jax/distributed_test_base.py
@@ -8,7 +8,7 @@
 import pytest
 
 import jax
-from jax.experimental.pjit import pjit, _UNSPECIFIED
+from jax._src.sharding_impls import UNSPECIFIED as _UNSPECIFIED
 
 from transformer_engine.jax.sharding import MeshResource
 
@@ -154,13 +154,15 @@ def compare_ops(
         grad_args = tuple(range(len(inputs)))
 
     target_grad_func = jax.value_and_grad(target_func, argnums=grad_args)
-    target_pjitter = pjit(target_grad_func, in_shardings=in_shardings, out_shardings=out_shardings)
-    target_fwd, target_grads = target_pjitter(*inputs, **kwargs)
-    target_hlo = target_pjitter.lower(*inputs, **kwargs).compile().as_text()
+    target_jitter = jax.jit(
+        target_grad_func, in_shardings=in_shardings, out_shardings=out_shardings
+    )
+    target_fwd, target_grads = target_jitter(*inputs, **kwargs)
+    target_hlo = target_jitter.lower(*inputs, **kwargs).compile().as_text()
 
     ref_grad_func = jax.value_and_grad(ref_func, argnums=grad_args)
-    ref_pjitter = pjit(ref_grad_func, in_shardings=in_shardings, out_shardings=out_shardings)
-    ref_fwd, ref_grads = ref_pjitter(*inputs, **kwargs)
+    ref_jitter = jax.jit(ref_grad_func, in_shardings=in_shardings, out_shardings=out_shardings)
+    ref_fwd, ref_grads = ref_jitter(*inputs, **kwargs)
 
     assert_allclose(target_fwd, ref_fwd, dtype=metric_fwd_dtype)
 
diff --git a/tests/jax/test_distributed_layernorm.py b/tests/jax/test_distributed_layernorm.py
index 977d010afd..d551b73905 100644
--- a/tests/jax/test_distributed_layernorm.py
+++ b/tests/jax/test_distributed_layernorm.py
@@ -134,9 +134,12 @@ def ref_func(x, gamma, beta):
         devices = np.asarray(jax.devices()[:device_count]).reshape(*mesh_shape)
         mesh = Mesh(devices, mesh_axes)
         with mesh, autocast(enabled=True, recipe=fp8_recipe, mesh_resource=mesh_resource):
-            x_ = jax.device_put(x, NamedSharding(mesh, x_pspec))
-            gamma_ = jax.device_put(gamma, NamedSharding(mesh, g_pspec))
-            beta_ = jax.device_put(beta, NamedSharding(mesh, b_pspec))
+            x_named_sharding = NamedSharding(mesh, x_pspec)
+            g_named_sharding = NamedSharding(mesh, g_pspec)
+            b_named_sharding = NamedSharding(mesh, b_pspec)
+            x_ = jax.device_put(x, x_named_sharding)
+            gamma_ = jax.device_put(gamma, g_named_sharding)
+            beta_ = jax.device_put(beta, b_named_sharding)
 
             with warnings.catch_warnings(record=True) as warns:
                 try:
@@ -148,8 +151,11 @@ def ref_func(x, gamma, beta):
                         grad_args=(0, 1, 2),
                         metric_fwd_dtype=q_dtype,
                         metric_bwd_dtype=q_dtype,
-                        in_shardings=(x_pspec, g_pspec, b_pspec),
-                        out_shardings=(None, (x_pspec, g_pspec, b_pspec)),
+                        in_shardings=(x_named_sharding, g_named_sharding, b_named_sharding),
+                        out_shardings=(
+                            None,
+                            (x_named_sharding, g_named_sharding, b_named_sharding),
+                        ),
                     )
                 except AssertionError as err:
                     # Layernorm should still produce the correct numerical result with
@@ -210,8 +216,10 @@ def ref_func(x, gamma):
         devices = np.asarray(jax.devices()[:device_count]).reshape(*mesh_shape)
         mesh = Mesh(devices, mesh_axes)
         with mesh, autocast(enabled=True, recipe=fp8_recipe, mesh_resource=mesh_resource):
-            x_ = jax.device_put(x, NamedSharding(mesh, x_pspec))
-            gamma_ = jax.device_put(gamma, NamedSharding(mesh, g_pspec))
+            x_named_sharding = NamedSharding(mesh, x_pspec)
+            g_named_sharding = NamedSharding(mesh, g_pspec)
+            x_ = jax.device_put(x, x_named_sharding)
+            gamma_ = jax.device_put(gamma, g_named_sharding)
 
             with warnings.catch_warnings(record=True) as warns:
                 try:
@@ -223,8 +231,8 @@ def ref_func(x, gamma):
                         grad_args=(0, 1),
                         metric_fwd_dtype=q_dtype,
                         metric_bwd_dtype=q_dtype,
-                        in_shardings=(x_pspec, g_pspec),
-                        out_shardings=(None, (x_pspec, g_pspec)),
+                        in_shardings=(x_named_sharding, g_named_sharding),
+                        out_shardings=(None, (x_named_sharding, g_named_sharding)),
                     )
                 except AssertionError as err:
                     # RmsNorm should still produce the correct numerical result with
diff --git a/tests/jax/test_distributed_softmax.py b/tests/jax/test_distributed_softmax.py
index 2bd4d862a6..f1ae6c9e49 100644
--- a/tests/jax/test_distributed_softmax.py
+++ b/tests/jax/test_distributed_softmax.py
@@ -103,8 +103,10 @@ def impl_test_softmax(
         devices = np.asarray(jax.devices()[:device_count]).reshape(*mesh_shape)
         mesh = Mesh(devices, mesh_axes)
         with mesh, autocast(mesh_resource=mesh_resource):
-            x_ = jax.device_put(x, NamedSharding(mesh, x_pspec))
-            mask_ = jax.device_put(mask, NamedSharding(mesh, mask_pspec))
+            x_named_sharding = NamedSharding(mesh, x_pspec)
+            mask_named_sharding = NamedSharding(mesh, mask_pspec)
+            x_ = jax.device_put(x, x_named_sharding)
+            mask_ = jax.device_put(mask, mask_named_sharding)
 
             with warnings.catch_warnings(record=True) as warns:
                 try:
@@ -116,8 +118,8 @@ def impl_test_softmax(
                         grad_args=(0,),
                         metric_fwd_dtype=dtype,
                         metric_bwd_dtype=dtype,
-                        in_shardings=(x_pspec, mask_pspec),
-                        out_shardings=(None, (x_pspec,)),
+                        in_shardings=(x_named_sharding, mask_named_sharding),
+                        out_shardings=(None, x_named_sharding),
                     )
                 except AssertionError as err:
                     # Softmax should still produce the correct numerical result with

From c2a643d50b91ce885f5f1b1bd144651bc86dff22 Mon Sep 17 00:00:00 2001
From: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Date: Sat, 18 Oct 2025 00:00:01 -0400
Subject: [PATCH 03/59] Wheels for cuda 13 (#2278)

* Support wheel build for cuda 13

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fixes

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fixes for cu13 runtime, format

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Add documentation

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Better error handling

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* fix

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* fix jax sdist

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Modify function names

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
---
 README.rst                               |   2 +-
 build_tools/wheel_utils/Dockerfile.aarch |  29 ++++--
 build_tools/wheel_utils/Dockerfile.x86   |  29 ++++--
 build_tools/wheel_utils/build_wheels.sh  |  18 ++--
 build_tools/wheel_utils/launch_aarch.sh  |  28 ++++-
 build_tools/wheel_utils/launch_x86.sh    |  28 ++++-
 docs/installation.rst                    |   8 ++
 setup.py                                 |   5 +-
 transformer_engine/common/__init__.py    | 124 ++++++++++++++++-------
 transformer_engine/jax/setup.py          |  32 +++++-
 transformer_engine/pytorch/setup.py      |  14 ++-
 11 files changed, 243 insertions(+), 74 deletions(-)

diff --git a/README.rst b/README.rst
index 9b65c60ae8..50c1dcd807 100644
--- a/README.rst
+++ b/README.rst
@@ -205,7 +205,7 @@ pip Installation
 **Prerequisites for pip installation:**
 
 * A compatible C++ compiler
-* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) installed
+* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) if installing from source.
 
 To install the latest stable version with pip:
 
diff --git a/build_tools/wheel_utils/Dockerfile.aarch b/build_tools/wheel_utils/Dockerfile.aarch
index 223c4a7f1c..404cb941cb 100644
--- a/build_tools/wheel_utils/Dockerfile.aarch
+++ b/build_tools/wheel_utils/Dockerfile.aarch
@@ -7,23 +7,34 @@ FROM quay.io/pypa/manylinux_2_28_aarch64
 WORKDIR /TransformerEngine/
 COPY ../.. /TransformerEngine/
 
-ARG VER="12-3"
-ARG ARCH="aarch64"
-RUN dnf -y install vim
+ARG CUDA_MAJOR="12"
+ARG CUDA_MINOR="3"
+
+# Args for build_wheels.sh
+ARG BUILD_METAPACKAGE=true
+ARG BUILD_COMMON=true
+ARG BUILD_PYTORCH=true
+ARG BUILD_JAX=true
+ENV BUILD_METAPACKAGE=${BUILD_METAPACKAGE}
+ENV BUILD_COMMON=${BUILD_COMMON}
+ENV BUILD_PYTORCH=${BUILD_PYTORCH}
+ENV BUILD_JAX=${BUILD_JAX}
+ENV CUDA_MAJOR=${CUDA_MAJOR}
 
 # Cuda toolkit, cudnn, driver.
 RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
 RUN dnf -y install epel-release
-RUN dnf -y install cuda-compiler-${VER}.${ARCH} \
-                   cuda-libraries-${VER}.${ARCH} \
-                   cuda-libraries-devel-${VER}.${ARCH}
-RUN dnf -y install --allowerasing cudnn9-cuda-12
+RUN dnf -y install cuda-compiler-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64 \
+                   cuda-libraries-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64 \
+                   cuda-libraries-devel-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64
+RUN dnf -y install --allowerasing cudnn9-cuda-${CUDA_MAJOR}
 RUN dnf clean all
 RUN rm -rf /var/cache/dnf/*
 RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf
-RUN dnf -y install cuda-toolkit
+RUN dnf -y install cuda-toolkit-${CUDA_MAJOR}
 RUN dnf clean all
 RUN dnf -y install glog.aarch64 glog-devel.aarch64
+RUN dnf -y install libnccl libnccl-devel libnccl-static
 
 ENV PATH="/usr/local/cuda/bin:${PATH}"
 ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
@@ -33,4 +44,4 @@ ENV CUDA_PATH=/usr/local/cuda
 ENV CUDADIR=/usr/local/cuda
 ENV NVTE_RELEASE_BUILD=1
 
-CMD ["/bin/bash", "/TransformerEngine/build_tools/wheel_utils/build_wheels.sh", "manylinux_2_28_aarch64", "true", "true", "false", "false", "false"]
+CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_aarch64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"]
diff --git a/build_tools/wheel_utils/Dockerfile.x86 b/build_tools/wheel_utils/Dockerfile.x86
index 26122eed9b..daa7f961cd 100644
--- a/build_tools/wheel_utils/Dockerfile.x86
+++ b/build_tools/wheel_utils/Dockerfile.x86
@@ -7,23 +7,34 @@ FROM quay.io/pypa/manylinux_2_28_x86_64
 WORKDIR /TransformerEngine/
 COPY ../.. /TransformerEngine/
 
-ARG VER="12-3"
-ARG ARCH="x86_64"
-RUN dnf -y install vim
+ARG CUDA_MAJOR="12"
+ARG CUDA_MINOR="3"
+
+# Args for build_wheels.sh
+ARG BUILD_METAPACKAGE=true
+ARG BUILD_COMMON=true
+ARG BUILD_PYTORCH=true
+ARG BUILD_JAX=true
+ENV BUILD_METAPACKAGE=${BUILD_METAPACKAGE}
+ENV BUILD_COMMON=${BUILD_COMMON}
+ENV BUILD_PYTORCH=${BUILD_PYTORCH}
+ENV BUILD_JAX=${BUILD_JAX}
+ENV CUDA_MAJOR=${CUDA_MAJOR}
 
 # Cuda toolkit, cudnn, driver.
 RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 RUN dnf -y install epel-release
-RUN dnf -y install cuda-compiler-${VER}.${ARCH} \
-                   cuda-libraries-${VER}.${ARCH} \
-                   cuda-libraries-devel-${VER}.${ARCH}
-RUN dnf -y install --allowerasing cudnn9-cuda-12
+RUN dnf -y install cuda-compiler-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64 \
+                   cuda-libraries-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64 \
+                   cuda-libraries-devel-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64
+RUN dnf -y install --allowerasing cudnn9-cuda-${CUDA_MAJOR}
 RUN dnf clean all
 RUN rm -rf /var/cache/dnf/*
 RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf
-RUN dnf -y install cuda-toolkit
+RUN dnf -y install cuda-toolkit-${CUDA_MAJOR}
 RUN dnf clean all
 RUN dnf -y install glog.x86_64 glog-devel.x86_64
+RUN dnf -y install libnccl libnccl-devel libnccl-static
 
 ENV PATH="/usr/local/cuda/bin:${PATH}"
 ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
@@ -33,4 +44,4 @@ ENV CUDA_PATH=/usr/local/cuda
 ENV CUDADIR=/usr/local/cuda
 ENV NVTE_RELEASE_BUILD=1
 
-CMD ["/bin/bash", "/TransformerEngine/build_tools/wheel_utils/build_wheels.sh", "manylinux_2_28_x86_64", "true", "true", "true", "true", "true"]
+CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_x86_64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"]
\ No newline at end of file
diff --git a/build_tools/wheel_utils/build_wheels.sh b/build_tools/wheel_utils/build_wheels.sh
index bf4f9d2bc2..954a8f1c67 100644
--- a/build_tools/wheel_utils/build_wheels.sh
+++ b/build_tools/wheel_utils/build_wheels.sh
@@ -9,8 +9,10 @@ BUILD_METAPACKAGE=${2:-true}
 BUILD_COMMON=${3:-true}
 BUILD_PYTORCH=${4:-true}
 BUILD_JAX=${5:-true}
+CUDA_MAJOR=${6:-12}
 
 export NVTE_RELEASE_BUILD=1
+export PIP_CONSTRAINT=""
 export TARGET_BRANCH=${TARGET_BRANCH:-}
 mkdir -p /wheelhouse/logs
 
@@ -21,7 +23,7 @@ git checkout $TARGET_BRANCH
 git submodule update --init --recursive
 
 # Install deps
-/opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja
+/opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja setuptools wheel nvidia-mathdx==25.1.1
 
 if $BUILD_METAPACKAGE ; then
         cd /TransformerEngine
@@ -36,32 +38,32 @@ if $BUILD_COMMON ; then
         # Create the wheel.
         /opt/python/cp310-cp310/bin/python setup.py bdist_wheel --verbose --python-tag=py3 --plat-name=$PLATFORM 2>&1 | tee /wheelhouse/logs/common.txt
 
-        # Repack the wheel for cuda specific package, i.e. cu12.
+        # Repack the wheel for specific cuda version.
         /opt/python/cp310-cp310/bin/wheel unpack dist/*
         # From python 3.10 to 3.11, the package name delimiter in metadata got changed from - (hyphen) to _ (underscore).
-        sed -i "s/Name: transformer-engine/Name: transformer-engine-cu12/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
-        sed -i "s/Name: transformer_engine/Name: transformer_engine_cu12/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
-        mv "${WHL_BASE}/${WHL_BASE}.dist-info" "${WHL_BASE}/transformer_engine_cu12-${VERSION}.dist-info"
+        sed -i "s/Name: transformer-engine/Name: transformer-engine-cu${CUDA_MAJOR}/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
+        sed -i "s/Name: transformer_engine/Name: transformer_engine_cu${CUDA_MAJOR}/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
+        mv "${WHL_BASE}/${WHL_BASE}.dist-info" "${WHL_BASE}/transformer_engine_cu${CUDA_MAJOR}-${VERSION}.dist-info"
         /opt/python/cp310-cp310/bin/wheel pack ${WHL_BASE}
 
         # Rename the wheel to make it python version agnostic.
         whl_name=$(basename dist/*)
         IFS='-' read -ra whl_parts <<< "$whl_name"
-        whl_name_target="${whl_parts[0]}_cu12-${whl_parts[1]}-py3-none-${whl_parts[4]}"
+        whl_name_target="${whl_parts[0]}_cu${CUDA_MAJOR}-${whl_parts[1]}-py3-none-${whl_parts[4]}"
         rm -rf $WHL_BASE dist
         mv *.whl /wheelhouse/"$whl_name_target"
 fi
 
 if $BUILD_PYTORCH ; then
 	cd /TransformerEngine/transformer_engine/pytorch
-        /opt/python/cp310-cp310/bin/pip install torch
+	/opt/python/cp310-cp310/bin/pip install torch
 	/opt/python/cp310-cp310/bin/python setup.py sdist 2>&1 | tee /wheelhouse/logs/torch.txt
 	cp dist/* /wheelhouse/
 fi
 
 if $BUILD_JAX ; then
 	cd /TransformerEngine/transformer_engine/jax
-        /opt/python/cp310-cp310/bin/pip install "jax[cuda12_local]" jaxlib
+	/opt/python/cp310-cp310/bin/pip install "jax[cuda${CUDA_MAJOR}_local]" jaxlib
 	/opt/python/cp310-cp310/bin/python setup.py sdist 2>&1 | tee /wheelhouse/logs/jax.txt
 	cp dist/* /wheelhouse/
 fi
diff --git a/build_tools/wheel_utils/launch_aarch.sh b/build_tools/wheel_utils/launch_aarch.sh
index 04e3cd6916..85f754ca19 100644
--- a/build_tools/wheel_utils/launch_aarch.sh
+++ b/build_tools/wheel_utils/launch_aarch.sh
@@ -2,7 +2,29 @@
 #
 # See LICENSE for license information.
 
-docker build --no-cache -t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
+# Remove leftovers.
+rm -rf aarch_wheelhouse_cu12 aarch_wheelhouse_cu13
+
+# CUDA 12.
+docker build --no-cache \
+  --build-arg CUDA_MAJOR=12 \
+  --build-arg CUDA_MINOR=3 \
+  --build-arg BUILD_METAPACKAGE=false \
+  --build-arg BUILD_COMMON=true \
+  --build-arg BUILD_PYTORCH=false \
+  --build-arg BUILD_JAX=false \
+  -t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
+docker run --runtime=nvidia --gpus=all --ipc=host "aarch_wheel"
+docker cp $(docker ps -aq | head -1):/wheelhouse aarch_wheelhouse_cu12
+
+# CUDA 13.
+docker build --no-cache \
+  --build-arg CUDA_MAJOR=13 \
+  --build-arg CUDA_MINOR=0 \
+  --build-arg BUILD_METAPACKAGE=false \
+  --build-arg BUILD_COMMON=true \
+  --build-arg BUILD_PYTORCH=false \
+  --build-arg BUILD_JAX=false \
+  -t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
 docker run --runtime=nvidia --gpus=all --ipc=host "aarch_wheel"
-rm -rf aarch_wheelhouse
-docker cp $(docker ps -aq | head -1):/wheelhouse/ aarch_wheelhouse
+docker cp $(docker ps -aq | head -1):/wheelhouse aarch_wheelhouse_cu13
diff --git a/build_tools/wheel_utils/launch_x86.sh b/build_tools/wheel_utils/launch_x86.sh
index b0d20be3f4..11fc522947 100644
--- a/build_tools/wheel_utils/launch_x86.sh
+++ b/build_tools/wheel_utils/launch_x86.sh
@@ -2,7 +2,29 @@
 #
 # See LICENSE for license information.
 
-docker build --no-cache -t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
+# Remove leftovers.
+rm -rf x86_wheelhouse_cu12 x86_wheelhouse_cu13
+
+# CUDA 12.
+docker build --no-cache \
+  --build-arg CUDA_MAJOR=12 \
+  --build-arg CUDA_MINOR=3 \
+  --build-arg BUILD_METAPACKAGE=true \
+  --build-arg BUILD_COMMON=true \
+  --build-arg BUILD_PYTORCH=true \
+  --build-arg BUILD_JAX=true \
+  -t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
+docker run --runtime=nvidia --gpus=all --ipc=host "x86_wheel"
+docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse_cu12
+
+# CUDA 13.
+docker build --no-cache \
+  --build-arg CUDA_MAJOR=13 \
+  --build-arg CUDA_MINOR=0 \
+  --build-arg BUILD_METAPACKAGE=false \
+  --build-arg BUILD_COMMON=true \
+  --build-arg BUILD_PYTORCH=false \
+  --build-arg BUILD_JAX=false \
+  -t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
 docker run --runtime=nvidia --gpus=all --ipc=host "x86_wheel"
-rm -rf x86_wheelhouse
-docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse
+docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse_cu13
diff --git a/docs/installation.rst b/docs/installation.rst
index ecb1e9a0dd..a8bb74fd1a 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -38,6 +38,14 @@ Transformer Engine can be directly installed from `our PyPI <https://pypi.org/pr
 
 To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch]). Transformer Engine ships wheels for the core library. Source distributions are shipped for the JAX and PyTorch extensions.
 
+The core package from Transformer Engine (without any framework extensions) can be installed via:
+
+.. code-block:: bash
+
+    pip3 install transformer_engine[core]
+
+By default, this will install the core library compiled for CUDA 12. The cuda major version can be specified by modified the extra dependency to `core_cu12` or `core_cu13`.
+
 pip - from GitHub
 -----------------------
 
diff --git a/setup.py b/setup.py
index c932da5e02..a820265c30 100644
--- a/setup.py
+++ b/setup.py
@@ -140,8 +140,11 @@ def setup_requirements() -> Tuple[List[str], List[str]]:
         ext_modules = []
         package_data = {}
         include_package_data = False
-        install_requires = ([f"transformer_engine_cu12=={__version__}"],)
+        install_requires = []
         extras_require = {
+            "core": [f"transformer_engine_cu12=={__version__}"],
+            "core_cu12": [f"transformer_engine_cu12=={__version__}"],
+            "core_cu13": [f"transformer_engine_cu13=={__version__}"],
             "pytorch": [f"transformer_engine_torch=={__version__}"],
             "jax": [f"transformer_engine_jax=={__version__}"],
         }
diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index dd1ec480b2..5e1318cf86 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -8,22 +8,18 @@
 import functools
 import glob
 import importlib
-from importlib.metadata import version, metadata, PackageNotFoundError
-import logging
+from importlib.metadata import version, distribution, PackageNotFoundError
 import os
 from pathlib import Path
 import platform
 import subprocess
 import sys
 import sysconfig
-from typing import Optional
-
-
-_logger = logging.getLogger(__name__)
+from typing import Optional, Tuple
 
 
 @functools.lru_cache(maxsize=None)
-def _is_pip_package_installed(package) -> bool:
+def _is_package_installed(package) -> bool:
     """Check if the given package is installed via pip."""
 
     # This is needed because we only want to return true
@@ -31,12 +27,34 @@ def _is_pip_package_installed(package) -> bool:
     # if it's importable in the current directory due to
     # the presence of the shared library module.
     try:
-        metadata(package)
+        distribution(package)
     except PackageNotFoundError:
         return False
     return True
 
 
+@functools.lru_cache(maxsize=None)
+def _is_package_installed_from_wheel(package) -> bool:
+    """Check if the given package is installed via PyPI."""
+
+    if not _is_package_installed(package):
+        return False
+
+    te_dist = distribution(package)
+    te_wheel_file = ""
+    for file_path in te_dist.files:
+        if file_path.name == "WHEEL":
+            te_wheel_file = te_dist.locate_file("") / file_path
+    if not te_wheel_file:
+        return False
+
+    with te_wheel_file.open("r") as f:
+        for line in f:
+            if line.startswith("Root-Is-Purelib:"):
+                return line.strip().split(":")[1].strip().lower() == "true"
+    return False
+
+
 @functools.lru_cache(maxsize=None)
 def _find_shared_object_in_te_dir(te_path: Path, prefix: str) -> Optional[Path]:
     """
@@ -112,6 +130,19 @@ def _get_shared_object_file(library: str) -> Path:
     )
 
 
+def get_te_core_package_info() -> Tuple[bool, str, str]:
+    """
+    Check if Tranformer Engine core package is installed.
+    Returns the module name and version if found.
+    """
+
+    te_core_packages = ("transformer-engine-cu12", "transformer-engine-cu13")
+    for package in te_core_packages:
+        if _is_package_installed(package):
+            return True, package, version(package)
+    return False, "", ""
+
+
 @functools.lru_cache(maxsize=None)
 def load_framework_extension(framework: str) -> None:
     """
@@ -130,39 +161,30 @@ def load_framework_extension(framework: str) -> None:
     if framework == "torch":
         extra_dep_name = "pytorch"
 
+    # Find the TE packages. The core and framework packages can only be installed via PyPI.
+    # For the `transformer-engine` package, we need to check explicity.
+    te_core_installed, te_core_package_name, te_core_version = get_te_core_package_info()
+    te_framework_installed = _is_package_installed(module_name)
+    te_installed = _is_package_installed("transformer_engine")
+    te_installed_via_pypi = _is_package_installed_from_wheel("transformer_engine")
+
+    assert te_installed, "Could not find `transformer_engine`."
+
     # If the framework extension pip package is installed, it means that TE is installed via
     # PyPI. For this case we need to make sure that the metapackage, the core lib, and framework
-    # extension are all installed via PyPI and have matching version.
-    if _is_pip_package_installed(module_name):
-        assert _is_pip_package_installed(
-            "transformer_engine"
-        ), "Could not find `transformer-engine`."
-        assert _is_pip_package_installed(
-            "transformer_engine_cu12"
-        ), "Could not find `transformer-engine-cu12`."
-        assert (
-            version(module_name)
-            == version("transformer-engine")
-            == version("transformer-engine-cu12")
-        ), (
-            "TransformerEngine package version mismatch. Found"
+    # extension are all installed via PyPI and have matching versions.
+    if te_framework_installed:
+        assert te_installed_via_pypi, "Could not find `transformer-engine` PyPI package."
+        assert te_core_installed, "Could not find TE core package `transformer-engine-cu*`."
+
+        assert version(module_name) == version("transformer-engine") == te_core_version, (
+            "Transformer Engine package version mismatch. Found"
             f" {module_name} v{version(module_name)}, transformer-engine"
-            f" v{version('transformer-engine')}, and transformer-engine-cu12"
-            f" v{version('transformer-engine-cu12')}. Install transformer-engine using "
-            f"'pip3 install transformer-engine[{extra_dep_name}]==VERSION'"
+            f" v{version('transformer-engine')}, and {te_core_package_name}"
+            f" v{te_core_version}. Install transformer-engine using "
+            f"'pip3 install --no-build-isolation transformer-engine[{extra_dep_name}]==VERSION'"
         )
 
-    # If the core package is installed via PyPI, log if
-    # the framework extension is not found from PyPI.
-    # Note: Should we error? This is a rare use case.
-    if _is_pip_package_installed("transformer-engine-cu12"):
-        if not _is_pip_package_installed(module_name):
-            _logger.info(
-                "Could not find package %s. Install transformer-engine using "
-                f"'pip3 install transformer-engine[{extra_dep_name}]==VERSION'",
-                module_name,
-            )
-
     # After all checks are completed, load the shared object file.
     spec = importlib.util.spec_from_file_location(module_name, _get_shared_object_file(framework))
     solib = importlib.util.module_from_spec(spec)
@@ -170,6 +192,35 @@ def load_framework_extension(framework: str) -> None:
     spec.loader.exec_module(solib)
 
 
+def sanity_checks_for_pypi_installation() -> None:
+    """Ensure that package is installed correctly if using PyPI."""
+
+    te_core_installed, te_core_package_name, te_core_version = get_te_core_package_info()
+    te_installed = _is_package_installed("transformer_engine")
+    te_installed_via_pypi = _is_package_installed_from_wheel("transformer_engine")
+
+    assert te_installed, "Could not find `transformer-engine`."
+
+    # If the core package is installed via PyPI.
+    if te_core_installed:
+        assert te_installed_via_pypi, "Could not find `transformer-engine` PyPI package."
+        assert version("transformer-engine") == te_core_version, (
+            "Transformer Engine package version mismatch. Found "
+            f"transformer-engine v{version('transformer-engine')} "
+            f"and {te_core_package_name} v{te_core_version}."
+        )
+
+    # Only the metapackage is found, invalid usecase.
+    elif te_installed_via_pypi:
+        raise RuntimeError(
+            "Found empty `transformer-engine` meta package installed. "
+            "Install `transformer-engine` with framework extensions via"
+            "'pip3 install --no-build-isolation transformer-engine[pytorch,jax]==VERSION'"
+            " or 'pip3 install transformer-engine[core]` for the TE core lib only. The `core_cu12`"
+            " or `core_cu13` extra deps can be used to specify CUDA version for the TE core lib."
+        )
+
+
 @functools.lru_cache(maxsize=None)
 def _get_sys_extension() -> str:
     """File extension for shared objects."""
@@ -338,6 +389,7 @@ def _load_core_library():
 
 
 if "NVTE_PROJECT_BUILDING" not in os.environ or bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
+    sanity_checks_for_pypi_installation()
     _CUDNN_LIB_CTYPES = _load_cudnn()
     _NVRTC_LIB_CTYPES = _load_nvrtc()
     _CURAND_LIB_CTYPES = _load_curand()
diff --git a/transformer_engine/jax/setup.py b/transformer_engine/jax/setup.py
index f83375d821..ccdbcdb529 100644
--- a/transformer_engine/jax/setup.py
+++ b/transformer_engine/jax/setup.py
@@ -54,6 +54,26 @@
 CMakeBuildExtension = get_build_ext(BuildExtension, True)
 
 
+def get_cuda_major_version() -> int:
+    """Get CUDA major version using Jax backend."""
+
+    assert (
+        jax._src.lib.cuda_versions is not None
+    ), "GPU backend is required to build TE jax extensions."
+
+    # Jax currently does not have any stable/public method to get cuda version.
+    # Try using internal function and default to cuda12 if not found.
+    try:
+        cuda_version = jax._src.lib.cuda_versions.cuda_runtime_get_version()
+        cuda_major_version = cuda_version // 1000
+    except AttributeError:
+        cuda_version = os.getenv("CUDA_VERSION", "12")
+        cuda_major_version = int(cuda_version.split(".")[0])
+
+    assert cuda_major_version in (12, 13), f"Unsupported cuda version {cuda_version}."
+    return cuda_major_version
+
+
 if __name__ == "__main__":
     """Main entry point for JAX extension installation.
 
@@ -93,15 +113,23 @@
         )
     ]
 
+    # Setup version and requirements.
+    # Having the framework extension depend on the core lib allows
+    # us to detect CUDA version dynamically during compilation and
+    # choose the correct wheel for te core lib.
+    __version__ = te_version()
+    te_core = f"transformer_engine_cu{get_cuda_major_version()}=={__version__}"
+    install_requires = install_requirements() + [te_core]
+
     # Configure package
     setuptools.setup(
         name="transformer_engine_jax",
-        version=te_version(),
+        version=__version__,
         description="Transformer acceleration library - Jax Lib",
         ext_modules=ext_modules,
         cmdclass={"build_ext": CMakeBuildExtension},
         python_requires=f">={min_python_version_str()}",
-        install_requires=install_requirements(),
+        install_requires=install_requires,
         tests_require=test_requirements(),
     )
     if any(x in sys.argv for x in (".", "sdist", "bdist_wheel")):
diff --git a/transformer_engine/pytorch/setup.py b/transformer_engine/pytorch/setup.py
index 08870040f3..7a81550047 100644
--- a/transformer_engine/pytorch/setup.py
+++ b/transformer_engine/pytorch/setup.py
@@ -145,15 +145,25 @@ def run(self):
         )
     ]
 
+    # Setup version and requirements.
+    # Having the framework extension depend on the core lib allows
+    # us to detect CUDA version dynamically during compilation and
+    # choose the correct wheel for te core lib.
+    __version__ = te_version()
+    cuda_major_version = parse(torch.version.cuda).major
+    assert cuda_major_version in (12, 13), f"Unsupported cuda version {torch.version.cuda}."
+    te_core = f"transformer_engine_cu{cuda_major_version}=={__version__}"
+    install_requires = install_requirements() + [te_core]
+
     # Configure package
     setuptools.setup(
         name=PACKAGE_NAME,
-        version=te_version(),
+        version=__version__,
         description="Transformer acceleration library - Torch Lib",
         ext_modules=ext_modules,
         cmdclass={"build_ext": CMakeBuildExtension, "bdist_wheel": CachedWheelsCommand},
         python_requires=f">={min_python_version_str()}",
-        install_requires=install_requirements(),
+        install_requires=install_requires,
         tests_require=test_requirements(),
     )
     if any(x in sys.argv for x in (".", "sdist", "bdist_wheel")):

From 7e72d41161f36e1ef8b0f01db7ed5fd85338d644 Mon Sep 17 00:00:00 2001
From: jberchtold-nvidia <158520091+jberchtold-nvidia@users.noreply.github.com>
Date: Wed, 22 Oct 2025 08:51:36 -0700
Subject: [PATCH 04/59] [JAX] NVFP4 recipe with option to enable/disable SR,
 RHT, and 2D quantization (#2270)

* [JAX] Support recipe flags for disabling SR, RHT, and 2D quantization

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* lint

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* Fix issue with SR state being erased due to pytree handling of NVFP4Quantizer

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* Add test for SR state preservation across VJP boundaries

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* Fix sharding of SR rng state

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* lint

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* update tolerances slightly now that SR is enabled

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* lint

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* Use hashlib for deterministic hashes across runs for SR

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* rename uses_rht on scaled tensors to has_applied_rht

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* add assert

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* Move decision of whether to use RHT into helper.py and add dedicated RHT tests

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* lint

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* fix use_rht attr usage

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* fix pure-jax rht usage criteria

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* Adjust tolerances after rebase

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

---------

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>
---
 .../encoder/test_multiprocessing_encoder.py   |   4 +-
 .../jax/encoder/test_single_gpu_encoder.py    |   2 +-
 tests/jax/test_custom_call_compute.py         | 155 ++++++++++++------
 tests/jax/test_helper.py                      |  82 ++++++++-
 transformer_engine/jax/cpp_extensions/gemm.py |  16 +-
 .../jax/cpp_extensions/quantization.py        |  40 +++--
 .../jax/quantize/dequantizer.py               |  11 +-
 transformer_engine/jax/quantize/hadamard.py   |  26 ---
 transformer_engine/jax/quantize/helper.py     |  90 +++++++---
 transformer_engine/jax/quantize/metadata.py   |  20 +++
 transformer_engine/jax/quantize/quantizer.py  |  34 +++-
 transformer_engine/jax/quantize/tensor.py     |  25 +++
 transformer_engine/jax/sharding.py            |  13 ++
 13 files changed, 382 insertions(+), 136 deletions(-)

diff --git a/examples/jax/encoder/test_multiprocessing_encoder.py b/examples/jax/encoder/test_multiprocessing_encoder.py
index 7e708466c2..bd0ec94b0a 100644
--- a/examples/jax/encoder/test_multiprocessing_encoder.py
+++ b/examples/jax/encoder/test_multiprocessing_encoder.py
@@ -670,7 +670,7 @@ def test_te_mxfp8(self):
     def test_te_nvfp4(self):
         """Test Transformer Engine with NVFP4"""
         result = self.exec(True, "NVFP4BlockScaling")
-        assert result[0] < 0.451 and result[1] > 0.79
+        assert result[0] < 0.451 and result[1] > 0.788
 
     @unittest.skipIf(not is_bf16_supported(), "Device compute capability 8.0+ is required for BF16")
     def test_te_bf16_shardy(self):
@@ -708,7 +708,7 @@ def test_te_mxfp8_shardy(self):
     def test_te_nvfp4_shardy(self):
         """Test Transformer Engine with NVFP4"""
         result = self.exec(True, "NVFP4BlockScaling", enable_shardy=True)
-        assert result[0] < 0.451 and result[1] > 0.79
+        assert result[0] < 0.451 and result[1] > 0.788
 
 
 if __name__ == "__main__":
diff --git a/examples/jax/encoder/test_single_gpu_encoder.py b/examples/jax/encoder/test_single_gpu_encoder.py
index 79178485c2..2b725ee71d 100644
--- a/examples/jax/encoder/test_single_gpu_encoder.py
+++ b/examples/jax/encoder/test_single_gpu_encoder.py
@@ -385,7 +385,7 @@ def test_te_nvfp4(self):
         self.args.use_fp8 = True
         self.args.fp8_recipe = "NVFP4BlockScaling"
         actual = train_and_evaluate(self.args)
-        assert actual[0] < 0.476 and actual[1] > 0.775
+        assert actual[0] < 0.477 and actual[1] > 0.769
 
 
 if __name__ == "__main__":
diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py
index 2934e48df1..1217ebf65f 100644
--- a/tests/jax/test_custom_call_compute.py
+++ b/tests/jax/test_custom_call_compute.py
@@ -40,7 +40,6 @@
     QuantizerFactory,
     QuantizeLayout,
     noop_quantizer_set,
-    should_use_rht,
 )
 from transformer_engine.jax.quantize import helper
 from transformer_engine.jax.activation import activation
@@ -685,21 +684,14 @@ class TestQuantize:
     Purely quantization related tests that will always test on a wider set of types and shapes
     """
 
-    def _skip_for_fp4(self, input_shape, q_dtype, scaling_mode, q_layout, flatten_axis):
-        """Temporary hack to skip unsupported FP4 cases until we implement them"""
+    def _skip_unsupported_dtypes(self, q_dtype, scaling_mode):
+        """Skip unsupported dtypes for given scaling mode. For example, NVFP4 only supports the float4_e2m1 dtype not float8 dtypes."""
         if q_dtype not in scaling_mode.get_compatible_q_dtypes():
             pytest.skip(f"Quantize dtype {q_dtype} is not supported by {scaling_mode}")
             return
 
-        # HACK: FIXME TODO(jberchtold)
-        row = reduce(operator.mul, input_shape[flatten_axis:], 1)
-        col = reduce(operator.mul, input_shape[:flatten_axis], 1)
-        will_use_rht = should_use_rht(scaling_mode, q_layout=q_layout)
-        if will_use_rht and (row % 64 != 0 or col % 128 != 0):
-            pytest.skip("Unfused RHT is not supported currently, skipping")
-
     def test_qdq(self, in_dtype, input_shape, q_dtype, scaling_mode, q_layout, flatten_axis):
-        self._skip_for_fp4(input_shape, q_dtype, scaling_mode, q_layout, flatten_axis)
+        self._skip_unsupported_dtypes(q_dtype, scaling_mode)
 
         key = jax.random.PRNGKey(0)
 
@@ -780,22 +772,8 @@ def test_qdq(self, in_dtype, input_shape, q_dtype, scaling_mode, q_layout, flatt
             assert_dequantized_scaled_tensor(scaled_tensor, x)
 
     def _should_use_precise_comparison(
-        self, in_dtype, scaling_mode, q_layout, input_shape, flatten_axis
+        self, in_dtype, scaling_mode, quantizer, input_shape, flatten_axis
     ):
-        # TODO(jberchtold): Remove this hack once we have a better solution to ensure bitwise identical results between TE and JAX RHT+quant implementations. Currently for certain shapes the quantized fp4 data differs by a small amount on <0.5% of the values.
-        RHT_SLIGHT_MISMATCH_SHAPES = [
-            ((32, 256, 128), -1),
-            ((64, 32, 32, 256), -1),
-            ((8192, 2, 4096), -2),
-        ]
-
-        if (
-            should_use_rht(scaling_mode, q_layout=q_layout)
-            and (input_shape, flatten_axis) in RHT_SLIGHT_MISMATCH_SHAPES
-        ):
-            # TE fused RHT+quant and JAX RHT+quant have slight implementation differences which can lead to small numerical differences on certain shapes
-            return False
-
         if scaling_mode.is_nvfp4_scaling and in_dtype != jnp.bfloat16:
             # With NVFP4 scaling, TE kernels internally use bfloat16 so using a different input dtype can lead to small numerical differences compared to the JAX implementation
             return False
@@ -805,7 +783,7 @@ def _should_use_precise_comparison(
     def test_quantize_bitwise(
         self, in_dtype, input_shape, q_dtype, scaling_mode, q_layout, flatten_axis
     ):
-        self._skip_for_fp4(input_shape, q_dtype, scaling_mode, q_layout, flatten_axis)
+        self._skip_unsupported_dtypes(q_dtype, scaling_mode)
 
         key = jax.random.PRNGKey(0)
         input = jax.random.uniform(key, input_shape, in_dtype)
@@ -816,28 +794,20 @@ def test_quantize_bitwise(
 
         jax_output = _jax_quantize(input, quantizer=jax_quantizer, flatten_axis=flatten_axis)
 
-        try:
-            te_output = tex.quantize(input, quantizer=te_quantizer, flatten_axis=flatten_axis)
-        except AssertionError as e:
-            if should_use_rht(scaling_mode, q_layout=q_layout) and in_dtype != jnp.bfloat16:
-                error_message = e.args[0]
-                if "RHT requires input to be bfloat16" in error_message:
-                    # Successfully caught the expected error, early return from the test
-                    return
-            raise e
+        te_output = tex.quantize(input, quantizer=te_quantizer, flatten_axis=flatten_axis)
 
         assert_bitwise_scaled_tensors(
             te_output,
             jax_output,
             precise_comparison=self._should_use_precise_comparison(
-                in_dtype, scaling_mode, q_layout, input_shape, flatten_axis
+                in_dtype, scaling_mode, te_quantizer, input_shape, flatten_axis
             ),
         )
 
     def test_quantize_bitwise_jitted(
         self, in_dtype, input_shape, q_dtype, scaling_mode, q_layout, flatten_axis
     ):
-        self._skip_for_fp4(input_shape, q_dtype, scaling_mode, q_layout, flatten_axis)
+        self._skip_unsupported_dtypes(q_dtype, scaling_mode)
 
         key = jax.random.PRNGKey(0)
         input = jax.random.uniform(key, input_shape, in_dtype)
@@ -851,21 +821,13 @@ def test_quantize_bitwise_jitted(
 
         jax_output = jax_impl_func_jit(input, quantizer=jax_quantizer, flatten_axis=flatten_axis)
 
-        try:
-            te_output = te_impl_func_jit(input, quantizer=te_quantizer, flatten_axis=flatten_axis)
-        except AssertionError as e:
-            if should_use_rht(scaling_mode, q_layout=q_layout) and in_dtype != jnp.bfloat16:
-                error_message = e.args[0]
-                if "RHT requires input to be bfloat16" in error_message:
-                    # Successfully caught the expected error, early return from the test
-                    return
-            raise e
+        te_output = te_impl_func_jit(input, quantizer=te_quantizer, flatten_axis=flatten_axis)
 
         assert_bitwise_scaled_tensors(
             te_output,
             jax_output,
             precise_comparison=self._should_use_precise_comparison(
-                in_dtype, scaling_mode, q_layout, input_shape, flatten_axis
+                in_dtype, scaling_mode, te_quantizer, input_shape, flatten_axis
             ),
         )
 
@@ -985,12 +947,6 @@ def _test_sr(
 
     def test_sr_nvfp4(self, in_dtype, input_shape, q_dtype, scaling_mode, q_layout, flatten_axis):
         """Tests that the mean absolute error of stochastic rounding is smaller than round nearest quantization over multiple samples for both TE and JAX implementations. Asserts that the MAE of both implementations is close to each other."""
-        # HACK: FIXME TODO(jberchtold)
-        row = reduce(operator.mul, input_shape[flatten_axis:], 1)
-        col = reduce(operator.mul, input_shape[:flatten_axis], 1)
-        will_use_rht = should_use_rht(scaling_mode, q_layout=q_layout)
-        if will_use_rht and (row % 64 != 0 or col % 128 != 0):
-            pytest.skip("Unfused RHT is not supported currently, skipping")
 
         key = jax.random.PRNGKey(0)
         inputs = jax.random.uniform(key, input_shape, in_dtype)
@@ -1007,6 +963,97 @@ def test_sr_nvfp4(self, in_dtype, input_shape, q_dtype, scaling_mode, q_layout,
         assert_allclose(te_mean_error, jax_mean_error, rtol=0.2, atol=1e-4)
 
 
+@pytest_parametrize_wrapper("in_dtype", [jnp.bfloat16])
+@pytest_parametrize_wrapper("q_dtype", [jnp.float4_e2m1fn])
+@pytest_parametrize_wrapper(
+    "scaling_mode", [s for s in supported_scaling_modes if s == ScalingMode.NVFP4_1D_SCALING]
+)
+class TestRandomizedHadamardTransform:
+
+    @pytest_parametrize_wrapper(
+        "q_layout", [QuantizeLayout.ROWWISE_COLWISE, QuantizeLayout.COLWISE]
+    )
+    @pytest_parametrize_wrapper("input_shape,flatten_axis", [((64, 128), -1)])
+    def test_rht_quantize_bitwise_jitted(
+        self, in_dtype, q_dtype, scaling_mode, q_layout, input_shape, flatten_axis
+    ):
+        key = jax.random.PRNGKey(0)
+        inputs = jax.random.uniform(key, input_shape, in_dtype)
+
+        te_quantizer, jax_quantizer = QuantizerFactory.create(
+            n_quantizers=2,
+            q_dtype=q_dtype,
+            scaling_mode=scaling_mode,
+            q_layout=q_layout,
+            use_rht=True,
+        )
+
+        jax_impl_func_jit = jax.jit(_jax_quantize, static_argnums=(2, 3))
+        te_impl_func_jit = jax.jit(tex.quantize, static_argnums=(2,))
+
+        jax_output = jax_impl_func_jit(inputs, quantizer=jax_quantizer, flatten_axis=flatten_axis)
+
+        te_output = te_impl_func_jit(inputs, quantizer=te_quantizer, flatten_axis=flatten_axis)
+
+        assert_bitwise_scaled_tensors(te_output, jax_output)
+
+    def _ref_gemm_with_jnp_dot(self, a, b, data_layout):
+        if data_layout[0] == "T":
+            a = jnp.swapaxes(a, -1, -2)
+        if data_layout[1] == "T":
+            b = jnp.swapaxes(b, -1, -2)
+        return jnp.dot(a, b)
+
+    def _generate_gemm_input(self, m, n, k, data_layout):
+        key = jax.random.PRNGKey(0)
+        subkeys = jax.random.split(key, 2)
+        x = jax.random.uniform(
+            subkeys[0],
+            (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m),
+            dtype=jnp.bfloat16,
+        ) / jnp.sqrt(k)
+        w = jax.random.uniform(
+            subkeys[1],
+            (k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k),
+            dtype=jnp.bfloat16,
+        ) / jnp.sqrt(n)
+        lhs_contracting_dim = (1,) if data_layout[0] == "N" else (0,)
+        rhs_contracting_dim = (0,) if data_layout[1] == "N" else (1,)
+        contracting_dims = (lhs_contracting_dim, rhs_contracting_dim)
+
+        return (x, w, contracting_dims)
+
+    @pytest_parametrize_wrapper("m,n,k", [(64, 32, 64)])
+    # We do not test NN and TT layouts here as they do not have both inputs using RHT due to RHT only supporting the colwise layout currently
+    @pytest_parametrize_wrapper("data_layout", ["TN", "NT"])
+    @pytest_parametrize_wrapper("with_jax_gemm", [True, False])
+    def test_rht_gemm(self, in_dtype, q_dtype, scaling_mode, m, n, k, data_layout, with_jax_gemm):
+        key = jax.random.PRNGKey(0)
+
+        lhs_scaling_mode, rhs_scaling_mode = scaling_mode, scaling_mode
+        x, w, contracting_dims = self._generate_gemm_input(m, n, k, data_layout)
+        lhs_quantizer = QuantizerFactory.create(
+            scaling_mode=lhs_scaling_mode,
+            q_dtype=jnp.float4_e2m1fn,
+            use_rht=True,
+        )
+        rhs_quantizer = QuantizerFactory.create(
+            scaling_mode=rhs_scaling_mode,
+            q_dtype=jnp.float4_e2m1fn,
+            use_rht=True,
+        )
+        with use_jax_gemm(enabled=with_jax_gemm):
+            primitive_out = tex.gemm(
+                x,
+                w,
+                contracting_dims=contracting_dims,
+                lhs_quantizer=lhs_quantizer,
+                rhs_quantizer=rhs_quantizer,
+            )
+        ref_out = self._ref_gemm_with_jnp_dot(x, w, data_layout)
+        assert_allclose(primitive_out, ref_out, dtype=jnp.float4_e2m1fn)
+
+
 @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason)
 @pytest_parametrize_wrapper("in_dtype", QUANTIZATION_INPUT_DTYPE)
 @pytest_parametrize_wrapper("input_shape", [(8, 16, 32)])
diff --git a/tests/jax/test_helper.py b/tests/jax/test_helper.py
index ca804625c6..fc88b7ef77 100644
--- a/tests/jax/test_helper.py
+++ b/tests/jax/test_helper.py
@@ -3,11 +3,13 @@
 # See LICENSE for license information.
 
 import unittest
+from functools import partial
 
 import flax
 import jax
 import jax.numpy as jnp
 import numpy as np
+from flax import linen as nn
 
 from utils import assert_allclose
 from transformer_engine.common.recipe import (
@@ -24,15 +26,51 @@
     ScalingMode,
     update_collections,
     TensorSource,
+    QuantizerFactory,
+    QuantizeLayout,
 )
 from transformer_engine.jax.quantize.helper import _format2dtypes
 from transformer_engine.jax.sharding import MeshResource, global_mesh_resource
+from transformer_engine.jax.flax.module import TransformerEngineBase
 
 is_fp8_supported, reason = is_scaling_mode_supported(ScalingMode.DELAYED_TENSOR_SCALING)
 is_mxfp8_supported, mxfp8_reason = is_scaling_mode_supported(ScalingMode.MXFP8_1D_SCALING)
 is_nvfp4_supported, nvfp4_reason = is_scaling_mode_supported(ScalingMode.NVFP4_1D_SCALING)
 
 
+def quantizer_check_vjp(outer_quantizer_set, assertion_func, x):
+    """Check that the quantizers in the quantizer set are as expected and reconstructed correctly from flattened pytree representations across VJP boundaries."""
+
+    # Define a function with a custom VJP (vector-Jacobian product)
+    @partial(jax.custom_vjp, nondiff_argnums=(1,))
+    def quantizer_check(inner_quantizer_set, assertion_func, x):
+        return quantizer_check_fwd(inner_quantizer_set, assertion_func, x)
+
+    def quantizer_check_fwd(inner_quantizer_set, assertion_func, x):
+        assertion_func(inner_quantizer_set.x, TensorSource.X)
+        assertion_func(inner_quantizer_set.kernel, TensorSource.KERNEL)
+        assertion_func(inner_quantizer_set.dgrad, TensorSource.DGRAD)
+        return x
+
+    def quantizer_check_bwd(ctx, g):
+        return (g,)
+
+    quantizer_check.defvjp(quantizer_check_fwd, quantizer_check_bwd)
+    return quantizer_check(outer_quantizer_set, assertion_func, x)
+
+
+class TestModule(TransformerEngineBase):
+    """A simple module to test quantizer creation and reconstruction across VJP boundaries."""
+
+    # Signature: (quantizer: Quantizer, tensor_source: TensorSource) -> None
+    assertion_func: callable
+
+    @nn.compact
+    def __call__(self, x):
+        quantizer_set = self.generate_quantizer_set()
+        return quantizer_check_vjp(quantizer_set, self.assertion_func, x)
+
+
 class TestHelper(unittest.TestCase):
 
     @unittest.skipIf(not is_fp8_supported, reason=reason)
@@ -89,12 +127,43 @@ def _compare_nvfp4_scaling(self, test):
         for tensor_source in TensorSource:
             target_scaling_mode = (
                 ScalingMode.NVFP4_2D_SCALING
-                if tensor_source == TensorSource.KERNEL
+                if (not test.disable_2d_quantization) and tensor_source == TensorSource.KERNEL
                 else ScalingMode.NVFP4_1D_SCALING
             )
             self.assertEqual(
                 get_quantize_config().get_scaling_mode(tensor_source), target_scaling_mode
             )
+        self.assertEqual(
+            get_quantize_config().DISABLE_STOCHASTIC_ROUNDING, test.disable_stochastic_rounding
+        )
+        self.assertEqual(get_quantize_config().DISABLE_RHT, test.disable_rht)
+        self.assertEqual(
+            get_quantize_config().DISABLE_2D_QUANTIZATION, test.disable_2d_quantization
+        )
+
+    def _compare_nvfp4_scaling_quantizers(self, test):
+        """Check that the quantizers created have the expected stochastic rounding state and the state is preserved across VJP boundaries."""
+
+        def assertion_func(quantizer, tensor_source):
+            if test.disable_stochastic_rounding or tensor_source != TensorSource.DGRAD:
+                self.assertIsNone(quantizer.stochastic_rounding_rng_state)
+            else:
+                self.assertIsNotNone(quantizer.stochastic_rounding_rng_state)
+
+            expected_rht = (
+                quantizer.scaling_mode == ScalingMode.NVFP4_1D_SCALING
+                and quantizer.q_layout in {QuantizeLayout.ROWWISE_COLWISE, QuantizeLayout.COLWISE}
+                and not test.disable_rht
+            )
+            self.assertEqual(quantizer.use_rht, expected_rht)
+
+        x = jnp.ones((), dtype=jnp.float32)
+        test_module = TestModule(assertion_func=assertion_func)
+        param_key, sr_key = jax.random.split(jax.random.PRNGKey(0))
+        rngs = {"params": param_key, "sr_rng": sr_key}
+        variables = test_module.init(rngs, x)
+
+        jax.jit(jax.value_and_grad(test_module.apply), static_argnums=(2,))(variables, x, rngs=rngs)
 
     @unittest.skipIf(not is_fp8_supported, reason=reason)
     def test_autocast_delayed_scaling(self):
@@ -171,5 +240,16 @@ def test_autocast_nvfp4_block_scaling(self):
         with autocast(enabled=True, recipe=bs, mesh_resource=MeshResource()):
             self.assertTrue(get_quantize_config().is_fp8_enabled())
             self._compare_nvfp4_scaling(bs)
+            self._compare_nvfp4_scaling_quantizers(bs)
+
+        bs = NVFP4BlockScaling(
+            disable_stochastic_rounding=True,
+            disable_rht=True,
+            disable_2d_quantization=True,
+        )
+        with autocast(enabled=True, recipe=bs, mesh_resource=MeshResource()):
+            self.assertTrue(get_quantize_config().is_fp8_enabled())
+            self._compare_nvfp4_scaling(bs)
+            self._compare_nvfp4_scaling_quantizers(bs)
 
         self._check_default_state()
diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py
index b37c4bd848..778f77c0d5 100644
--- a/transformer_engine/jax/cpp_extensions/gemm.py
+++ b/transformer_engine/jax/cpp_extensions/gemm.py
@@ -44,7 +44,6 @@
     noop_quantizer_set,
     is_fp8_gemm_with_all_layouts_supported,
     apply_padding_to_scale_inv,
-    should_use_rht,
 )
 from .misc import get_padded_spec, is_all_reduce_in_float32
 from ..sharding import (
@@ -169,16 +168,13 @@ def _quantize_gemm_operands(lhs, rhs, lhs_quantizer, rhs_quantizer, contracting_
     assert not isinstance(lhs_q, ScaledTensor2x)
     assert not isinstance(rhs_q, ScaledTensor2x)
 
-    def uses_rht(q: AbstractBaseTensor) -> bool:
-        return isinstance(q, ScaledTensor1x) and should_use_rht(
-            q.scaling_mode, is_colwise=q.is_colwise
-        )
+    def has_rht_applied(q: AbstractBaseTensor) -> bool:
+        return isinstance(q, ScaledTensor1x) and q.has_rht_applied
 
-    # TODO(jberchtold): Move RHT usage check to a bool flag on the ScaledTensor class
-    assert uses_rht(lhs_q) == uses_rht(rhs_q), (
-        "With NVFP4_1D_SCALING, if one operand is colwise quantized, the other must be colwise"
-        " quantized as well. This is to ensure the RHT is applied to both and will cancel out in"
-        " the GEMM."
+    assert has_rht_applied(lhs_q) == has_rht_applied(rhs_q), (
+        "With NVFP4_1D_SCALING, if one operand is quantized with RHT, the other must be quantized"
+        " with RHT as well. This is to ensure the RHT is applied to both and will cancel out in the"
+        " GEMM."
     )
 
     return lhs_q, rhs_q
diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py
index b3f1e60f9a..67c505bc98 100644
--- a/transformer_engine/jax/cpp_extensions/quantization.py
+++ b/transformer_engine/jax/cpp_extensions/quantization.py
@@ -31,7 +31,7 @@
 from ..sharding import (
     all_reduce_max_along_all_axes_except_PP,
     all_reduce_sum_along_dp_fsdp,
-    num_of_devices,
+    get_num_devices_in_mesh,
 )
 from ..quantize import (
     ScaledTensor2x,
@@ -45,7 +45,6 @@
     compute_scale_from_amax,
     NoScaleTensor,
     get_rht_matrix,
-    should_use_rht,
 )
 
 
@@ -108,17 +107,18 @@ def abstract(
                 "sr_rng_state must be a uint32 array when stochastic_rounding is True but"
                 f" received {sr_rng_state_aval}"
             )
-            if is_outer:
+            if is_outer and get_num_devices_in_mesh() > 1:
                 assert (
-                    sr_rng_state_aval.shape[0] == num_of_devices()
+                    sr_rng_state_aval.shape[0] == get_num_devices_in_mesh()
                     and sr_rng_state_aval.shape[1] == 4
                 ), (
                     "sr_rng_state must be of shape (num_devices, 4) when stochastic_rounding is"
                     f" True and is_outer is True but received {sr_rng_state_aval.shape}"
                 )
             else:
-                assert sr_rng_state_aval.shape == (4,), (
-                    "Sharded sr_rng_state must be of shape (4,) per device when"
+                # We cannot assert the shape is exactly (4,) here because if the quantized data is not perfectly sharded across all devices then we will have extra rng state here. For example, this could occur when the weights are not sharded when using data parallelism. However, this is okay because the extra rng state will simply not be used and each device still has a unique rng state.
+                assert sr_rng_state_aval.size >= 4, (
+                    "Sharded sr_rng_state must have at least 4 elements per device when"
                     f" stochastic_rounding is True but received {sr_rng_state_aval.shape}"
                 )
 
@@ -552,8 +552,13 @@ def partition(
             desc="BaseDBiasQuantizePrimitive.colwise_scale_inv",
         )
 
-        # TODO(jberchtold): Assert the sr_rng state is sharded along all mesh axes
-        arg_shardings = tuple(arg_i.sharding for arg_i in arg_infos)
+        arg_shardings = list(arg_i.sharding for arg_i in arg_infos)
+        arg_shardings[3] = NamedSharding(
+            mesh,
+            PartitionSpec(tuple(x for x in x_spec if x is not None), None),
+            desc="BaseDBiasQuantizePrimitive.sr_rng_state",
+        )
+        arg_shardings = tuple(arg_shardings)
         out_shardings = (
             out_sharding,
             colwise_out_sharding,
@@ -564,6 +569,9 @@ def partition(
         )
 
         def sharded_impl(x, scale, amax, sr_rng_state, post_rht_amax, rht_matrix):
+            if sr_rng_state.size > 4:
+                # See comment in abstract method for explanation of why we cannot assert exact shape
+                sr_rng_state = sr_rng_state.flatten()[:4]
             (
                 local_x,
                 local_colwise_x,
@@ -754,9 +762,10 @@ def _quantize_dbias_impl(
     # If TE/common custom quantize op is disabled, or if quantizer layout is COLWISE,
     # fall back on the native-JAX quantize implementation
     PrimitiveClass = DBiasQuantizePrimitive if is_dbias else QuantizePrimitive
-    is_unsupported = (
-        quantizer.q_layout == QuantizeLayout.COLWISE
-        and quantizer.scaling_mode != ScalingMode.NVFP4_1D_SCALING
+    is_unsupported = quantizer.q_layout == QuantizeLayout.COLWISE and not (
+        quantizer.scaling_mode == ScalingMode.NVFP4_1D_SCALING
+        and hasattr(quantizer, "use_rht")
+        and quantizer.use_rht
     )
     if is_unsupported or not PrimitiveClass.enabled():
         if is_dbias:
@@ -792,7 +801,7 @@ def _quantize_dbias_impl(
     rht_matrix = jnp.empty((1, 1), jnp.bfloat16)
     amax = x.amax
 
-    if should_use_rht(quantizer.scaling_mode, q_layout=quantizer.q_layout):
+    if hasattr(quantizer, "use_rht") and quantizer.use_rht:
         use_rht = True
         rht_matrix = get_rht_matrix()
 
@@ -861,7 +870,11 @@ def _quantize_dbias_impl(
         x.data,
         scale,
         amax,
-        sr_rng_state if sr_rng_state is not None else jnp.empty((num_of_devices(), 1), jnp.uint32),
+        (
+            sr_rng_state
+            if sr_rng_state is not None
+            else jnp.empty((get_num_devices_in_mesh(), 1), jnp.uint32)
+        ),
         post_rht_amax if post_rht_amax is not None else jnp.zeros((1,), jnp.float32),
         rht_matrix,
         out_dtype=quantizer.q_dtype,
@@ -902,6 +915,7 @@ def _quantize_dbias_impl(
         q_layout=quantizer.q_layout,
         data_layout=quantizer.get_data_layout(),
         flatten_axis=flatten_axis,
+        colwise_has_rht_applied=use_rht,
     )
     return out, dbias.astype(dq_dtype)
 
diff --git a/transformer_engine/jax/quantize/dequantizer.py b/transformer_engine/jax/quantize/dequantizer.py
index b4da6f3bed..80ebc6b875 100644
--- a/transformer_engine/jax/quantize/dequantizer.py
+++ b/transformer_engine/jax/quantize/dequantizer.py
@@ -15,7 +15,7 @@
 import jax.numpy as jnp
 
 from .scaling_modes import ScalingMode
-from .hadamard import apply_rht, should_use_rht
+from .hadamard import apply_rht
 
 
 __all__ = ["ScalingModeToDequantizerMap"]
@@ -171,7 +171,9 @@ class NVFP4Dequantizer(Dequantizer):
     """
 
     @staticmethod
-    def _dequantize_func(data, scale_inv, amax, dq_dtype, scaling_mode, is_colwise, flatten_axis):
+    def _dequantize_func(
+        data, scale_inv, amax, dq_dtype, scaling_mode, is_colwise, flatten_axis, has_rht_applied
+    ):
         """Dequantize a tensor using block scaling.
 
         Args:
@@ -182,6 +184,7 @@ def _dequantize_func(data, scale_inv, amax, dq_dtype, scaling_mode, is_colwise,
             scaling_mode: The scaling mode used for quantization
             is_colwise: Whether the scaling is column-wise
             flatten_axis: The axis along which the tensor could be flattened to 2D
+            has_rht_applied: Whether the quantization has RHT applied and we need to apply the inverse RHT to dequantize
 
         Returns:
             The dequantized tensor
@@ -223,8 +226,7 @@ def _dequantize_func(data, scale_inv, amax, dq_dtype, scaling_mode, is_colwise,
         out = jnp.asarray(data * scale_inv, dq_dtype).reshape(data_shape)
 
         # Apply inverse of RHT if needed
-        use_rht = should_use_rht(scaling_mode, is_colwise=is_colwise)
-        if use_rht:
+        if has_rht_applied:
             out = apply_rht(out, inverse=True)
 
         return out
@@ -247,6 +249,7 @@ def dequantize(scaled_tensor):
             scaled_tensor.scaling_mode,
             scaled_tensor.is_colwise,
             scaled_tensor.flatten_axis,
+            scaled_tensor.has_rht_applied,
         )
 
 
diff --git a/transformer_engine/jax/quantize/hadamard.py b/transformer_engine/jax/quantize/hadamard.py
index c0b74ef75e..5f6f0ec2b5 100644
--- a/transformer_engine/jax/quantize/hadamard.py
+++ b/transformer_engine/jax/quantize/hadamard.py
@@ -4,32 +4,6 @@
 """Randomized Hadamard Transform (RHT) utilities for JAX."""
 import jax.numpy as jnp
 
-from .scaling_modes import ScalingMode
-
-
-def should_use_rht(scaling_mode, is_colwise=None, q_layout=None) -> bool:
-    """Determine if RHT (Randomized Hadamard Transform) should be used.
-
-    Args:
-        scaling_mode: The scaling mode of the tensor.
-        is_colwise: Whether the tensor is column-wise. Only one of is_colwise or q_layout should be provided.
-        q_layout: The quantization layout of the tensor. Only one of is_colwise or q_layout should be provided.
-
-    Returns:
-        bool: True if RHT should be used, False otherwise.
-    """
-    # Delayed import to avoid circular dependencies
-    from .quantizer import QuantizeLayout
-
-    assert (is_colwise is None) != (
-        q_layout is None
-    ), "Exactly one of is_colwise or q_layout must be provided."
-
-    if q_layout is not None:
-        is_colwise = q_layout in {QuantizeLayout.COLWISE, QuantizeLayout.ROWWISE_COLWISE}
-
-    return scaling_mode == ScalingMode.NVFP4_1D_SCALING and is_colwise
-
 
 def get_wgrad_sign_vector() -> list[int]:
     """Get a fixed sign vector for the RHT used in NVFP4 weight gradient quantization."""
diff --git a/transformer_engine/jax/quantize/helper.py b/transformer_engine/jax/quantize/helper.py
index 06c67b62ee..e8b33c1d1c 100644
--- a/transformer_engine/jax/quantize/helper.py
+++ b/transformer_engine/jax/quantize/helper.py
@@ -12,6 +12,7 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from enum import Enum
+import hashlib
 from typing import Optional, Tuple, Dict, Union, Sequence, Type, List
 from functools import reduce, lru_cache
 import operator
@@ -35,7 +36,7 @@
 from transformer_engine.jax.sharding import (
     global_shard_guard,
     MeshResource,
-    num_of_devices,
+    get_num_devices_in_mesh,
     get_all_mesh_axes,
     with_sharding_constraint,
 )
@@ -561,29 +562,87 @@ def get_quantize_flax_meta(
         return QuantizeMeta()
 
 
+@dataclass
 class NVFP4ScalingQuantizeConfig(BaseQuantizeConfig):
     """Configuration class for NVFP4 scaling recipe.
 
     This class provides specific initialization and finalization for NVFP4 scaling quantization mode.
     """
 
+    DISABLE_STOCHASTIC_ROUNDING: bool = False
+    DISABLE_RHT: bool = False
+    DISABLE_2D_QUANTIZATION: bool = False
+
     def initialize_from_recipe(self, fp8_recipe: Recipe) -> None:
-        """Initialize block scaling FP8 configuration.
+        """Initialize block scaling NVFP4 configuration.
 
         Args:
-            fp8_recipe: The FP8 recipe to use for initialization
+            fp8_recipe: The quantization recipe to use for initialization
         """
+        assert isinstance(fp8_recipe, NVFP4BlockScaling)
+
         self.INITIALIZED = True
         self.FWD_DTYPE, self.BWD_DTYPE = _format2dtypes(fp8_recipe.fp4_format)
         self.AMAX_HISTORY_LEN = 0
 
+        self.DISABLE_STOCHASTIC_ROUNDING = fp8_recipe.disable_stochastic_rounding
+        self.DISABLE_RHT = fp8_recipe.disable_rht
+        self.DISABLE_2D_QUANTIZATION = fp8_recipe.disable_2d_quantization
+
     def get_scaling_mode(self, tensor_source: TensorSource) -> ScalingMode:
         """Gets the scaling mode for a specific tensor's usage type."""
-        if tensor_source == TensorSource.KERNEL:
+        if (not self.DISABLE_2D_QUANTIZATION) and tensor_source == TensorSource.KERNEL:
             return ScalingMode.NVFP4_2D_SCALING
         # for x and grad
         return ScalingMode.NVFP4_1D_SCALING
 
+    def _make_rht_quantize_meta(self, q_layout, tensor_source: TensorSource) -> QuantizeMeta:
+        """Create the quantization metadata for RHT if applicable."""
+        # Imported here to prevent circular import
+        from transformer_engine.jax.quantize import QuantizeLayout
+
+        use_rht = self.get_scaling_mode(
+            tensor_source
+        ) == ScalingMode.NVFP4_1D_SCALING and q_layout in {
+            QuantizeLayout.ROWWISE_COLWISE,
+            QuantizeLayout.COLWISE,
+        }
+        if self.DISABLE_RHT:
+            use_rht = False
+        return QuantizeMeta(use_rht=use_rht)
+
+    def _make_stochastic_rounding_rng_state(
+        self, module, tensor_source: TensorSource, quantizer_name: str
+    ) -> jnp.ndarray:
+        """Create the stochastic rounding rng state if applicable."""
+        if self.DISABLE_STOCHASTIC_ROUNDING:
+            return QuantizeMeta()
+
+        if tensor_source != TensorSource.DGRAD:
+            # Only DGRAD uses stochastic rounding
+            return QuantizeMeta()
+
+        sr_jax_rng = module.make_rng("sr_rng")
+        # Get a unique key for this quantizer
+        # Use hashlib to get a deterministic hash value for quantizer_name
+        quantizer_hash = (
+            int(hashlib.sha256(quantizer_name.encode("utf-8")).hexdigest(), 16)
+            % jnp.iinfo(jnp.int32).max
+        )
+        sr_jax_rng = jax.jit(jax.random.fold_in)(sr_jax_rng, quantizer_hash)
+
+        # Generate 4 random uint32 values from the JAX PRNG key
+        shape = (4,)
+        if get_num_devices_in_mesh() > 1:
+            shape = (get_num_devices_in_mesh(), 4)
+        sr_jax_rng_state = jax.random.randint(
+            sr_jax_rng, shape, 0, jnp.iinfo(jnp.int32).max, dtype=jnp.int32
+        ).view(jnp.uint32)
+        sr_jax_rng_state = with_sharding_constraint(
+            sr_jax_rng_state, jax.sharding.PartitionSpec(get_all_mesh_axes(), None)
+        )
+        return QuantizeMeta(stochastic_rounding_rng_state=sr_jax_rng_state)
+
     def get_quantize_flax_meta(
         self,
         module,
@@ -603,27 +662,14 @@ def get_quantize_flax_meta(
         Returns:
             The quantization metadata for the specified module and tensor. It can be empty if no metadata is needed.
         """
-        if tensor_source != TensorSource.DGRAD:
-            # Only DGRAD uses stochastic rounding
-            return QuantizeMeta()
-
-        # TODO(jberchtold): This assumes SR is always enabled for NVFP4. Use flag from recipe to toggle it.
-        sr_jax_rng = module.make_rng("sr_rng")
-        # Get a unique key for this quantizer
-        sr_jax_rng = jax.jit(jax.random.fold_in)(
-            sr_jax_rng, hash(quantizer_name) % jnp.iinfo(jnp.int32).max
-        )
+        # Imported here to prevent circular import
+        from transformer_engine.jax.quantize import QuantizeLayout
 
-        # Generate 4 random uint32 values from the JAX PRNG key
-        sr_jax_rng_state = jax.random.randint(
-            sr_jax_rng, (num_of_devices(), 4), 0, jnp.iinfo(jnp.int32).max, dtype=jnp.int32
-        ).view(jnp.uint32)
-        sr_jax_rng_state = with_sharding_constraint(
-            sr_jax_rng_state, jax.sharding.PartitionSpec(get_all_mesh_axes(), None)
+        return QuantizeMeta.merge(
+            self._make_rht_quantize_meta(QuantizeLayout.ROWWISE_COLWISE, tensor_source),
+            self._make_stochastic_rounding_rng_state(module, tensor_source, quantizer_name),
         )
 
-        return QuantizeMeta(stochastic_rounding_rng_state=sr_jax_rng_state)
-
 
 _QUANTIZE_CONFIG = NoOpQuantizeConfig()
 
diff --git a/transformer_engine/jax/quantize/metadata.py b/transformer_engine/jax/quantize/metadata.py
index 11a349ed7d..a987643eb7 100644
--- a/transformer_engine/jax/quantize/metadata.py
+++ b/transformer_engine/jax/quantize/metadata.py
@@ -26,6 +26,26 @@ class QuantizeMeta:
 
     """
 
+    @staticmethod
+    def merge(a: "QuantizeMeta", b: "QuantizeMeta") -> "QuantizeMeta":
+        """Merge two QuantizeMeta instances.
+
+        Args:
+            a (QuantizeMeta): The first QuantizeMeta instance.
+            b (QuantizeMeta): The second QuantizeMeta instance.
+
+        Returns:
+            QuantizeMeta: A new QuantizeMeta instance with merged metadata.
+        """
+        assert isinstance(a, QuantizeMeta)
+        assert isinstance(b, QuantizeMeta)
+        for key in b.get_kwargs_dictionary().keys():
+            if key in a.get_kwargs_dictionary():
+                assert (
+                    a.get_kwargs_dictionary()[key] == b.get_kwargs_dictionary()[key]
+                ), f"Conflict in merging QuantizeMeta: {key} has different values."
+        return QuantizeMeta(**{**a.get_kwargs_dictionary(), **b.get_kwargs_dictionary()})
+
     def __init__(self, **kwargs):
         self._kwargs = kwargs
 
diff --git a/transformer_engine/jax/quantize/quantizer.py b/transformer_engine/jax/quantize/quantizer.py
index 7bc08f834f..d138b58dad 100644
--- a/transformer_engine/jax/quantize/quantizer.py
+++ b/transformer_engine/jax/quantize/quantizer.py
@@ -19,7 +19,7 @@
 from transformer_engine.common import recipe
 
 from .scaling_modes import ScalingMode
-from .hadamard import apply_rht, should_use_rht
+from .hadamard import apply_rht
 from .tensor import (
     ScaledTensor,
     ScaledTensor1x,
@@ -590,11 +590,13 @@ class NVFP4Quantizer(Quantizer):
         q_layout: Quantization axis
         data_layout: Data layout string (default: "NT")
         stochastic_rounding_rng_state: RNG state for stochastic rounding, must be of shape (4,) and dtype uint32. If None, stochastic rounding is disabled.
+        use_rht: Whether to apply Randomized Hadamard Transform (RHT) before quantization.
     """
 
     scaling_mode: ScalingMode = ScalingMode.NVFP4_1D_SCALING
     q_layout: QuantizeLayout = QuantizeLayout.ROWWISE_COLWISE
     data_layout: str = "NT"
+    use_rht: bool = False
     stochastic_rounding_rng_state: Optional[jnp.ndarray] = None
 
     def __post_init__(self):
@@ -603,6 +605,30 @@ def __post_init__(self):
         ), "NVFP4 quantization must use a q_dtype of float4_e2m1fn"
         assert self.scaling_mode.is_nvfp4_scaling, "NVFP4Quantizer must use NVFP4 scaling modes"
 
+    def tree_flatten(self):
+        """Flatten the quantizer for JAX tree operations.
+
+        Returns:
+            Tuple of (children, aux_data) for tree operations
+        """
+        children = (self.stochastic_rounding_rng_state,)
+        aux_data = (self.q_dtype, self.scaling_mode, self.q_layout, self.data_layout, self.use_rht)
+        return (children, aux_data)
+
+    @classmethod
+    def tree_unflatten(cls, aux_data, children):
+        """Reconstruct a quantizer from its flattened representation.
+
+        Args:
+            aux_data: Auxiliary data containing quantizer parameters
+            children: Unused children data
+
+        Returns:
+            A reconstructed Quantizer instance
+        """
+        stochastic_rounding_rng_state = children[0]
+        return cls(*aux_data, stochastic_rounding_rng_state=stochastic_rounding_rng_state)
+
     def _apply_stochastic_rounding(self, x):
         assert (
             self.stochastic_rounding_rng_state is not None
@@ -688,8 +714,9 @@ def _quantize_func(self, x, is_colwise=False, dq_dtype=None, flatten_axis=-1) ->
             flatten_axis = x.ndim - flatten_axis
         x_shape = x.shape
 
-        if should_use_rht(self.scaling_mode, is_colwise=is_colwise):
-            # We only apply RHT for 1D colwise nvfp4
+        # We currently only have a single flag 'use_rht' on the quantizer. To avoid an unused rowwise flag, we assume RHT is only used for colwise quantization for now.
+        use_rht = self.use_rht and is_colwise and self.scaling_mode == ScalingMode.NVFP4_1D_SCALING
+        if use_rht:
             x = apply_rht(x)
 
         dq_dtype = dq_dtype if dq_dtype is not None else x.dtype
@@ -790,6 +817,7 @@ def repeat_to_shape(x, target_shape):
             scaling_mode=self.scaling_mode,
             dq_dtype=dq_dtype,
             flatten_axis=rowwise_flatten_axis,
+            has_rht_applied=use_rht,
         )
 
 
diff --git a/transformer_engine/jax/quantize/tensor.py b/transformer_engine/jax/quantize/tensor.py
index 2d2d78190f..6c358a044e 100644
--- a/transformer_engine/jax/quantize/tensor.py
+++ b/transformer_engine/jax/quantize/tensor.py
@@ -175,6 +175,7 @@ class ScaledTensor1x(AbstractBaseTensor1x, ScaledTensor):
         is_colwise: Whether the tensor uses column-wise quantization
         data_layout: The data_layout specification for the tensor
         flatten_axis: The quantization axis for the tensor
+        has_rht_applied: Whether the tensor had the Randomized Hadamard Transform (RHT) applied during quantization
     """
 
     scale_inv: jnp.ndarray
@@ -184,6 +185,7 @@ class ScaledTensor1x(AbstractBaseTensor1x, ScaledTensor):
     is_colwise: bool
     data_layout: str
     flatten_axis: int
+    has_rht_applied: bool
 
     def __post_init__(self):
         """Validates and adjusts the scale_inv shape after initialization.
@@ -243,6 +245,7 @@ def tree_flatten(self):
             self.is_colwise,
             self.data_layout,
             self.flatten_axis,
+            self.has_rht_applied,
         )
         return (children, aux_data)
 
@@ -314,6 +317,7 @@ def apply_sharding_constraint_by_logical_axes(self, logical_axis_names: Tuple[st
             is_colwise=self.is_colwise,
             data_layout=self.data_layout,
             flatten_axis=self.flatten_axis,
+            has_rht_applied=self.has_rht_applied,
         )
 
 
@@ -354,6 +358,7 @@ def __init__(
         self.group_sizes = group_sizes
         self.original_shape = original_shape
         self.group_axis = group_axis
+        # TODO(Phuong):Handle RHT for grouped quantization once grouped quantization supports NVFP4
         super().__init__(
             data=data,
             scale_inv=scale_inv,
@@ -364,6 +369,7 @@ def __init__(
             is_colwise=is_colwise,
             data_layout=data_layout,
             flatten_axis=flatten_axis,
+            has_rht_applied=False,
         )
 
     def __post_init__(self):
@@ -515,6 +521,7 @@ def create_1x(
         group_sizes=None,
         original_shape=None,
         group_axis=0,
+        has_rht_applied=False,
     ):
         """Creates a single-scale quantized tensor.
 
@@ -530,6 +537,7 @@ def create_1x(
             group_sizes: Array of ints containing the size of each group (default: None)
             original_shape: The original shape of the tensor before grouping (default: None)
             group_axis: The axis along which grouping is performed (default: 0)
+            has_rht_applied: Whether the tensor had the Randomized Hadamard Transform (RHT) applied during quantization (default: False)
 
         Returns:
             A ScaledTensor1x or GroupedScaledTensor1x instance depending on whether group_sizes is provided
@@ -593,6 +601,7 @@ def create_1x(
             is_colwise=is_colwise,
             data_layout=data_layout,
             flatten_axis=flatten_axis,
+            has_rht_applied=has_rht_applied,
         )
 
     @staticmethod
@@ -610,6 +619,8 @@ def create_2x(
         group_sizes=None,
         original_shape=None,
         group_axis=0,
+        rowwise_has_rht_applied=False,
+        colwise_has_rht_applied=False,
     ):
         """Creates a double-scale quantized tensor.
 
@@ -626,6 +637,8 @@ def create_2x(
             group_sizes: Array containing the size of each group (default: None)
             original_shape: The original shape of the tensor before grouping (default: None)
             group_axis: The axis along which grouping is performed (default: 0)
+            rowwise_has_rht_applied: Whether the row-wise tensor uses the Randomized Hadamard Transform (RHT) (default: False)
+            colwise_has_rht_applied: Whether the column-wise tensor uses the Randomized Hadamard Transform (RHT) (default: False)
 
         Returns:
             A ScaledTensor2x instance
@@ -648,6 +661,7 @@ def create_2x(
             group_sizes=group_sizes,
             original_shape=original_shape,
             group_axis=group_axis,
+            has_rht_applied=rowwise_has_rht_applied,
         )
         colwise_tensor = ScaledTensorFactory.create_1x(
             colwise_data,
@@ -661,6 +675,7 @@ def create_2x(
             group_sizes=group_sizes,
             original_shape=original_shape,
             group_axis=group_axis,
+            has_rht_applied=colwise_has_rht_applied,
         )
         return ScaledTensor2x(rowwise_tensor, colwise_tensor)
 
@@ -680,6 +695,8 @@ def create(
         group_sizes: jnp.ndarray = None,
         original_shape: Tuple[int] = None,
         group_axis: int = 0,
+        rowwise_has_rht_applied: bool = False,
+        colwise_has_rht_applied: bool = False,
     ):
         """Creates a scaled tensor based on the quantization axis.
 
@@ -696,10 +713,14 @@ def create(
             group_sizes: Array containing the size of each group (default: None)
             original_shape: The original shape of the tensor before grouping (default: None)
             group_axis: The axis along which grouping is performed (default: 0)
+            rowwise_has_rht_applied: Whether the row-wise tensor uses the Randomized Hadamard Transform (RHT) (default: False)
+            colwise_has_rht_applied: Whether the col-wise tensor uses the Randomized Hadamard Transform (RHT) (default: False)
 
         Returns:
             Either a ScaledTensor1x or ScaledTensor2x instance depending on q_layout
         """
+        assert not rowwise_has_rht_applied, "RHT is not supported for rowwise quantization yet"
+
         if q_layout == QuantizeLayout.ROWWISE_COLWISE:
             return ScaledTensorFactory.create_2x(
                 data,
@@ -715,6 +736,8 @@ def create(
                 group_sizes=group_sizes,
                 original_shape=original_shape,
                 group_axis=group_axis,
+                rowwise_has_rht_applied=rowwise_has_rht_applied,
+                colwise_has_rht_applied=colwise_has_rht_applied,
             )
 
         is_colwise = q_layout == QuantizeLayout.COLWISE
@@ -731,6 +754,7 @@ def create(
                 group_sizes=group_sizes,
                 original_shape=original_shape,
                 group_axis=group_axis,
+                has_rht_applied=colwise_has_rht_applied,
             )
 
         return ScaledTensorFactory.create_1x(
@@ -745,6 +769,7 @@ def create(
             group_sizes=group_sizes,
             original_shape=original_shape,
             group_axis=group_axis,
+            has_rht_applied=rowwise_has_rht_applied,
         )
 
 
diff --git a/transformer_engine/jax/sharding.py b/transformer_engine/jax/sharding.py
index 8eeaca4cc8..adb67e358f 100644
--- a/transformer_engine/jax/sharding.py
+++ b/transformer_engine/jax/sharding.py
@@ -238,6 +238,19 @@ def num_of_devices():
     return len(jax.devices())
 
 
+def get_num_devices_in_mesh(mesh=None):
+    """
+    Get the number of devices in the given mesh.
+    If the mesh is None, it would be replaced
+    by the global mesh.
+    """
+    if mesh is None:
+        mesh = _PXLA_THREAD_RESOURCES.env.physical_mesh
+    if mesh.empty:
+        return 1
+    return np.prod(list(mesh.shape.values()))
+
+
 def get_mesh_axis_size(axis, mesh=None):
     """
     Get the axis size of the given mesh.

From 9b75db3765f48c5d791f385779ec8d4daa0d7c11 Mon Sep 17 00:00:00 2001
From: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Date: Wed, 22 Oct 2025 20:33:49 -0400
Subject: [PATCH 05/59] Include TE core headers in final build (#2291)

Include TE core headers in build

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
---
 MANIFEST.in | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000..c34025772a
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include transformer_engine/common/include *.*

From 8b9849a226c37601cf2826108e02df6db041f23a Mon Sep 17 00:00:00 2001
From: Przemyslaw Tredak <ptredak@nvidia.com>
Date: Wed, 22 Oct 2025 22:31:08 -0700
Subject: [PATCH 06/59] Overhaul the compilation for the arch-specific features
 (#2279)

* Added sm_120f to the build

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Change the arch specific handling

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Support for CUDA<12.9

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Moved through the rest of the files

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Common cases

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Remove pure 100 from the list

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* CMake changes, (not yet working)

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Do not pass the arch-specific thing from build_tools

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Moved some of the files to arch-specific compilation

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix and also changing the order of compilation to hopefully get the
compilation time lower

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix for the files overwriting custom compile properties

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Actually make this whole thing work

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add space to the error message

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Przemyslaw Tredak <ptrendx@gmail.com>

* Apply suggestions from code review

Co-authored-by: Oleg Goncharov <64355998+Oleg-Goncharov@users.noreply.github.com>
Signed-off-by: Przemyslaw Tredak <ptrendx@gmail.com>

* Fixes from review

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Changing the naming to be more intuitive

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add missing cassert include for device-side asserts

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
Signed-off-by: Przemyslaw Tredak <ptrendx@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Oleg Goncharov <64355998+Oleg-Goncharov@users.noreply.github.com>
---
 build_tools/utils.py                          |   6 +-
 transformer_engine/common/CMakeLists.txt      | 206 +++++++++---
 .../hadamard_transform_cast_fusion.cu         |  27 +-
 ...quantize_transpose_vector_blockwise_fp4.cu |  76 ++---
 .../common/util/nvfp4_transpose.cuh           | 290 ++++++++--------
 transformer_engine/common/util/ptx.cuh        | 310 +++++++++++++++---
 transformer_engine/common/utils.cuh           |   1 +
 7 files changed, 610 insertions(+), 306 deletions(-)

diff --git a/build_tools/utils.py b/build_tools/utils.py
index 296f928b71..395b41261b 100644
--- a/build_tools/utils.py
+++ b/build_tools/utils.py
@@ -257,11 +257,9 @@ def cuda_archs() -> str:
     if archs is None:
         version = cuda_version()
         if version >= (13, 0):
-            archs = "75;80;89;90;100;100a;103a;120"
-        elif version >= (12, 9):
-            archs = "70;80;89;90;100;100a;103a;120"
+            archs = "75;80;89;90;100;120"
         elif version >= (12, 8):
-            archs = "70;80;89;90;100;100a;120"
+            archs = "70;80;89;90;100;120"
         else:
             archs = "70;80;89;90"
     return archs
diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt
index e6be47686a..175abd3530 100644
--- a/transformer_engine/common/CMakeLists.txt
+++ b/transformer_engine/common/CMakeLists.txt
@@ -5,15 +5,6 @@
 cmake_minimum_required(VERSION 3.21)
 
 # Language options
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
-    set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
-  elseif (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
-    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90 100 120)
-  else ()
-    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
-  endif()
-endif()
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
@@ -30,8 +21,62 @@ project(transformer_engine LANGUAGES CUDA CXX)
 
 # CUDA Toolkit
 find_package(CUDAToolkit REQUIRED)
-if (CUDAToolkit_VERSION VERSION_LESS 12.0)
-  message(FATAL_ERROR "CUDA 12.0+ is required, but found CUDA ${CUDAToolkit_VERSION}")
+if (CUDAToolkit_VERSION VERSION_LESS 12.1)
+  message(FATAL_ERROR "CUDA 12.1+ is required, but found CUDA ${CUDAToolkit_VERSION}")
+endif()
+
+# Process GPU architectures
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
+  elseif (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
+    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90 100 120)
+  else ()
+    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
+  endif()
+endif()
+
+# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures
+set(NVTE_GENERIC_ARCHS)
+set(NVTE_SPECIFIC_ARCHS)
+
+# Check for architecture 100
+list(FIND CMAKE_CUDA_ARCHITECTURES "100" arch_100_index)
+if(NOT arch_100_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "100")
+  list(APPEND NVTE_GENERIC_ARCHS "100")
+  list(APPEND NVTE_SPECIFIC_ARCHS "100a")
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
+    list(APPEND NVTE_SPECIFIC_ARCHS "103a")
+  endif()
+endif()
+
+# Check for architecture 101 (if we see this we are in toolkit <= 12.9)
+list(FIND CMAKE_CUDA_ARCHITECTURES "101" arch_101_index)
+if(NOT arch_101_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "101")
+  list(APPEND NVTE_GENERIC_ARCHS "101")
+  list(APPEND NVTE_SPECIFIC_ARCHS "101a")
+endif()
+
+# Check for architecture 110 (if we see this we are in toolkit >= 13.0)
+list(FIND CMAKE_CUDA_ARCHITECTURES "110" arch_110_index)
+if(NOT arch_110_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110")
+  list(APPEND NVTE_GENERIC_ARCHS "110")
+  list(APPEND NVTE_SPECIFIC_ARCHS "110f")
+endif()
+
+# Check for architecture 120
+list(FIND CMAKE_CUDA_ARCHITECTURES "120" arch_120_index)
+if(NOT arch_120_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "120")
+  list(APPEND NVTE_GENERIC_ARCHS "120")
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
+    list(APPEND NVTE_SPECIFIC_ARCHS "120f")
+  else()
+    list(APPEND NVTE_SPECIFIC_ARCHS "120a")
+  endif()
 endif()
 
 # cuDNN frontend API
@@ -78,9 +123,28 @@ endif()
 # Configure Transformer Engine library
 include_directories(${PROJECT_SOURCE_DIR}/..)
 set(transformer_engine_SOURCES)
-list(APPEND transformer_engine_SOURCES
+set(transformer_engine_cpp_sources)
+set(transformer_engine_cuda_sources)
+set(transformer_engine_cuda_arch_specific_sources)
+
+list(APPEND transformer_engine_cpp_sources
      cudnn_utils.cpp
      transformer_engine.cpp
+     fused_attn/fused_attn.cpp
+     gemm/config.cpp
+     normalization/common.cpp
+     normalization/layernorm/ln_api.cpp
+     normalization/rmsnorm/rmsnorm_api.cpp
+     util/cuda_driver.cpp
+     util/cuda_nvml.cpp
+     util/cuda_runtime.cpp
+     util/multi_stream.cpp
+     util/rtc.cpp
+     comm_gemm_overlap/userbuffers/ipcsocket.cc
+     comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+     comm_gemm_overlap/comm_gemm_overlap.cpp)
+
+list(APPEND transformer_engine_cuda_sources
      common.cu
      multi_tensor/adam.cu
      multi_tensor/compute_scale.cu
@@ -92,40 +156,23 @@ list(APPEND transformer_engine_SOURCES
      transpose/cast_transpose_fusion.cu
      transpose/transpose_fusion.cu
      transpose/multi_cast_transpose.cu
-     transpose/quantize_transpose_square_blockwise.cu
      transpose/quantize_transpose_vector_blockwise.cu
      transpose/swap_first_dims.cu
-     transpose/quantize_transpose_vector_blockwise_fp4.cu
-     activation/gelu.cu
      dropout/dropout.cu
      fused_attn/flash_attn.cu
      fused_attn/context_parallel.cu
      fused_attn/kv_cache.cu
      fused_attn/fused_attn_f16_max512_seqlen.cu
      fused_attn/fused_attn_f16_arbitrary_seqlen.cu
-     activation/relu.cu
-     activation/swiglu.cu
      fused_attn/fused_attn_fp8.cu
-     fused_attn/fused_attn.cpp
      fused_attn/utils.cu
-     gemm/config.cpp
      gemm/cublaslt_gemm.cu
-     gemm/cutlass_grouped_gemm.cu
-     normalization/common.cpp
-     normalization/layernorm/ln_api.cpp
      normalization/layernorm/ln_bwd_semi_cuda_kernel.cu
      normalization/layernorm/ln_fwd_cuda_kernel.cu
-     normalization/rmsnorm/rmsnorm_api.cpp
      normalization/rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu
      normalization/rmsnorm/rmsnorm_fwd_cuda_kernel.cu
      permutation/permutation.cu
-     util/cast.cu
      util/padding.cu
-     util/cuda_driver.cpp
-     util/cuda_nvml.cpp
-     util/cuda_runtime.cpp
-     util/multi_stream.cpp
-     util/rtc.cpp
      swizzle/swizzle.cu
      swizzle/swizzle_block_scaling.cu
      fused_softmax/scaled_masked_softmax.cu
@@ -139,12 +186,58 @@ list(APPEND transformer_engine_SOURCES
      recipe/delayed_scaling.cu
      recipe/fp8_block_scaling.cu
      recipe/nvfp4.cu
+     comm_gemm_overlap/userbuffers/userbuffers.cu)
+
+list(APPEND transformer_engine_cuda_arch_specific_sources
+     gemm/cutlass_grouped_gemm.cu
+     util/cast.cu
+     activation/gelu.cu
+     activation/relu.cu
+     activation/swiglu.cu
+     transpose/quantize_transpose_square_blockwise.cu
+     transpose/quantize_transpose_vector_blockwise_fp4.cu
      hadamard_transform/hadamard_transform.cu
-     hadamard_transform/hadamard_transform_cast_fusion.cu
-     comm_gemm_overlap/userbuffers/ipcsocket.cc
-     comm_gemm_overlap/userbuffers/userbuffers-host.cpp
-     comm_gemm_overlap/userbuffers/userbuffers.cu
-     comm_gemm_overlap/comm_gemm_overlap.cpp)
+     hadamard_transform/hadamard_transform_cast_fusion.cu)
+
+# Compiling the files with the worst compilation time first to hopefully overlap
+# better with the faster-compiling cpp files
+list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_sources}
+                                       ${transformer_engine_cuda_sources}
+                                       ${transformer_engine_cpp_sources})
+
+# Set compile options for CUDA sources with generic architectures
+foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
+  set(arch_compile_options)
+  foreach(arch IN LISTS NVTE_GENERIC_ARCHS)
+    list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
+  endforeach()
+
+  if(arch_compile_options)
+    set_property(
+      SOURCE ${cuda_source}
+      APPEND
+      PROPERTY
+      COMPILE_OPTIONS ${arch_compile_options}
+    )
+  endif()
+endforeach()
+
+# Set compile options for CUDA sources with specific architectures
+foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
+  set(arch_compile_options)
+  foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS)
+    list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
+  endforeach()
+
+  if(arch_compile_options)
+    set_property(
+      SOURCE ${cuda_source}
+      APPEND
+      PROPERTY
+      COMPILE_OPTIONS ${arch_compile_options}
+    )
+  endif()
+endforeach()
 
 if (NVTE_WITH_CUBLASMP)
 list(APPEND transformer_engine_SOURCES
@@ -249,28 +342,35 @@ target_include_directories(transformer_engine PRIVATE
                            "${CMAKE_CURRENT_BINARY_DIR}/string_headers")
 
 # Compiler options
-set_source_files_properties(fused_softmax/scaled_masked_softmax.cu
-                            fused_softmax/scaled_upper_triang_masked_softmax.cu
-                            fused_softmax/scaled_aligned_causal_masked_softmax.cu
-                            multi_tensor/adam.cu
-                            multi_tensor/compute_scale.cu
-                            multi_tensor/l2norm.cu
-                            multi_tensor/scale.cu
-                            multi_tensor/sgd.cu
-                            fused_attn/flash_attn.cu
-                            fused_attn/context_parallel.cu
-                            fused_attn/kv_cache.cu
-                            PROPERTIES
-                            COMPILE_OPTIONS "--use_fast_math")
+set(nvte_sources_with_fast_math)
+list(APPEND nvte_sources_with_fast_math fused_softmax/scaled_masked_softmax.cu
+                                        fused_softmax/scaled_upper_triang_masked_softmax.cu
+                                        fused_softmax/scaled_aligned_causal_masked_softmax.cu
+                                        multi_tensor/adam.cu
+                                        multi_tensor/compute_scale.cu
+                                        multi_tensor/l2norm.cu
+                                        multi_tensor/scale.cu
+                                        multi_tensor/sgd.cu
+                                        fused_attn/flash_attn.cu
+                                        fused_attn/context_parallel.cu
+                                        fused_attn/kv_cache.cu)
+
 option(NVTE_BUILD_ACTIVATION_WITH_FAST_MATH "Compile activation kernels with --use_fast_math option" OFF)
 if (NVTE_BUILD_ACTIVATION_WITH_FAST_MATH)
-  set_source_files_properties(activation/gelu.cu
-                              activation/relu.cu
-                              activation/swiglu.cu
-                              util/cast.cu
-                              PROPERTIES
-                              COMPILE_OPTIONS "--use_fast_math")
+  list(APPEND nvte_sources_with_fast_math activation/gelu.cu
+                                          activation/relu.cu
+                                          activation/swiglu.cu
+                                          util/cast.cu)
 endif()
+
+foreach(cuda_source IN LISTS nvte_sources_with_fast_math)
+  set_property(
+    SOURCE ${cuda_source}
+    APPEND
+    PROPERTY
+    COMPILE_OPTIONS "--use_fast_math")
+endforeach()
+
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
 
diff --git a/transformer_engine/common/hadamard_transform/hadamard_transform_cast_fusion.cu b/transformer_engine/common/hadamard_transform/hadamard_transform_cast_fusion.cu
index ce191b5ffd..263a32623e 100644
--- a/transformer_engine/common/hadamard_transform/hadamard_transform_cast_fusion.cu
+++ b/transformer_engine/common/hadamard_transform/hadamard_transform_cast_fusion.cu
@@ -97,22 +97,23 @@ cutlass::Array<cutlass::float_e2m1_t, 8>
 StochasticNumericConverterBase(cutlass::Array<float, 8> const &input, cutlass::Array<uint32_t, 2> const &rbits) {
   using result_type = cutlass::Array<cutlass::float_e2m1_t, 8>;
   result_type output;
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  auto output_ptr = reinterpret_cast<uint16_t *>(&output);
-  asm volatile( \
-      "{\n" \
-      "cvt.rs.satfinite.e2m1x4.f32   %0, {%5, %4, %3, %2}, %10;\n" \
-      "cvt.rs.satfinite.e2m1x4.f32   %1, {%9, %8, %7, %6}, %11;\n" \
-      "}" \
-      : "=h"(output_ptr[0]),
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    auto output_ptr = reinterpret_cast<uint16_t *>(&output);
+    asm volatile( \
+        "{\n" \
+        "cvt.rs.satfinite.e2m1x4.f32   %0, {%5, %4, %3, %2}, %10;\n" \
+        "cvt.rs.satfinite.e2m1x4.f32   %1, {%9, %8, %7, %6}, %11;\n" \
+        "}" \
+        : "=h"(output_ptr[0]),
         "=h"(output_ptr[1])
-      : "f"(input[0]), "f"(input[1]), "f"(input[2]), "f"(input[3]),
+        : "f"(input[0]), "f"(input[1]), "f"(input[2]), "f"(input[3]),
         "f"(input[4]), "f"(input[5]), "f"(input[6]), "f"(input[7]),
         "r"(rbits[0]), "r"(rbits[1]));
-#else
-  NVTE_DEVICE_ERROR("FP4 cvt PTX instructions are architecture-specific. "
-                    "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  } else {
+    NVTE_DEVICE_ERROR("FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return output;
 }
 
diff --git a/transformer_engine/common/transpose/quantize_transpose_vector_blockwise_fp4.cu b/transformer_engine/common/transpose/quantize_transpose_vector_blockwise_fp4.cu
index eced2c4bb6..fed18c51f8 100644
--- a/transformer_engine/common/transpose/quantize_transpose_vector_blockwise_fp4.cu
+++ b/transformer_engine/common/transpose/quantize_transpose_vector_blockwise_fp4.cu
@@ -264,48 +264,50 @@ __device__ __forceinline__ size_t scale_factor_swizzled_offset(size_t row_idx, s
 
 __device__ __forceinline__ __nv_fp4x4_e2m1 cvt_fp32_to_fp4_4x_with_stochastic_rounding(
     const float2 in01, const float2 in23, const uint32_t rbits) {
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  uint16_t out_4x;
-  asm volatile(
-      "{\n"
-      "cvt.rs.satfinite.e2m1x4.f32 %0, {%3, %4, %1, %2}, %5; \n\t"
-      "}"
-      : "=h"(out_4x)
-      : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x), "r"(rbits));
-  return *reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x);
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-  uint16_t dummy = 0;
-  return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    uint16_t out_4x;
+    asm volatile(
+        "{\n"
+        "cvt.rs.satfinite.e2m1x4.f32 %0, {%3, %4, %1, %2}, %5; \n\t"
+        "}"
+        : "=h"(out_4x)
+        : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x), "r"(rbits));
+    return *reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x);
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt.rs PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+    uint16_t dummy = 0;
+    return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
+  }
 }
 
 __device__ __forceinline__ __nv_fp4x4_e2m1 cvt_fp32_to_fp4_4x_with_rn(const float2 in01,
                                                                       const float2 in23,
                                                                       const uint32_t rbits) {
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  // NOTE: rbits unused for rn.
-  uint32_t out_4x;  // Only need 16 bit. Using 32 bit container for packing.
-  asm volatile(
-      "{\n"
-      ".reg.b8 f0; \n\t"
-      ".reg.b8 f1; \n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f0, %1, %2;\n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f1, %3, %4;\n\t"
-      "mov.b32 %0, {f0, f1, f0, f1};\n\t"
-      "}"
-      : "=r"(out_4x)
-      : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x));
-  return reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x)[0];
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-  uint16_t dummy = 0;
-  return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_fp4 = ARCH_BLACKWELL_FAMILY;
+  if constexpr (has_fp4) {
+    // NOTE: rbits unused for rn.
+    uint32_t out_4x;  // Only need 16 bit. Using 32 bit container for packing.
+    asm volatile(
+        "{\n"
+        ".reg.b8 f0; \n\t"
+        ".reg.b8 f1; \n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f0, %1, %2;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f1, %3, %4;\n\t"
+        "mov.b32 %0, {f0, f1, f0, f1};\n\t"
+        "}"
+        : "=r"(out_4x)
+        : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x));
+    return reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x)[0];
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+    uint16_t dummy = 0;
+    return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
+  }
 }
 
 template <bool kApplyStochasticRounding>
diff --git a/transformer_engine/common/util/nvfp4_transpose.cuh b/transformer_engine/common/util/nvfp4_transpose.cuh
index 712b557c5d..45fa29f0e9 100644
--- a/transformer_engine/common/util/nvfp4_transpose.cuh
+++ b/transformer_engine/common/util/nvfp4_transpose.cuh
@@ -15,10 +15,9 @@
 #include <cudaTypedefs.h>
 #include <cuda_runtime.h>
 
-#if CUDA_VERSION > 12080
+#if FP4_TYPE_SUPPORTED
 #include <cuda_fp4.h>
-#endif  // CUDA_VERSION > 12080
-
+#endif  // FP4_TYPE_SUPPORTED
 #include <cfloat>
 
 #include "../common.h"
@@ -30,7 +29,7 @@
 
 namespace transformer_engine {
 
-#if CUDA_VERSION > 12080
+#if FP4_TYPE_SUPPORTED
 namespace nvfp4_transpose {
 
 using RNG = decltype(curanddx::Generator<curanddx::philox4_32>() + curanddx::PhiloxRounds<10>() +
@@ -152,89 +151,89 @@ __device__ __forceinline__ uint32_t get_rbits(RNG &rng, uint4 &random_uint4, int
   return rbits;
 }
 
-#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-
 __device__ __forceinline__ fp4e2m1x4 mul_cvt_bf16_to_fp4_4x_with_stochastic_rounding(
     const uint64_t in_4x, const float2 scale, const uint32_t rbits) {
   uint16_t out_4x = 0;
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b16 v0_bf16; \n\t"
-      ".reg.b16 v1_bf16; \n\t"
-      ".reg.b16 v2_bf16; \n\t"
-      ".reg.b16 v3_bf16; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
-      "cvt.f32.bf16 v0, v0_bf16; \n\t"
-      "cvt.f32.bf16 v1, v1_bf16; \n\t"
-      "cvt.f32.bf16 v2, v2_bf16; \n\t"
-      "cvt.f32.bf16 v3, v3_bf16; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %3; \n\t"  // mind the shuffled elements order
-      "}"
-      : "=h"(out_4x)
-      : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b16 v0_bf16; \n\t"
+        ".reg.b16 v1_bf16; \n\t"
+        ".reg.b16 v2_bf16; \n\t"
+        ".reg.b16 v3_bf16; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
+        "cvt.f32.bf16 v0, v0_bf16; \n\t"
+        "cvt.f32.bf16 v1, v1_bf16; \n\t"
+        "cvt.f32.bf16 v2, v2_bf16; \n\t"
+        "cvt.f32.bf16 v3, v3_bf16; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %3; \n\t"  // mind the shuffled elements order
+        "}"
+        : "=h"(out_4x)
+        : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return *reinterpret_cast<fp4e2m1x4 *>(&out_4x);
 }
 
 __device__ __forceinline__ fp4e2m1x4 mul_cvt_bf16_to_fp4_4x_with_rn(const uint64_t in_4x,
                                                                     const float2 scale,
                                                                     const uint32_t rbits) {
-  // NOTE: rbits unused for rn.
+  constexpr bool is_blackwell = ARCH_BLACKWELL_FAMILY;
   uint32_t out_4x = 0;  // Only need 16 bit. Using 32 bit container for packing.
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b16 v0_bf16; \n\t"
-      ".reg.b16 v1_bf16; \n\t"
-      ".reg.b16 v2_bf16; \n\t"
-      ".reg.b16 v3_bf16; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      ".reg.b8 f0; \n\t"
-      ".reg.b8 f1; \n\t"
-      "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
-      "cvt.f32.bf16 v0, v0_bf16; \n\t"
-      "cvt.f32.bf16 v1, v1_bf16; \n\t"
-      "cvt.f32.bf16 v2, v2_bf16; \n\t"
-      "cvt.f32.bf16 v3, v3_bf16; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
-      "mov.b32 %0, {f0, f1, f0, f1};\n\t"
-      "}"
-      : "=r"(out_4x)
-      : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  if constexpr (is_blackwell) {
+    // NOTE: rbits unused for rn.
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b16 v0_bf16; \n\t"
+        ".reg.b16 v1_bf16; \n\t"
+        ".reg.b16 v2_bf16; \n\t"
+        ".reg.b16 v3_bf16; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        ".reg.b8 f0; \n\t"
+        ".reg.b8 f1; \n\t"
+        "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
+        "cvt.f32.bf16 v0, v0_bf16; \n\t"
+        "cvt.f32.bf16 v1, v1_bf16; \n\t"
+        "cvt.f32.bf16 v2, v2_bf16; \n\t"
+        "cvt.f32.bf16 v3, v3_bf16; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
+        "mov.b32 %0, {f0, f1, f0, f1};\n\t"
+        "}"
+        : "=r"(out_4x)
+        : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return reinterpret_cast<fp4e2m1x4 *>(&out_4x)[0];
 }
 
@@ -252,34 +251,35 @@ __device__ __forceinline__ fp4e2m1x4 mul_cvt_bf16_to_fp4_4x(const uint64_t in_4x
 __device__ __forceinline__ fp4e2m1x4 mul_cvt_fp32_to_fp4_4x_with_stochastic_rounding(
     const float2 in01, const float2 in23, const float2 scale, const uint32_t rbits) {
   uint16_t out_4x = 0;
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      "mov.b64 {v0, v1} , %1; \n\t"
-      "mov.b64 {v2, v3} , %2; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %4; \n\t"  // mind the shuffled elements order
-      "}"
-      : "=h"(out_4x)
-      : "l"(reinterpret_cast<const uint64_t &>(in01)),
-        "l"(reinterpret_cast<const uint64_t &>(in23)),
-        "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        "mov.b64 {v0, v1} , %1; \n\t"
+        "mov.b64 {v2, v3} , %2; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %4; \n\t"  // mind the shuffled elements order
+        "}"
+        : "=h"(out_4x)
+        : "l"(reinterpret_cast<const uint64_t &>(in01)),
+          "l"(reinterpret_cast<const uint64_t &>(in23)),
+          "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return *reinterpret_cast<fp4e2m1x4 *>(&out_4x);
 }
 
@@ -287,40 +287,41 @@ __device__ __forceinline__ fp4e2m1x4 mul_cvt_fp32_to_fp4_4x_with_rn(const float2
                                                                     const float2 in23,
                                                                     const float2 scale,
                                                                     const uint32_t rbits) {
-  // NOTE: rbits unused for rn.
+  constexpr bool is_blackwell = ARCH_BLACKWELL_FAMILY;
   uint32_t out_4x = 0;  // Only need 16 bit. Using 32 bit container for packing.
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      ".reg.b8 f0; \n\t"
-      ".reg.b8 f1; \n\t"
-      "mov.b64 {v0, v1} , %1; \n\t"
-      "mov.b64 {v2, v3} , %2; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
-      "mov.b32 %0, {f0, f1, f0, f1};\n\t"
-      "}"
-      : "=r"(out_4x)
-      : "l"(reinterpret_cast<const uint64_t &>(in01)),
-        "l"(reinterpret_cast<const uint64_t &>(in23)),
-        "l"(reinterpret_cast<const uint64_t &>(scale)));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  if constexpr (is_blackwell) {
+    // NOTE: rbits unused for rn.
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        ".reg.b8 f0; \n\t"
+        ".reg.b8 f1; \n\t"
+        "mov.b64 {v0, v1} , %1; \n\t"
+        "mov.b64 {v2, v3} , %2; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
+        "mov.b32 %0, {f0, f1, f0, f1};\n\t"
+        "}"
+        : "=r"(out_4x)
+        : "l"(reinterpret_cast<const uint64_t &>(in01)),
+          "l"(reinterpret_cast<const uint64_t &>(in23)),
+          "l"(reinterpret_cast<const uint64_t &>(scale)));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return reinterpret_cast<fp4e2m1x4 *>(&out_4x)[0];
 }
 
@@ -335,8 +336,6 @@ __device__ __forceinline__ fp4e2m1x4 mul_cvt_fp32_to_fp4_4x(const float2 in01, c
   }
 }
 
-#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-
 template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &),
           typename IType, bool USE_STOCHASTIC_ROUNDING, bool RETURN_TRANSPOSE>
 __global__ void __launch_bounds__(THREADS_NUM)
@@ -1380,18 +1379,13 @@ __global__ void __launch_bounds__(THREADS_NUM)
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 }  // namespace nvfp4_transpose
-#endif  // CUDA_VERSION > 12080
-
-// Compile-time flag to choose kernel variant
-#ifndef USE_2D_NVFP4_KERNEL
-#define USE_2D_NVFP4_KERNEL 0
-#endif
+#endif  // FP4_TYPE_SUPPORTED
 
 template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &),
           bool use_2d_quantization>
 void nvfp4_quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
                               const QuantizationConfig *quant_config, cudaStream_t stream) {
-#if CUDA_VERSION > 12080
+#if FP4_TYPE_SUPPORTED
   bool use_stochastic_rounding = quant_config ? quant_config->stochastic_rounding : false;
 
   // If transposed output is allocated, return the transposed data. Otherwise, it's not necesary to
@@ -1509,7 +1503,7 @@ void nvfp4_quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *o
       }););
 #else
   NVTE_ERROR("FP4 support requires CUDA 12.8+, but compile-time CUDA version is ", CUDA_VERSION);
-#endif  // CUDA_VERSION > 12080
+#endif  // FP4_TYPE_SUPPORTED
 }
 }  // namespace transformer_engine
 
diff --git a/transformer_engine/common/util/ptx.cuh b/transformer_engine/common/util/ptx.cuh
index 85717afdf2..aeac2b4a2c 100644
--- a/transformer_engine/common/util/ptx.cuh
+++ b/transformer_engine/common/util/ptx.cuh
@@ -18,44 +18,165 @@
 #include <cuda_fp4.h>
 #endif  // CUDA_VERSION >= 12080
 
+#include "common/utils.cuh"
+
 namespace transformer_engine {
+
 namespace ptx {
 
-#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+template <int N>
+struct ArchSpecific {
+  constexpr static int id = N * 10;
+
+  template <int CurrentArch, int ArchSpecific, int FamilySpecific>
+  constexpr static bool compatible() {
+    if constexpr (CurrentArch == id) {
+      static_assert(ArchSpecific == CurrentArch,
+                    "Compiled for the generic architecture, while utilizing arch-specific "
+                    "features. Please compile for smXXXa architecture instead of smXXX "
+                    "architecture.");
+      return true;
+    } else {
+      return false;
+    }
+  }
+};
+
+template <int N>
+struct FamilySpecific {
+  constexpr static int id = N * 10;
+
+  template <int CurrentArch, int ArchSpecific, int FamilySpecific>
+  constexpr static bool compatible() {
+    if constexpr ((CurrentArch / 100) == (id / 100)) {
+      static_assert(FamilySpecific == CurrentArch,
+                    "Compiled for the generic architecture, while utilizing family-specific "
+                    "features. Please compile for smXXXf architecture instead of smXXX "
+                    "architecture.");
+      return true;
+    } else {
+      return false;
+    }
+  }
+};
+
+template <int Arch, int ArchSpecific, int FamilySpecific, class T, class... U>
+constexpr bool is_supported_arch() {
+  if constexpr (T::template compatible<Arch, ArchSpecific, FamilySpecific>()) {
+    return true;
+  } else if constexpr (sizeof...(U) != 0) {
+    return is_supported_arch<Arch, ArchSpecific, FamilySpecific, U...>();
+  } else {
+    return false;
+  }
+}
+
+#if CUDA_VERSION < 12090
+#if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
+#define __CUDA_ARCH_SPECIFIC__ 900
+#define __CUDA_ARCH_FAMILY_SPECIFIC__ 900
+#endif
+#if __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
+#define __CUDA_ARCH_SPECIFIC__ 1000
+#define __CUDA_ARCH_FAMILY_SPECIFIC__ 1000
+#endif
+#if __CUDA_ARCH_HAS_FEATURE__(SM101_ALL)
+#define __CUDA_ARCH_SPECIFIC__ 1010
+#define __CUDA_ARCH_FAMILY_SPECIFIC__ 1010
+#endif
+#if __CUDA_ARCH_HAS_FEATURE__(SM120_ALL)
+#define __CUDA_ARCH_SPECIFIC__ 1200
+#define __CUDA_ARCH_FAMILY_SPECIFIC__ 1200
+#endif
+#endif
+
+#ifdef __CUDA_ARCH__
+#define __NVTE_CURRENT_ARCH__ constexpr int current_arch = __CUDA_ARCH__;
+#else
+#define __NVTE_CURRENT_ARCH__ constexpr int current_arch = 0;
+#endif
+
+#ifdef __CUDA_ARCH_SPECIFIC__
+#define __NVTE_ARCH_SPECIFIC__ constexpr int ArchSpecific = __CUDA_ARCH_SPECIFIC__;
+#else
+#define __NVTE_ARCH_SPECIFIC__ constexpr int ArchSpecific = 0;
+#endif
+
+#ifdef __CUDA_ARCH_FAMILY_SPECIFIC__
+#define __NVTE_ARCH_FAMILY_SPECIFIC__ constexpr int FamilySpecific = __CUDA_ARCH_FAMILY_SPECIFIC__;
+#else
+#define __NVTE_ARCH_FAMILY_SPECIFIC__ constexpr int FamilySpecific = 0;
+#endif
+
+#define NVTE_CUDA_ARCH_MATCHES(...)                                                               \
+  [&] {                                                                                           \
+    __NVTE_CURRENT_ARCH__                                                                         \
+    __NVTE_ARCH_SPECIFIC__                                                                        \
+    __NVTE_ARCH_FAMILY_SPECIFIC__                                                                 \
+    return transformer_engine::ptx::is_supported_arch<current_arch, ArchSpecific, FamilySpecific, \
+                                                      __VA_ARGS__>();                             \
+  }();
+
+#define ARCH_BLACKWELL_FAMILY                                                \
+  NVTE_CUDA_ARCH_MATCHES(ptx::FamilySpecific<100>, ptx::FamilySpecific<110>, \
+                         ptx::FamilySpecific<120>)
+#define ARCH_HAS_STOCHASTIC_ROUNDING \
+  NVTE_CUDA_ARCH_MATCHES(ptx::ArchSpecific<100>, ptx::ArchSpecific<103>)
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
 __device__ __forceinline__ void mbarrier_init(uint64_t *mbar, const uint32_t count) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t mbar_ptr = __cvta_generic_to_shared(mbar);
   asm volatile("mbarrier.init.shared.b64 [%0], %1;" ::"r"(mbar_ptr), "r"(count) : "memory");
+#else
+  NVTE_DEVICE_ERROR("mbarrier_init is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
 __device__ __forceinline__ void mbarrier_invalid(uint64_t *mbar) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t mbar_ptr = __cvta_generic_to_shared(mbar);
   asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(mbar_ptr) : "memory");
+#else
+  NVTE_DEVICE_ERROR("mbarrier_invalid is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 __device__ __forceinline__ void mbarrier_arrive(uint64_t *mbar) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t mbar_ptr = __cvta_generic_to_shared(mbar);
   asm volatile("mbarrier.arrive.shared.b64 _, [%0];" ::"r"(mbar_ptr) : "memory");
+#else
+  NVTE_DEVICE_ERROR("mbarrier_arrive is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
 __device__ __forceinline__ void mbarrier_arrive_expect_tx(uint64_t *mbar, const uint32_t tx_count) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t mbar_ptr = __cvta_generic_to_shared(mbar);
   asm volatile("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" ::"r"(mbar_ptr), "r"(tx_count)
                : "memory");
+#else
+  NVTE_DEVICE_ERROR("mbarrier_arrive_expect_tx is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ void fence_mbarrier_init_release_cluster() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   asm volatile("fence.mbarrier_init.release.cluster;");
+#else
+  NVTE_DEVICE_ERROR("fence_mbarrier_init_release_cluster is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
 // global -> shared::cluster
 __device__ __forceinline__ void cp_async_bulk_tensor_1d_global_to_shared(
     uint64_t *dst_shmem, const uint64_t *src_global_ptr, const uint32_t size, uint64_t *mbar) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t dst_shmem_ptr = __cvta_generic_to_shared(dst_shmem);
   uint32_t mbar_ptr = __cvta_generic_to_shared(mbar);
   // triggers async copy, i.e. the thread continues until wait() on mbarrier
@@ -67,6 +188,9 @@ __device__ __forceinline__ void cp_async_bulk_tensor_1d_global_to_shared(
       ".mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ::"r"(dst_shmem_ptr),
       "l"(src_global_ptr), "r"(size), "r"(mbar_ptr)
       : "memory");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_tensor_1d_global_to_shared is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
@@ -74,6 +198,7 @@ __device__ __forceinline__ void cp_async_bulk_tensor_1d_global_to_shared(
 __device__ __forceinline__ void cp_async_bulk_tensor_2d_global_to_shared(
     uint64_t *dst_shmem, const uint64_t *tensor_map_ptr, const uint32_t offset_x,
     const uint32_t offset_y, uint64_t *mbar) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t dst_shmem_ptr = __cvta_generic_to_shared(dst_shmem);
   uint32_t mbar_ptr = __cvta_generic_to_shared(mbar);
   // triggers async copy, i.e. the thread continues until wait() on mbarrier
@@ -85,9 +210,13 @@ __device__ __forceinline__ void cp_async_bulk_tensor_2d_global_to_shared(
       ".mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], [%4];" ::"r"(dst_shmem_ptr),
       "l"(tensor_map_ptr), "r"(offset_x), "r"(offset_y), "r"(mbar_ptr)
       : "memory");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_tensor_2d_global_to_shared is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ bool mbarrier_try_wait_parity(uint32_t mbar_ptr, const uint32_t parity) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t waitComplete;
   asm volatile(
       "{\n\t .reg .pred P_OUT; \n\t"
@@ -98,15 +227,21 @@ __device__ __forceinline__ bool mbarrier_try_wait_parity(uint32_t mbar_ptr, cons
       : "r"(mbar_ptr), "r"(parity)
       : "memory");
   return static_cast<bool>(waitComplete);
+#else
+  NVTE_DEVICE_ERROR("mbarrier_try_wait_parity is only supported on SM 10.0+.");
+#endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  return true;
 }
 
 __device__ __forceinline__ void mbarrier_wait_parity(uint64_t *mbar, const uint32_t parity) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   uint32_t mbar_ptr = __cvta_generic_to_shared(mbar);
   while (!mbarrier_try_wait_parity(mbar_ptr, parity)) {
   }
-}
-
+#else
+  NVTE_DEVICE_ERROR("mbarrier_wait_parity is only supported on SM 10.0+.");
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+}
 
 constexpr uint32_t FP32_MANTISSA_BITS = 23;
 constexpr uint32_t FP32_EXPONENT_BIAS = 127;
@@ -121,55 +256,53 @@ __device__ __forceinline__ float exp2f(e8m0_t biased_exp) {
   return __int_as_float(biased_exp << FP32_MANTISSA_BITS);
 }
 
-#define CUDA_ARCH_HAS_FEATURE_SM10X_ALL                                                \
-  ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
-   (__CUDA_ARCH_HAS_FEATURE__(SM103_ALL)))
-
 __device__ __forceinline__ e8m0_t float_to_e8m0(float val) {
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-
-  uint16_t out;
-  asm volatile(
-      "{\n"
-      "cvt.rp.satfinite.ue8m0x2.f32  %0, 0.0, %1;\n"
-      "}"
-      : "=h"(out)
-      : "f"(val));
-  return *reinterpret_cast<e8m0_t *>(&out);
-#else
-  // TODO: nan/inf needs to be set for any value
-  // of nan/inf in input not just amax.
-  if (isnan(val)) {
-    return 0xFF;
-  }
-  if (isinf(val)) {
-    return 0xFE;
-  }
-  if (val == 0.0f) {
-    return 0x00;
-  }
-  uint32_t val_u32 = *reinterpret_cast<uint32_t *>(&val);
-  e8m0_t exponent = (val_u32 >> FP32_MANTISSA_BITS);
-  uint32_t mantissa = val_u32 & 0x7FFFFF;
-  // Round up exponent and deal with satfinite.
-  if ((mantissa > 0 && exponent != 0xFE) && !(exponent == 0 && mantissa <= 0x400000)) {
-    ++exponent;
+  constexpr bool is_blackwell = ARCH_BLACKWELL_FAMILY;
+  if constexpr (is_blackwell) {
+    uint16_t out;
+    asm volatile(
+        "{\n"
+        "cvt.rp.satfinite.ue8m0x2.f32  %0, 0.0, %1;\n"
+        "}"
+        : "=h"(out)
+        : "f"(val));
+    return *reinterpret_cast<e8m0_t *>(&out);
+  } else {
+    // TODO: nan/inf needs to be set for any value
+    // of nan/inf in input not just amax.
+    if (isnan(val)) {
+      return 0xFF;
+    }
+    if (isinf(val)) {
+      return 0xFE;
+    }
+    if (val == 0.0f) {
+      return 0x00;
+    }
+    uint32_t val_u32 = *reinterpret_cast<uint32_t *>(&val);
+    e8m0_t exponent = (val_u32 >> FP32_MANTISSA_BITS);
+    uint32_t mantissa = val_u32 & 0x7FFFFF;
+    // Round up exponent and deal with satfinite.
+    if ((mantissa > 0 && exponent != 0xFE) && !(exponent == 0 && mantissa <= 0x400000)) {
+      ++exponent;
+    }
+    return exponent;
   }
-  return exponent;
-#endif
 }
 
-#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
-
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
 // shared::cta -> global
 __device__ __forceinline__ void cp_async_bulk_tensor_1d_shared_to_global(uint64_t *dst_global_ptr,
                                                                          const uint64_t *src_shmem,
                                                                          const uint32_t size) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   uint32_t src_shmem_ptr = __cvta_generic_to_shared(src_shmem);
   asm volatile("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;" ::"l"(dst_global_ptr),
                "r"(src_shmem_ptr), "r"(size)
                : "memory");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_tensor_1d_shared_to_global is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
@@ -177,51 +310,93 @@ __device__ __forceinline__ void cp_async_bulk_tensor_1d_shared_to_global(uint64_
 __device__ __forceinline__ void cp_async_bulk_tensor_2d_shared_to_global(
     const uint64_t *tensor_map_ptr, const uint32_t offset_x, const uint32_t offset_y,
     uint64_t *src_shmem) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   uint32_t src_shmem_ptr = __cvta_generic_to_shared(src_shmem);
   asm volatile("cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, {%1, %2}], [%3];" ::"l"(
                    tensor_map_ptr),
                "r"(offset_x), "r"(offset_y), "r"(src_shmem_ptr)
                : "memory");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_tensor_2d_shared_to_global is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
 __device__ __forceinline__ void cp_async_bulk_wait_group() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.wait_group 0;");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_wait_group is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
 template <size_t W>
 __device__ __forceinline__ void cp_async_bulk_wait_group_read() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.wait_group.read 0;");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_wait_group_read is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 
 template <>
 __device__ __forceinline__ void cp_async_bulk_wait_group_read<0>() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.wait_group.read 0;");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_wait_group_read is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 template <>
 __device__ __forceinline__ void cp_async_bulk_wait_group_read<1>() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.wait_group.read 1;");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_wait_group_read is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 template <>
 __device__ __forceinline__ void cp_async_bulk_wait_group_read<2>() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.wait_group.read 2;");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_wait_group_read is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 template <>
 __device__ __forceinline__ void cp_async_bulk_wait_group_read<4>() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.wait_group.read 4;");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_wait_group_read is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
 __device__ __forceinline__ void cp_async_bulk_commit_group() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.commit_group;");
+#else
+  NVTE_DEVICE_ERROR("cp_async_bulk_commit_group is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 
 // Proxy fence (bi-directional):
-__device__ __forceinline__ void fence_proxy_async() { asm volatile("fence.proxy.async;"); }
+__device__ __forceinline__ void fence_proxy_async() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+  asm volatile("fence.proxy.async;");
+#else
+  NVTE_DEVICE_ERROR("fence_proxy_async is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+}
 
 __device__ __forceinline__ void fence_proxy_async_shared_cta() {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("fence.proxy.async.shared::cta;");
+#else
+  NVTE_DEVICE_ERROR("fence_proxy_async_shared_cta is only supported on SM 9.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
 
 template <typename T>
@@ -282,15 +457,6 @@ static_assert(sizeof(fp4e2m1x2) == 1);
 static_assert(sizeof(fp4e2m1x4) == 2);
 #endif  // CUDA_VERSION >= 12080
 
-// cvt.rn.satfinite.e2m1x2.f32 d, a, b;  // Convert two FP32 values to two packed e2m1
-
-// cvt.rn.satfinite{.relu}.{e2m1x2/e2m3x2/e3m2x2/ue8m0x2}.f32 introduced in PTX ISA version 8.6.
-
-// vt.rn.satfinite{.relu}.{e2m1x2/e2m3x2/e3m2x2/ue8m0x2}.f32 is supported on following architectures:
-// sm_100a
-// sm_101a
-// sm_120a
-
 // When converting to .e2m1x2 data formats, the destination operand d has .b8 type.
 // When converting two .f32 inputs to .e2m1x2, each input is converted to the specified format,
 // and the converted values are packed in the destination operand d such that the value
@@ -313,6 +479,7 @@ __device__ __forceinline__ void mul_cvt_4x(fp4e2m1x4 &out, const Tx2 &in01, cons
 // SIMD like "Fused" cast + multiplication (x2)
 __device__ __forceinline__ void mul_cvt_2x(fp8e4m3x2 &out, const floatx2 &in,
                                            const floatx2 &scale) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   asm volatile(
       "{\n"
       ".reg.b64 val_pair; \n\t"
@@ -325,10 +492,14 @@ __device__ __forceinline__ void mul_cvt_2x(fp8e4m3x2 &out, const floatx2 &in,
       : "=h"(reinterpret_cast<uint16_t &>(out))
       : "l"(reinterpret_cast<const uint64_t &>(in)),
         "l"(reinterpret_cast<const uint64_t &>(scale)));
+#else
+  NVTE_DEVICE_ERROR("mul_cvt_2x is only supported on SM 10.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ void mul_cvt_2x(fp8e5m2x2 &out, const floatx2 &in,
                                            const floatx2 &scale) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   asm volatile(
       "{\n"
       ".reg.b64 val_pair; \n\t"
@@ -341,9 +512,13 @@ __device__ __forceinline__ void mul_cvt_2x(fp8e5m2x2 &out, const floatx2 &in,
       : "=h"(reinterpret_cast<uint16_t &>(out))
       : "l"(reinterpret_cast<const uint64_t &>(in)),
         "l"(reinterpret_cast<const uint64_t &>(scale)));
+#else
+  NVTE_DEVICE_ERROR("mul_cvt_2x is only supported on SM 10.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ void mul_cvt_2x(fp8e4m3x2 &out, const bf16x2 &in, const floatx2 &scale) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   asm volatile(
       "{\n"
       ".reg.b64 val_pair_before; \n\t"
@@ -363,9 +538,13 @@ __device__ __forceinline__ void mul_cvt_2x(fp8e4m3x2 &out, const bf16x2 &in, con
       : "=h"(reinterpret_cast<uint16_t &>(out))
       : "r"(reinterpret_cast<const uint32_t &>(in)),
         "l"(reinterpret_cast<const uint64_t &>(scale)));
+#else
+  NVTE_DEVICE_ERROR("mul_cvt_2x is only supported on SM 10.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ void mul_cvt_2x(fp8e5m2x2 &out, const bf16x2 &in, const floatx2 &scale) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   asm volatile(
       "{\n"
       ".reg.b64 val_pair_before; \n\t"
@@ -385,9 +564,13 @@ __device__ __forceinline__ void mul_cvt_2x(fp8e5m2x2 &out, const bf16x2 &in, con
       : "=h"(reinterpret_cast<uint16_t &>(out))
       : "r"(reinterpret_cast<const uint32_t &>(in)),
         "l"(reinterpret_cast<const uint64_t &>(scale)));
+#else
+  NVTE_DEVICE_ERROR("mul_cvt_2x is only supported on SM 10.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ void mul_cvt_2x(fp8e4m3x2 &out, const fp16x2 &in, const floatx2 &scale) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   asm volatile(
       "{\n"
       ".reg.b64 val_pair_before; \n\t"
@@ -407,9 +590,13 @@ __device__ __forceinline__ void mul_cvt_2x(fp8e4m3x2 &out, const fp16x2 &in, con
       : "=h"(reinterpret_cast<uint16_t &>(out))
       : "r"(reinterpret_cast<const uint32_t &>(in)),
         "l"(reinterpret_cast<const uint64_t &>(scale)));
+#else
+  NVTE_DEVICE_ERROR("mul_cvt_2x is only supported on SM 10.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ void mul_cvt_2x(fp8e5m2x2 &out, const fp16x2 &in, const floatx2 &scale) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   asm volatile(
       "{\n"
       ".reg.b64 val_pair_before; \n\t"
@@ -429,24 +616,33 @@ __device__ __forceinline__ void mul_cvt_2x(fp8e5m2x2 &out, const fp16x2 &in, con
       : "=h"(reinterpret_cast<uint16_t &>(out))
       : "r"(reinterpret_cast<const uint32_t &>(in)),
         "l"(reinterpret_cast<const uint64_t &>(scale)));
+#else
+  NVTE_DEVICE_ERROR("mul_cvt_2x is only supported on SM 10.0+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
 __device__ __forceinline__ void abs_max_2x(bf16x2 &dst, const bf16x2 &p1, const bf16x2 &p2) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
   asm volatile("max.xorsign.abs.bf16x2 %0, %1, %2;"
                : "=r"(reinterpret_cast<uint32_t &>(dst))
                : "r"(reinterpret_cast<const uint32_t &>(p1)),
                  "r"(reinterpret_cast<const uint32_t &>(p2)));
+#else
+  NVTE_DEVICE_ERROR("abs_max_2x is only supported on SM 8.9+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
 }
 
 __device__ __forceinline__ void abs_max_2x(fp16x2 &dst, const fp16x2 &p1, const fp16x2 &p2) {
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
   asm volatile("max.xorsign.abs.f16x2 %0, %1, %2;"
                : "=r"(reinterpret_cast<uint32_t &>(dst))
                : "r"(reinterpret_cast<const uint32_t &>(p1)),
                  "r"(reinterpret_cast<const uint32_t &>(p2)));
+#else
+  NVTE_DEVICE_ERROR("abs_max_2x is only supported on SM 8.9+.");
+#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
 }
 
-#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
-
 }  // namespace ptx
 
 namespace {
@@ -464,6 +660,8 @@ __forceinline__ __device__ void initialize_barriers(uint64_t *mbar, const bool i
   }
   // Syncthreads so initialized barrier is visible to all threads.
   __syncthreads();
+#else
+  NVTE_DEVICE_ERROR("initialize_barriers is only supported on SM 10.0+.");
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
@@ -479,6 +677,8 @@ __forceinline__ __device__ void destroy_barriers(uint64_t *mbar, const bool is_m
       ptx::mbarrier_invalid(&mbar[iter]);
     }
   }
+#else
+  NVTE_DEVICE_ERROR("destroy_barriers is only supported on SM 10.0+.");
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
@@ -498,6 +698,8 @@ __forceinline__ __device__ void copy_1d_to_shared(void *dst, const void *src,
     // Other threads just arrive
     ptx::mbarrier_arrive(barrier);
   }
+#else
+  NVTE_DEVICE_ERROR("copy_1d_to_shared is only supported on SM 10.0+.");
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
@@ -517,6 +719,8 @@ __forceinline__ __device__ void copy_2d_to_shared(void *dst, const void *src, co
     // Other threads just arrive
     ptx::mbarrier_arrive(barrier);
   }
+#else
+  NVTE_DEVICE_ERROR("copy_2d_to_shared is only supported on SM 10.0+.");
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
@@ -543,6 +747,8 @@ __forceinline__ __device__ void copy_2d_to_sharedx2(void *dst, const void *src,
     // Other threads just arrive
     ptx::mbarrier_arrive(barrier);
   }
+#else
+  NVTE_DEVICE_ERROR("copy_2d_to_sharedx2 is only supported on SM 10.0+.");
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
@@ -572,6 +778,8 @@ __forceinline__ __device__ void copy_2d_to_sharedx3(
     // Other threads just arrive
     ptx::mbarrier_arrive(barrier);
   }
+#else
+  NVTE_DEVICE_ERROR("copy_2d_to_sharedx3 is only supported on SM 10.0+.");
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
 
diff --git a/transformer_engine/common/utils.cuh b/transformer_engine/common/utils.cuh
index bc764ac746..2d37e9c85a 100644
--- a/transformer_engine/common/utils.cuh
+++ b/transformer_engine/common/utils.cuh
@@ -16,6 +16,7 @@
 #endif
 
 #if !defined(__CUDACC_RTC__)
+#include <cassert>
 #include <cstdint>
 #else
 // Importing C++ standard headers is a pain with NVRTC

From c4c185dbec1aab3627ab2ecffbc4c429d31f23c0 Mon Sep 17 00:00:00 2001
From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
Date: Fri, 24 Oct 2025 17:01:51 -0700
Subject: [PATCH 07/59] [PyTorch] Add max_logit support for MuonClip (#2195)

* add max_score for fused/unfused F16 non-CP

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* calculate max per head instead of max over all heads

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* fix fused attn max_score shape

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* revert FE to github

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update FE to 1.15.0-rc

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* fix merge

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* reduce ew kernels; fix causal masks; add more tests

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fix to tests

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove logic for flash-attn

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* WIP: add CP support for p2p/a2a/all_gather

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor improvements of implementation/tests

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* WIP: add thd support

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* add thd to UnfusedDPA

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* fix lint

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* more fixes for lint

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update to FE 1.15

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* remove unneeded changes

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* disable unfused for thd + pad_between_seqs

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* minor fixes

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* disable thd for unfused until bug is fixed

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix all_gather

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* fix all gather

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* rename max_score to max_logit

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* fix all_gather

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* fix all_gather

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* disable fused attn + thd

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

---------

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 3rdparty/cudnn-frontend                       |   2 +-
 .../attention/run_attention_with_cp.py        |  15 +-
 tests/pytorch/attention/test_attention.py     |  68 ++-
 .../attention/test_attention_with_cp.py       |   6 +-
 tests/pytorch/utils.py                        |   3 +
 .../common/fused_attn/fused_attn.cpp          |  80 ++--
 .../fused_attn_f16_arbitrary_seqlen.cu        | 410 ++++++++++++------
 .../fused_attn_f16_arbitrary_seqlen.h         |  46 +-
 .../common/fused_attn/fused_attn_fp8.cu       |   6 +-
 transformer_engine/common/fused_attn/utils.h  |   5 +-
 .../include/transformer_engine/fused_attn.h   |  79 ++--
 .../jax/csrc/extensions/attention.cpp         |  32 +-
 .../dot_product_attention/backends.py         |  69 ++-
 .../dot_product_attention/context_parallel.py |  79 +++-
 .../dot_product_attention.py                  |  15 +
 .../attention/dot_product_attention/utils.py  |  91 ++++
 .../pytorch/cpp_extensions/fused_attn.py      |  18 +
 transformer_engine/pytorch/csrc/extensions.h  |   4 +-
 .../pytorch/csrc/extensions/attention.cpp     |  25 +-
 19 files changed, 748 insertions(+), 305 deletions(-)

diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend
index 80a8e4af4d..0b1577c8c8 160000
--- a/3rdparty/cudnn-frontend
+++ b/3rdparty/cudnn-frontend
@@ -1 +1 @@
-Subproject commit 80a8e4af4d89d33a2c59d51fcf9fda1c9d368cd4
+Subproject commit 0b1577c8c83401237d601d0d0db5210506705396
diff --git a/tests/pytorch/attention/run_attention_with_cp.py b/tests/pytorch/attention/run_attention_with_cp.py
index 1edffaf486..5ed67c3d5e 100644
--- a/tests/pytorch/attention/run_attention_with_cp.py
+++ b/tests/pytorch/attention/run_attention_with_cp.py
@@ -248,6 +248,7 @@ def run_dpa_with_cp(
         attn_mask_type=config.attn_mask_type,
         window_size=config.window_size,
         softmax_type=config.softmax_type,
+        return_max_logit=config.return_max_logit,
     ).cuda()
     if config.softmax_type != "vanilla":
         core_attn.softmax_offset.requires_grad = True
@@ -308,6 +309,7 @@ def run_dpa_with_cp(
         fp8_context = autocast(enabled=True, recipe=fp8_recipe, amax_reduction_group=cp_comm_group)
     else:
         fp8_context = nullcontext()
+    max_logit = None
     with fp8_context:
         # q, k, v, out in FP8; dout in F16
         out = core_attn(
@@ -322,6 +324,8 @@ def run_dpa_with_cp(
             cu_seqlens_kv_padded=cu_seqlens_kv_padded,
             fp8_output=fp8_mha,
         )
+        if config.return_max_logit:
+            out, max_logit = out
         if fp8_bwd and fp8_mha:
             dout_fp8 = dout_quantizer(dout)
             out.backward(dout_fp8)
@@ -400,6 +404,7 @@ def run_dpa_with_cp(
         fp8_context = nullcontext()
 
     # run attention
+    max_logit_ = None
     with fp8_context:
         # q, k, v, out in FP8; dout in F16
         out_ = core_attn(
@@ -414,6 +419,8 @@ def run_dpa_with_cp(
             cu_seqlens_kv_padded=cu_seqlens_kv_padded,
             fp8_output=fp8_mha,
         )
+        if config.return_max_logit:
+            out_, max_logit_ = out_
         if fp8_bwd and fp8_mha:
             dout_fp8_ = dout_quantizer(dout_)
             out_.backward(dout_fp8_)
@@ -495,15 +502,15 @@ def run_dpa_with_cp(
                 )
 
     atol, rtol, rmse_tol = get_tols(config, dtype)
-    tensors_cp = [out_, dq_, dk_, dv_, d_softmax_offset_]
-    tensors_no_cp = [out, dq, dk, dv, d_softmax_offset]
-    names = ["out", "dq", "dk", "dv", "d_softmax_offset"]
+    tensors_cp = [out_, dq_, dk_, dv_, d_softmax_offset_, max_logit_]
+    tensors_no_cp = [out, dq, dk, dv, d_softmax_offset, max_logit]
+    names = ["out", "dq", "dk", "dv", "d_softmax_offset", "max_logit"]
     names_cp = [x + "_cp" for x in names]
     names_no_cp = [x + "_no_cp" for x in names]
     is_fp8 = dtype == "fp8"
     for i, t in enumerate(tensors_no_cp):
         if t is not None:
-            if "softmax_offset" not in names[i]:
+            if "softmax_offset" not in names[i] and "max_logit" not in names[i]:
                 if qkv_format == "bshd":
                     compare_and_assert(
                         t[:, 0],
diff --git a/tests/pytorch/attention/test_attention.py b/tests/pytorch/attention/test_attention.py
index 7dc6caeb81..63b877e68f 100644
--- a/tests/pytorch/attention/test_attention.py
+++ b/tests/pytorch/attention/test_attention.py
@@ -130,6 +130,11 @@ def test_dot_product_attention(
     if config.window_size == (-1, -1) and swa:
         config.window_size = [2, 2]
     config.window_size = check_set_window_size(config.attn_mask_type, config.window_size)
+    qkv_format = qkv_layout.replace("3", "").replace("2", "").split("_")[0]
+    if qkv_format == "thd" and "padding" not in config.attn_mask_type:
+        config.attn_mask_type = (
+            "padding_" + config.attn_mask_type if config.attn_mask_type != "no_mask" else "padding"
+        )
 
     # Get backends
     is_training = True
@@ -171,7 +176,7 @@ def test_dot_product_attention(
 
     # UnfusedDotProductAttention backend
     if unfused_attn_supported:
-        unfused_attn_fwd, unfused_attn_bwd = _run_dot_product_attention(
+        unfused_attn_fwd, unfused_max_logit, unfused_attn_bwd = _run_dot_product_attention(
             dtype,
             config,
             "UnfusedDotProductAttention",
@@ -185,7 +190,7 @@ def test_dot_product_attention(
     # FusedAttention backend
     if fused_attn_supported:
         if len(fused_attn_backends) == 1:
-            fused_attn_fwd, fused_attn_bwd = _run_dot_product_attention(
+            fused_attn_fwd, fused_max_logit, fused_attn_bwd = _run_dot_product_attention(
                 dtype,
                 config,
                 "FusedAttention",
@@ -197,7 +202,7 @@ def test_dot_product_attention(
             )
         if len(fused_attn_backends) == 2:
             os.environ["NVTE_FUSED_ATTN_BACKEND"] = "0"
-            fused_attn_fwd, fused_attn_bwd = _run_dot_product_attention(
+            fused_attn_fwd, _, fused_attn_bwd = _run_dot_product_attention(
                 dtype,
                 config,
                 "FusedAttention",
@@ -208,7 +213,7 @@ def test_dot_product_attention(
                 is_training,
             )
             os.environ["NVTE_FUSED_ATTN_BACKEND"] = "1"
-            fused_attn_fwd_1, fused_attn_bwd_1 = _run_dot_product_attention(
+            fused_attn_fwd_1, _, fused_attn_bwd_1 = _run_dot_product_attention(
                 dtype,
                 config,
                 "FusedAttention",
@@ -221,7 +226,7 @@ def test_dot_product_attention(
 
     # FlashAttention backend
     if flash_attn_supported:
-        flash_attn_fwd, flash_attn_bwd = _run_dot_product_attention(
+        flash_attn_fwd, _, flash_attn_bwd = _run_dot_product_attention(
             dtype,
             config,
             "FlashAttention",
@@ -242,6 +247,8 @@ def test_dot_product_attention(
     if unfused_attn_supported and fused_attn_supported:
         logging.info("[test_dot_product_attention]: unfused attn vs fused attn")
         torch.testing.assert_close(fused_attn_fwd, unfused_attn_fwd, **tols)
+        if config.return_max_logit:
+            torch.testing.assert_close(fused_max_logit, unfused_max_logit, **tols)
         for i, _ in enumerate(unfused_attn_bwd):
             torch.testing.assert_close(fused_attn_bwd[i], unfused_attn_bwd[i], **tols)
     if fused_attn_supported and flash_attn_supported:
@@ -265,6 +272,33 @@ def test_dpa_checkpoint(dtype, model_configs, model):
     test_dot_product_attention(dtype, model_configs, model, True, True, None, False, False)
 
 
+model_configs_max_logit = {
+    # test: ModelConfig(b, sq, hq, dqk)
+    "max_logit_1": ModelConfig(1, 2048, 24, 128, max_seqlen_kv=4096),
+    "max_logit_2": ModelConfig(2, 2048, 24, 128, attn_mask_type="causal"),
+    "max_logit_3": ModelConfig(2, 1, 16, 128, max_seqlen_kv=2048, attn_mask_type="padding_causal"),
+    "max_logit_4": ModelConfig(
+        8, 128, 16, 192, max_seqlen_kv=2048, attn_bias_type="post_scale_bias"
+    ),
+    "max_logit_5": ModelConfig(
+        8, 128, 16, 512, max_seqlen_kv=2048, attn_mask_type="causal", window_size=(20, 0)
+    ),
+    "max_logit_6": ModelConfig(8, 1, 16, 1024, max_seqlen_kv=2048),
+}
+
+
+@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
+@pytest.mark.parametrize("dtype", param_types)
+@pytest.mark.parametrize("model_configs", [model_configs_max_logit])
+@pytest.mark.parametrize("model", model_configs_max_logit.keys())
+@pytest.mark.parametrize("qkv_layout", ["sbhd_sbhd_sbhd", "thd_thd_thd"])
+def test_dpa_max_logit(dtype, model_configs, model, qkv_layout):
+    """Test DotProductAttention module with checkpointing"""
+    config = model_configs[model]
+    config.return_max_logit = True
+    test_dot_product_attention(dtype, model_configs, model, False, True, qkv_layout, False, False)
+
+
 model_configs_softmax = {
     # test: ModelConfig(b, sq, hq, dqk)
     "softmax_1_0": ModelConfig(2, 2048, 64, 64, num_gqa_groups=8),
@@ -961,6 +995,8 @@ def _run_dot_product_attention(
             layout = layout.replace("d", "dqk")
         tensor_shape = [dim_to_num[j] for j in layout.split("_")]
         tensor = 0.1 * torch.randn(tensor_shape, dtype=dtype, device="cuda")
+        # tensor: with padding tokens
+        # tensor_orig: without padding tokens
         tensor_orig = tensor
         if qkv_format == "thd" and pad_between_seqs:
             tensor_orig = torch.Tensor([]).to(device="cuda", dtype=dtype)
@@ -1070,6 +1106,7 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
         layer_number=1,
         attention_type=config.attn_type,
         softmax_type=config.softmax_type,
+        return_max_logit=config.return_max_logit,
     ).to(dtype=dtype, device="cuda")
     if not is_training:
         block = block.eval()
@@ -1107,16 +1144,21 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
         alibi_slopes=alibi_slopes,
         fast_zero_fill=True,
     )
+    max_logit = None
+    if config.return_max_logit:
+        out, max_logit = out
     if is_training:
         out.backward(d_out)
+
     d_softmax_offset = None
     if is_training and config.softmax_type != "vanilla":
         d_softmax_offset = block.softmax_offset.grad
+
     if backend in ["FlashAttention", "UnfusedDotProductAttention"]:
         if is_training:
-            return out, (q.grad, k.grad, v.grad, d_softmax_offset)
+            return out, max_logit, (q.grad, k.grad, v.grad, d_softmax_offset)
         else:
-            return out, (None, None, None, d_softmax_offset)
+            return out, max_logit, (None, None, None, d_softmax_offset)
     if backend == "FusedAttention":
         if qkv_format == "thd" and pad_between_seqs:
             out_orig = torch.Tensor([]).to(device="cuda", dtype=dtype)
@@ -1145,14 +1187,18 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
                         [v_grad_orig, v.grad[valid_range_kv[0] : valid_range_kv[1]]], dim=0
                     )
             if is_training:
-                return out_orig, (q_grad_orig, k_grad_orig, v_grad_orig, d_softmax_offset)
+                return (
+                    out_orig,
+                    max_logit,
+                    (q_grad_orig, k_grad_orig, v_grad_orig, d_softmax_offset),
+                )
             else:
-                return out_orig, (None, None, None, d_softmax_offset)
+                return out_orig, max_logit, (None, None, None, d_softmax_offset)
         else:
             if is_training:
-                return out, (q.grad, k.grad, v.grad, d_softmax_offset)
+                return out, max_logit, (q.grad, k.grad, v.grad, d_softmax_offset)
             else:
-                return out, (None, None, None, d_softmax_offset)
+                return out, max_logit, (None, None, None, d_softmax_offset)
 
 
 model_configs_te_layer = {
diff --git a/tests/pytorch/attention/test_attention_with_cp.py b/tests/pytorch/attention/test_attention_with_cp.py
index 2c7f9d8578..e5c856acd8 100644
--- a/tests/pytorch/attention/test_attention_with_cp.py
+++ b/tests/pytorch/attention/test_attention_with_cp.py
@@ -137,8 +137,8 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
 
 model_configs_fused_attn = {
     # test: ModelConfig(b, sq, hq, dqk)
-    "cp_1_0": ModelConfig(2, 4096, 12, 128, attn_mask_type="causal"),  # MHA
-    "cp_1_1": ModelConfig(2, 4096, 12, 128),  # MHA
+    "cp_1_0": ModelConfig(2, 4096, 12, 128, attn_mask_type="causal", return_max_logit=True),  # MHA
+    "cp_1_1": ModelConfig(2, 4096, 12, 128, return_max_logit=True),  # MHA
     "cp_1_2": ModelConfig(
         2, 4096, 12, 128, attn_mask_type="causal", attn_bias_type="post_scale_bias"
     ),  # MHA
@@ -183,7 +183,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
 qkv_formats = ["bshd", "sbhd", "thd"]
 cp_comm_types = ["p2p", "all_gather", "a2a", "a2a+p2p"]
 if test_essential:
-    configs = ["cp_1_0", "cp_2_0", "cp_2_2", "cp_3_2", "cp_4_2"]
+    configs = ["cp_1_0", "cp_1_1", "cp_2_0", "cp_2_2", "cp_3_2", "cp_4_2"]
     model_configs_fused_attn = {k: model_configs_fused_attn[k] for k in configs}
     dtypes = ["bf16", "fp8"]
     qkv_formats = ["sbhd", "thd"]
diff --git a/tests/pytorch/utils.py b/tests/pytorch/utils.py
index 72a1b3b534..485c739c03 100644
--- a/tests/pytorch/utils.py
+++ b/tests/pytorch/utils.py
@@ -205,6 +205,7 @@ def __init__(
         window_size: Tuple[int, int] = (-1, -1),
         context_parallel: bool = False,
         cp_comm_type: str = "p2p",
+        return_max_logit=False,
         total_requests: int = None,
         max_ctx_len: int = None,
         num_layers: int = 1,
@@ -233,6 +234,7 @@ def __init__(
         self.window_size = check_set_window_size(self.attn_mask_type, window_size)
         self.context_parallel = context_parallel
         self.cp_comm_type = cp_comm_type
+        self.return_max_logit = return_max_logit
         self.total_requests = total_requests
         self.max_ctx_len = max_ctx_len
         self.num_layers = num_layers
@@ -318,6 +320,7 @@ def test():
             is_training=is_training,
             inference_params=inference_params,
             softmax_type=config.softmax_type,
+            return_max_logit=config.return_max_logit,
         )
         (
             use_flash_attention,
diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp
index 77cd8d235a..f6ee37d4c5 100644
--- a/transformer_engine/common/fused_attn/fused_attn.cpp
+++ b/transformer_engine/common/fused_attn/fused_attn.cpp
@@ -138,7 +138,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
     NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type,
     float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
     size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left,
-    int64_t window_size_right) {
+    int64_t window_size_right, bool return_max_logit) {
   using namespace transformer_engine;
   NVTE_Fused_Attn_Backend backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend;
   const int device_id = cuda::current_device();
@@ -187,7 +187,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
       (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) &&
       !requires_64bit_ragged_offset && (softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX) &&
       // 9.10.0: known bugs with SDPA FP8
-      (cudnn_runtime_version != 91000)) {
+      (cudnn_runtime_version != 91000) && !return_max_logit) {
     if (cudnn_runtime_version >= 8900) {
       backend = NVTE_Fused_Attn_Backend::NVTE_FP8;
     } else {
@@ -216,7 +216,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
          (qkv_layout == NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD)) &&
         ((window_size_left == -1) && (window_size_right == -1 || window_size_right == 0)) &&
         !requires_64bit_ragged_offset &&
-        (softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX)) {
+        (softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX) && !return_max_logit) {
       flag_m512 = true;
     }
     if (
@@ -418,8 +418,8 @@ void nvte_fused_attn_fwd_qkvpacked(const NVTETensor QKV, const NVTETensor Bias,
                                    const NVTETensor SoftmaxOffset, NVTETensor S, NVTETensor O,
                                    NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens,
                                    const NVTETensor cu_seqlens_padded, const NVTETensor rng_state,
-                                   size_t max_seqlen, bool is_training, float attn_scale,
-                                   float dropout, NVTE_QKV_Layout qkv_layout,
+                                   size_t max_seqlen, bool is_training, bool return_max_logit,
+                                   float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout,
                                    NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
                                    NVTE_Softmax_Type softmax_type, int64_t window_size_left,
                                    int64_t window_size_right, NVTETensor workspace,
@@ -460,7 +460,7 @@ void nvte_fused_attn_fwd_qkvpacked(const NVTETensor QKV, const NVTETensor Bias,
 
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       is_training, QKV_type, QKV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout,
-      h, h, max_seqlen, max_seqlen, d, d, window_size_left, window_size_right);
+      h, h, max_seqlen, max_seqlen, d, d, window_size_left, window_size_right, return_max_logit);
 
   if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
 #if (CUDNN_VERSION >= 8901)
@@ -474,10 +474,10 @@ void nvte_fused_attn_fwd_qkvpacked(const NVTETensor QKV, const NVTETensor Bias,
   } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) {
 #if (CUDNN_VERSION >= 8900)
     fused_attn_arbitrary_seqlen_fwd_qkvpacked(
-        b, h, max_seqlen, d, t, is_training, attn_scale, dropout, qkv_layout, bias_type,
-        attn_mask_type, softmax_type, window_size_left, window_size_right, input_QKV, input_Bias,
-        input_SoftmaxOffset, output_O, Aux_CTX_Tensors, input_cu_seqlens, input_cu_seqlens_padded,
-        input_rng_state, wkspace, stream, handle);
+        b, h, max_seqlen, d, t, is_training, return_max_logit, attn_scale, dropout, qkv_layout,
+        bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, input_QKV,
+        input_Bias, input_SoftmaxOffset, output_O, Aux_CTX_Tensors, input_cu_seqlens,
+        input_cu_seqlens_padded, input_rng_state, wkspace, stream, handle);
 #else
     NVTE_ERROR(
         "cuDNN 8.9.0 is required for BF16/FP16 fused attention with arbitrary sequence length. \n");
@@ -544,7 +544,7 @@ void nvte_fused_attn_bwd_qkvpacked(const NVTETensor QKV, const NVTETensor O, con
 
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       true, QKV_type, QKV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h, h,
-      max_seqlen, max_seqlen, d, d, window_size_left, window_size_right);
+      max_seqlen, max_seqlen, d, d, window_size_left, window_size_right, false);
 
   if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
 #if (CUDNN_VERSION >= 8901)
@@ -602,7 +602,7 @@ void nvte_fused_attn_fwd_kvpacked(
     const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded,
     const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k,
     const NVTETensor page_table_v, const NVTETensor rng_state, size_t max_seqlen_q,
-    size_t max_seqlen_kv, bool is_training, float attn_scale, float dropout,
+    size_t max_seqlen_kv, bool is_training, bool return_max_logit, float attn_scale, float dropout,
     NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
     NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
     NVTETensor workspace, cudaStream_t stream) {
@@ -680,7 +680,8 @@ void nvte_fused_attn_fwd_kvpacked(
 
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       is_training, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout,
-      h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right);
+      h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right,
+      return_max_logit);
 
   if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
 #if (CUDNN_VERSION >= 8901)
@@ -695,12 +696,12 @@ void nvte_fused_attn_fwd_kvpacked(
 #if (CUDNN_VERSION >= 8903)
     fused_attn_arbitrary_seqlen_fwd_kvpacked(
         b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, t_q, t_kv, num_pages_k, num_pages_v,
-        page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, is_training, attn_scale,
-        dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left,
-        window_size_right, input_Q, input_KV, input_Bias, input_SoftmaxOffset, output_O,
-        Aux_CTX_Tensors, input_cu_seqlens_q, input_cu_seqlens_kv, input_cu_seqlens_q_padded,
-        input_cu_seqlens_kv_padded, input_page_table_k, input_page_table_v, input_rng_state,
-        wkspace, stream, handle);
+        page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, is_training,
+        return_max_logit, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type,
+        window_size_left, window_size_right, input_Q, input_KV, input_Bias, input_SoftmaxOffset,
+        output_O, Aux_CTX_Tensors, input_cu_seqlens_q, input_cu_seqlens_kv,
+        input_cu_seqlens_q_padded, input_cu_seqlens_kv_padded, input_page_table_k,
+        input_page_table_v, input_rng_state, wkspace, stream, handle);
 #else
     NVTE_ERROR(
         "cuDNN 8.9.3 is required for BF16/FP16 fused attention with arbitrary sequence length. \n");
@@ -777,7 +778,7 @@ void nvte_fused_attn_bwd_kvpacked(
 
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       true, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q,
-      h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right);
+      h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right, false);
 
   if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
 #if (CUDNN_VERSION >= 8901)
@@ -832,18 +833,16 @@ void nvte_fused_attn_bwd_kvpacked(
   }
 }
 // NVTE fused attention FWD with separate Q, K and V
-void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETensor V,
-                         const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S,
-                         NVTETensor O, NVTETensorPack *Aux_CTX_Tensors,
-                         const NVTETensor cu_seqlens_q, const NVTETensor cu_seqlens_kv,
-                         const NVTETensor cu_seqlens_q_padded,
-                         const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k,
-                         const NVTETensor page_table_v, const NVTETensor rng_state,
-                         size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training,
-                         float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout,
-                         NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
-                         NVTE_Softmax_Type softmax_type, int64_t window_size_left,
-                         int64_t window_size_right, NVTETensor workspace, cudaStream_t stream) {
+void nvte_fused_attn_fwd(
+    const NVTETensor Q, const NVTETensor K, const NVTETensor V, const NVTETensor Bias,
+    const NVTETensor SoftmaxOffset, NVTETensor S, NVTETensor O, NVTETensorPack *Aux_CTX_Tensors,
+    const NVTETensor cu_seqlens_q, const NVTETensor cu_seqlens_kv,
+    const NVTETensor cu_seqlens_q_padded, const NVTETensor cu_seqlens_kv_padded,
+    const NVTETensor page_table_k, const NVTETensor page_table_v, const NVTETensor rng_state,
+    size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training, bool return_max_logit,
+    float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
+    NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
+    int64_t window_size_right, NVTETensor workspace, cudaStream_t stream) {
   NVTE_API_CALL(nvte_flash_attn_fwd);
   using namespace transformer_engine;
   const Tensor *input_cu_seqlens_q = convertNVTETensorCheck(cu_seqlens_q);
@@ -913,7 +912,8 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso
 
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       is_training, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout,
-      h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right);
+      h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right,
+      return_max_logit);
 
   if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
 #if (CUDNN_VERSION >= 8901)
@@ -928,12 +928,12 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso
 #if (CUDNN_VERSION >= 8900)
     fused_attn_arbitrary_seqlen_fwd(
         b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, t_q, t_kv, num_pages_k, num_pages_v,
-        page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, is_training, attn_scale,
-        dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size_left,
-        window_size_right, input_Q, input_K, input_V, input_Bias, input_SoftmaxOffset, output_O,
-        Aux_CTX_Tensors, input_cu_seqlens_q, input_cu_seqlens_kv, input_cu_seqlens_q_padded,
-        input_cu_seqlens_kv_padded, input_page_table_k, input_page_table_v, input_rng_state,
-        wkspace, stream, handle);
+        page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, is_training,
+        return_max_logit, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type,
+        window_size_left, window_size_right, input_Q, input_K, input_V, input_Bias,
+        input_SoftmaxOffset, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, input_cu_seqlens_kv,
+        input_cu_seqlens_q_padded, input_cu_seqlens_kv_padded, input_page_table_k,
+        input_page_table_v, input_rng_state, wkspace, stream, handle);
 #else
     NVTE_ERROR(
         "cuDNN 8.9.0 is required for BF16/FP16 fused attention with arbitrary sequence length. \n");
@@ -1008,7 +1008,7 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso
 
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       true, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q,
-      h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right);
+      h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, false);
 
   if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
 #if (CUDNN_VERSION >= 8901)
diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu
index ba0f845789..950ced61bb 100644
--- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu
+++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu
@@ -53,10 +53,10 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
     int64_t max_b, int64_t max_t_q, int64_t max_t_kv, int64_t num_pages_k, int64_t num_pages_v,
     int64_t page_size_k, int64_t page_size_v, int64_t max_pages_per_seq_k,
     int64_t max_pages_per_seq_v, int64_t bias_b, int64_t bias_h, bool is_training,
-    float scaling_factor, float dropout_probability, NVTE_QKV_Layout layout,
+    bool return_max_logit, float scaling_factor, float dropout_probability, NVTE_QKV_Layout layout,
     NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
     int64_t window_size_left, int64_t window_size_right, void *devPtrQ, void *devPtrK,
-    void *devPtrV, void *devPtrBias, void *devPtrSoftmaxOffset, void *devPtrSoftmaxStats,
+    void *devPtrV, void *devPtrBias, void *devPtrSoftmaxOffset, void *devPtrS1, void *devPtrS2,
     void *devPtrO, void *devPtrDropoutSeed, void *devPtrDropoutOffset, void *devPtrCuSeqlensQ,
     void *devPtrCuSeqlensKV, void *devPtrPageTableK, void *devPtrPageTableV,
     void *devPtrSeqOffsetsQ, void *devPtrSeqOffsetsKV, cudnn_frontend::DataType_t tensorType,
@@ -102,36 +102,40 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
   }
 
   const DType ragged_offset_type = cudnn_runtime_version >= 90500 ? DType::kInt64 : DType::kInt32;
+  bool generate_stats = !return_max_logit;
   try {
-    FADescriptor_v1 descriptor{b,
-                               h,
-                               hg,
-                               s_q,
-                               s_kv,
-                               d_qk,
-                               d_v,
-                               num_pages_k,
-                               num_pages_v,
-                               page_size_k,
-                               page_size_v,
-                               max_pages_per_seq_k,
-                               max_pages_per_seq_v,
-                               bias_b,
-                               bias_h,
-                               scaling_factor,
-                               is_training,
-                               dropout_probability,
-                               layout,
-                               bias_type,
-                               mask_type,
-                               softmax_type,
-                               window_size_left,
-                               window_size_right,
-                               true,
-                               tensorType,
-                               cudnn_frontend::DataType_t::NOT_SET,
-                               cudnn_frontend::DataType_t::NOT_SET,
-                               cudnn_frontend::DataType_t::NOT_SET};
+    FADescriptor_v1 descriptor{
+        b,
+        h,
+        hg,
+        s_q,
+        s_kv,
+        d_qk,
+        d_v,
+        num_pages_k,
+        num_pages_v,
+        page_size_k,
+        page_size_v,
+        max_pages_per_seq_k,
+        max_pages_per_seq_v,
+        bias_b,
+        bias_h,
+        scaling_factor,
+        is_training,
+        dropout_probability,
+        layout,
+        bias_type,
+        mask_type,
+        softmax_type,
+        window_size_left,
+        window_size_right,
+        true,
+        tensorType,
+        cudnn_frontend::DataType_t::NOT_SET,
+        cudnn_frontend::DataType_t::NOT_SET,
+        cudnn_frontend::DataType_t::NOT_SET,
+        return_max_logit,
+    };
 
     namespace fe = cudnn_frontend;
     using graph_and_tensors =
@@ -141,7 +145,8 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
                    std::shared_ptr<fe::graph::Tensor_attributes>,   // V
                    std::shared_ptr<fe::graph::Tensor_attributes>,   // attn_scale
                    std::shared_ptr<fe::graph::Tensor_attributes>,   // O
-                   std::shared_ptr<fe::graph::Tensor_attributes>,   // Stats
+                   std::shared_ptr<fe::graph::Tensor_attributes>,   // S1
+                   std::shared_ptr<fe::graph::Tensor_attributes>,   // S2
                    std::shared_ptr<fe::graph::Tensor_attributes>,   // bias
                    std::shared_ptr<fe::graph::Tensor_attributes>,   // softmax_offset
                    std::shared_ptr<fe::graph::Tensor_attributes>,   // seq_q
@@ -244,6 +249,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
       sdpa_options = fe::graph::SDPA_attributes()
                          .set_name("flash_attention")
                          .set_is_inference(false)
+                         .set_generate_stats(generate_stats)
                          .set_causal_mask(is_causal)
                          .set_causal_mask_bottom_right(is_bottom_right)
                          .set_attn_scale(attn_scale);
@@ -317,7 +323,36 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
         sdpa_options.set_sink_token(softmax_offset);
       }
 
-      auto [O, Stats] = mha_graph->sdpa(Q, K, V, sdpa_options);
+      std::shared_ptr<fe::graph::Tensor_attributes> Max, Sum_Exp;
+      if (is_ragged_q && cudnn_runtime_version >= 90600) {
+        offset_stats =
+            mha_graph->tensor(fe::graph::Tensor_attributes()
+                                  .set_name("offset_stats")
+                                  .set_dim({b + 1, 1, 1, 1})
+                                  .set_stride({1, 1, 1, 1})
+                                  .set_data_type(get_cudnn_fe_dtype(ragged_offset_type)));
+      }
+      if (return_max_logit) {
+        Max = mha_graph->tensor(fe::graph::Tensor_attributes()
+                                    .set_name("Max")
+                                    .set_dim({b, h, s_q, 1})
+                                    .set_data_type(fe::DataType_t::FLOAT));
+        Sum_Exp = mha_graph->tensor(fe::graph::Tensor_attributes()
+                                        .set_name("Sum_Exp")
+                                        .set_dim({b, h, s_q, 1})
+                                        .set_data_type(fe::DataType_t::FLOAT));
+        if (is_ragged_q && cudnn_runtime_version >= 90600) {
+          Max->set_stride({h * s_q, 1, h, 1}).set_ragged_offset(offset_stats);
+          Sum_Exp->set_stride({h * s_q, 1, h, 1}).set_ragged_offset(offset_stats);
+        } else {
+          Max->set_stride({h * s_q, s_q, 1, 1});
+          Sum_Exp->set_stride({h * s_q, s_q, 1, 1});
+        }
+        sdpa_options.set_logit_max(Max);
+        sdpa_options.set_score_sum_exp(Sum_Exp);
+      }
+
+      auto [O, Stats] = mha_graph->sdpa(Q, K, V, std::move(sdpa_options));
 
       std::vector<int64_t> o_stride(4);
       generateMatrixStrides(b, h, s_q, s_kv, d_v, o_stride.data(), layout,
@@ -332,17 +367,13 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
         O->set_ragged_offset(offset_o);
       }
 
-      Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT).set_dim({b, h, s_q, 1});
-      if (is_ragged_q && cudnn_runtime_version >= 90600) {
-        offset_stats =
-            mha_graph->tensor(fe::graph::Tensor_attributes()
-                                  .set_name("offset_stats")
-                                  .set_dim({b + 1, 1, 1, 1})
-                                  .set_stride({1, 1, 1, 1})
-                                  .set_data_type(get_cudnn_fe_dtype(ragged_offset_type)));
-        Stats->set_stride({h * s_q, 1, h, 1}).set_ragged_offset(offset_stats);
-      } else {
-        Stats->set_stride({h * s_q, s_q, 1, 1});
+      if (!return_max_logit) {
+        Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT).set_dim({b, h, s_q, 1});
+        if (is_ragged_q && cudnn_runtime_version >= 90600) {
+          Stats->set_stride({h * s_q, 1, h, 1}).set_ragged_offset(offset_stats);
+        } else {
+          Stats->set_stride({h * s_q, s_q, 1, 1});
+        }
       }
 
       std::tuple<std::shared_ptr<fe::graph::Tensor_attributes>,  // Q
@@ -351,7 +382,8 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
                  std::shared_ptr<fe::graph::Tensor_attributes>,  // attn_scale
                  std::shared_ptr<fe::graph::Tensor_attributes>>  // O
           key_tensors_tuple = std::make_tuple(Q, K, V, attn_scale, O);
-      auto Stats_tuple = std::make_tuple(Stats);
+      auto Stats_tuple =
+          generate_stats ? std::make_tuple(Stats, nullptr) : std::make_tuple(Max, Sum_Exp);
       auto bias_tuple = is_bias ? std::make_tuple(bias) : std::make_tuple(nullptr);
       auto softmax_offset_tuple =
           is_softmax_offset ? std::make_tuple(softmax_offset) : std::make_tuple(nullptr);
@@ -384,7 +416,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
       return return_tuple;
     };
 
-    auto [mha_graph, Q, K, V, attn_scale, O, Stats, bias, softmax_offset, seq_q, seq_kv,
+    auto [mha_graph, Q, K, V, attn_scale, O, S1, S2, bias, softmax_offset, seq_q, seq_kv,
           page_table_k, page_table_v, offset_q, offset_o, offset_k, offset_v, offset_stats,
           dropout_seed, dropout_offset] = get_graph(sdpa_f16_fprop_cache, descriptor);
 
@@ -417,9 +449,12 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
 
     // Build variant pack
     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void *> variant_pack = {
-        {Q, devPtrQ}, {K, devPtrK},
-        {V, devPtrV}, {attn_scale, &scaling_factor},
-        {O, devPtrO}, {Stats, devPtrSoftmaxStats}};
+        {Q, devPtrQ}, {K, devPtrK},  {V, devPtrV}, {attn_scale, &scaling_factor},
+        {O, devPtrO}, {S1, devPtrS1}};
+
+    if (return_max_logit) {
+      variant_pack[S2] = devPtrS2;
+    }
 
     if (is_bias) {
       variant_pack[bias] = devPtrBias;
@@ -561,35 +596,38 @@ void fused_attn_arbitrary_seqlen_bwd_impl(
   const DType ragged_offset_type = cudnn_runtime_version >= 90500 ? DType::kInt64 : DType::kInt32;
 
   try {
-    FADescriptor_v1 descriptor{b,
-                               h,
-                               hg,
-                               s_q,
-                               s_kv,
-                               d_qk,
-                               d_v,
-                               0,
-                               0,
-                               0,
-                               0,
-                               0,
-                               0,
-                               bias_b,
-                               bias_h,
-                               scaling_factor,
-                               true,
-                               dropout_probability,
-                               layout,
-                               bias_type,
-                               mask_type,
-                               softmax_type,
-                               window_size_left,
-                               window_size_right,
-                               deterministic,
-                               tensorType,
-                               cudnn_frontend::DataType_t::NOT_SET,
-                               cudnn_frontend::DataType_t::NOT_SET,
-                               cudnn_frontend::DataType_t::NOT_SET};
+    FADescriptor_v1 descriptor{
+        b,
+        h,
+        hg,
+        s_q,
+        s_kv,
+        d_qk,
+        d_v,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        bias_b,
+        bias_h,
+        scaling_factor,
+        true,
+        dropout_probability,
+        layout,
+        bias_type,
+        mask_type,
+        softmax_type,
+        window_size_left,
+        window_size_right,
+        deterministic,
+        tensorType,
+        cudnn_frontend::DataType_t::NOT_SET,
+        cudnn_frontend::DataType_t::NOT_SET,
+        cudnn_frontend::DataType_t::NOT_SET,
+        false,
+    };
 
     namespace fe = cudnn_frontend;
     using graph_and_tensors =
@@ -1001,12 +1039,13 @@ void fused_attn_arbitrary_seqlen_bwd_impl(
 using namespace transformer_engine::fused_attn;
 void fused_attn_arbitrary_seqlen_fwd_qkvpacked(
     size_t batch, size_t num_attn_heads, size_t max_seqlen, size_t head_dim, size_t num_tokens,
-    bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout,
-    NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
-    int64_t window_size_left, int64_t window_size_right, const Tensor *input_QKV,
-    const Tensor *input_Bias, const Tensor *input_SoftmaxOffset, Tensor *output_O,
-    NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens, const Tensor *cu_seqlens_padded,
-    const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) {
+    bool is_training, bool return_max_logit, float attn_scale, float p_dropout,
+    NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
+    NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
+    const Tensor *input_QKV, const Tensor *input_Bias, const Tensor *input_SoftmaxOffset,
+    Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens,
+    const Tensor *cu_seqlens_padded, const Tensor *rng_state, Tensor *workspace,
+    cudaStream_t stream, cudnnHandle_t handle) {
   using namespace transformer_engine;
 
   const auto QKV_type = input_QKV->data.dtype;
@@ -1037,7 +1076,8 @@ void fused_attn_arbitrary_seqlen_fwd_qkvpacked(
   }
 
   void *devPtrO = output_O->data.dptr;
-  void *devPtrS = nullptr;
+  void *devPtrS1 = nullptr;
+  void *devPtrS2 = nullptr;
   void *devPtrCuSeqlens = cu_seqlens->data.dptr;
   void *devPtrSeqOffsets = cu_seqlens_padded->data.dptr;
 
@@ -1051,14 +1091,34 @@ void fused_attn_arbitrary_seqlen_fwd_qkvpacked(
   size_t i = 0;
   if (Aux_CTX_Tensors->size == 0) {
     const auto cudnn_runtime_version = cudnnGetVersion();
-    Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
-    output_S->data.dptr = nullptr;
-    if (qkv_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
-      output_S->data.shape = {max_tokens, num_attn_heads, 1};
+    if (return_max_logit) {
+      Tensor *output_Max = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_Max->data.dptr = nullptr;
+      if (qkv_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_Max->data.shape = {max_tokens, num_attn_heads, 1};
+      } else {
+        output_Max->data.shape = {batch, num_attn_heads, max_seqlen, 1};
+      }
+      output_Max->data.dtype = DType::kFloat32;
+      Tensor *output_Sum_Exp = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_Sum_Exp->data.dptr = nullptr;
+      if (qkv_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_Sum_Exp->data.shape = {max_tokens, num_attn_heads, 1};
+      } else {
+        output_Sum_Exp->data.shape = {batch, num_attn_heads, max_seqlen, 1};
+      }
+      output_Sum_Exp->data.dtype = DType::kFloat32;
     } else {
-      output_S->data.shape = {batch, num_attn_heads, max_seqlen, 1};
+      Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_S->data.dptr = nullptr;
+      if (qkv_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_S->data.shape = {max_tokens, num_attn_heads, 1};
+      } else {
+        output_S->data.shape = {batch, num_attn_heads, max_seqlen, 1};
+      }
+      output_S->data.dtype = DType::kFloat32;
     }
-    output_S->data.dtype = DType::kFloat32;
+
     Tensor *output_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
     output_rng_state->data.dptr = nullptr;
     output_rng_state->data.shape = {2};
@@ -1080,8 +1140,15 @@ void fused_attn_arbitrary_seqlen_fwd_qkvpacked(
 
     Aux_CTX_Tensors->size = i;
   } else if (Aux_CTX_Tensors->size >= 2) {
-    Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
-    devPtrS = output_S->data.dptr;
+    if (return_max_logit) {
+      Tensor *output_Max = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS1 = output_Max->data.dptr;
+      Tensor *output_Sum_Exp = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS2 = output_Sum_Exp->data.dptr;
+    } else {
+      Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS1 = output_S->data.dptr;
+    }
     Tensor *output_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
     output_rng_state->data.dptr = rng_state->data.dptr;
     if ((bias_type != NVTE_NO_BIAS) && (bias_type != NVTE_ALIBI)) {
@@ -1105,11 +1172,11 @@ void fused_attn_arbitrary_seqlen_fwd_qkvpacked(
   fused_attn_arbitrary_seqlen_fwd_impl(
       batch, num_attn_heads, num_attn_heads, max_seqlen, max_seqlen, head_dim, head_dim,
       max_batch_size, max_tokens, max_tokens, 0, 0, 0, 0, 0, 0, bias_b, bias_h, is_training,
-      attn_scale, p_dropout, qkv_layout, bias_type, mask_type, softmax_type, window_size_left,
-      window_size_right, devPtrQ, devPtrK, devPtrV, devPtrBias, devPtrSoftmaxOffset, devPtrS,
-      devPtrO, devPtrDropoutSeed, devPtrDropoutOffset, devPtrCuSeqlens, devPtrCuSeqlens, nullptr,
-      nullptr, devPtrSeqOffsets, devPtrSeqOffsets, get_cudnn_fe_dtype(QKV_type),
-      workspace->data.dptr, &workspace_size, stream, handle);
+      return_max_logit, attn_scale, p_dropout, qkv_layout, bias_type, mask_type, softmax_type,
+      window_size_left, window_size_right, devPtrQ, devPtrK, devPtrV, devPtrBias,
+      devPtrSoftmaxOffset, devPtrS1, devPtrS2, devPtrO, devPtrDropoutSeed, devPtrDropoutOffset,
+      devPtrCuSeqlens, devPtrCuSeqlens, nullptr, nullptr, devPtrSeqOffsets, devPtrSeqOffsets,
+      get_cudnn_fe_dtype(QKV_type), workspace->data.dptr, &workspace_size, stream, handle);
 
   if (workspace_size > 0) {
     if (workspace->data.dptr == nullptr) {
@@ -1221,14 +1288,15 @@ void fused_attn_arbitrary_seqlen_fwd_kvpacked(
     size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
     size_t max_seqlen_kv, size_t head_dim, size_t num_tokens_q, size_t num_tokens_kv,
     size_t num_pages_k, size_t num_pages_v, size_t page_size_k, size_t page_size_v,
-    size_t max_pages_per_seq_k, size_t max_pages_per_seq_v, bool is_training, float attn_scale,
-    float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
-    NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
-    const Tensor *input_Q, const Tensor *input_KV, const Tensor *input_Bias,
-    const Tensor *input_SoftmaxOffset, Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors,
-    const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *cu_seqlens_q_padded,
-    const Tensor *cu_seqlens_kv_padded, const Tensor *page_table_k, const Tensor *page_table_v,
-    const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) {
+    size_t max_pages_per_seq_k, size_t max_pages_per_seq_v, bool is_training, bool return_max_logit,
+    float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
+    NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
+    int64_t window_size_right, const Tensor *input_Q, const Tensor *input_KV,
+    const Tensor *input_Bias, const Tensor *input_SoftmaxOffset, Tensor *output_O,
+    NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv,
+    const Tensor *cu_seqlens_q_padded, const Tensor *cu_seqlens_kv_padded,
+    const Tensor *page_table_k, const Tensor *page_table_v, const Tensor *rng_state,
+    Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) {
   using namespace transformer_engine;
 
   const auto QKV_type = input_Q->data.dtype;
@@ -1260,7 +1328,8 @@ void fused_attn_arbitrary_seqlen_fwd_kvpacked(
   }
 
   void *devPtrO = output_O->data.dptr;
-  void *devPtrS = nullptr;
+  void *devPtrS1 = nullptr;
+  void *devPtrS2 = nullptr;
 
   void *devPtrCuSeqlensQ = cu_seqlens_q->data.dptr;
   void *devPtrCuSeqlensKV = cu_seqlens_kv->data.dptr;
@@ -1285,14 +1354,34 @@ void fused_attn_arbitrary_seqlen_fwd_kvpacked(
   size_t i = 0;
   if (Aux_CTX_Tensors->size == 0) {
     const auto cudnn_runtime_version = cudnnGetVersion();
-    Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
-    output_S->data.dptr = nullptr;
-    if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
-      output_S->data.shape = {max_tokens_q, num_attn_heads, 1};
+    if (return_max_logit) {
+      Tensor *output_Max = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_Max->data.dptr = nullptr;
+      if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_Max->data.shape = {max_tokens_q, num_attn_heads, 1};
+      } else {
+        output_Max->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      }
+      output_Max->data.dtype = DType::kFloat32;
+      Tensor *output_Sum_Exp = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_Sum_Exp->data.dptr = nullptr;
+      if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_Sum_Exp->data.shape = {max_tokens_q, num_attn_heads, 1};
+      } else {
+        output_Sum_Exp->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      }
+      output_Sum_Exp->data.dtype = DType::kFloat32;
     } else {
-      output_S->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_S->data.dptr = nullptr;
+      if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_S->data.shape = {max_tokens_q, num_attn_heads, 1};
+      } else {
+        output_S->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      }
+      output_S->data.dtype = DType::kFloat32;
     }
-    output_S->data.dtype = DType::kFloat32;
+
     Tensor *output_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
     output_rng_state->data.dptr = nullptr;
     output_rng_state->data.shape = {2};
@@ -1314,8 +1403,15 @@ void fused_attn_arbitrary_seqlen_fwd_kvpacked(
 
     Aux_CTX_Tensors->size = i;
   } else if (Aux_CTX_Tensors->size >= 2) {
-    Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
-    devPtrS = output_S->data.dptr;
+    if (return_max_logit) {
+      Tensor *output_Max = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS1 = output_Max->data.dptr;
+      Tensor *output_Sum_Exp = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS2 = output_Sum_Exp->data.dptr;
+    } else {
+      Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS1 = output_S->data.dptr;
+    }
     Tensor *output_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
     output_rng_state->data.dptr = rng_state->data.dptr;
     if ((bias_type != NVTE_NO_BIAS) && (bias_type != NVTE_ALIBI)) {
@@ -1340,11 +1436,12 @@ void fused_attn_arbitrary_seqlen_fwd_kvpacked(
       batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim, head_dim,
       max_batch_size, max_tokens_q, max_tokens_kv, num_pages_k, num_pages_v, page_size_k,
       page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, bias_b, bias_h, is_training,
-      attn_scale, p_dropout, qkv_layout, bias_type, mask_type, softmax_type, window_size_left,
-      window_size_right, devPtrQ, devPtrK, devPtrV, devPtrBias, devPtrSoftmaxOffset, devPtrS,
-      devPtrO, devPtrDropoutSeed, devPtrDropoutOffset, devPtrCuSeqlensQ, devPtrCuSeqlensKV,
-      devPtrPageTableK, devPtrPageTableV, devPtrSeqOffsetsQ, devPtrSeqOffsetsKV,
-      get_cudnn_fe_dtype(QKV_type), workspace->data.dptr, &workspace_size, stream, handle);
+      return_max_logit, attn_scale, p_dropout, qkv_layout, bias_type, mask_type, softmax_type,
+      window_size_left, window_size_right, devPtrQ, devPtrK, devPtrV, devPtrBias,
+      devPtrSoftmaxOffset, devPtrS1, devPtrS2, devPtrO, devPtrDropoutSeed, devPtrDropoutOffset,
+      devPtrCuSeqlensQ, devPtrCuSeqlensKV, devPtrPageTableK, devPtrPageTableV, devPtrSeqOffsetsQ,
+      devPtrSeqOffsetsKV, get_cudnn_fe_dtype(QKV_type), workspace->data.dptr, &workspace_size,
+      stream, handle);
 
   if (workspace_size > 0) {
     if (workspace->data.dptr == nullptr) {
@@ -1471,14 +1568,14 @@ void fused_attn_arbitrary_seqlen_fwd(
     size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, size_t num_tokens_q,
     size_t num_tokens_kv, size_t num_pages_k, size_t num_pages_v, size_t page_size_k,
     size_t page_size_v, size_t max_pages_per_seq_k, size_t max_pages_per_seq_v, bool is_training,
-    float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
-    NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
-    int64_t window_size_right, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V,
-    const Tensor *input_Bias, const Tensor *input_SoftmaxOffset, Tensor *output_O,
-    NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv,
-    const Tensor *cu_seqlens_q_padded, const Tensor *cu_seqlens_kv_padded,
-    const Tensor *page_table_k, const Tensor *page_table_v, const Tensor *rng_state,
-    Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) {
+    bool return_max_logit, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout,
+    NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
+    int64_t window_size_left, int64_t window_size_right, const Tensor *input_Q,
+    const Tensor *input_K, const Tensor *input_V, const Tensor *input_Bias,
+    const Tensor *input_SoftmaxOffset, Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors,
+    const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *cu_seqlens_q_padded,
+    const Tensor *cu_seqlens_kv_padded, const Tensor *page_table_k, const Tensor *page_table_v,
+    const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle) {
   using namespace transformer_engine;
 
   const auto QKV_type = input_Q->data.dtype;
@@ -1488,7 +1585,8 @@ void fused_attn_arbitrary_seqlen_fwd(
   void *devPtrK = input_K->data.dptr;
   void *devPtrV = input_V->data.dptr;
   void *devPtrO = output_O->data.dptr;
-  void *devPtrS = nullptr;
+  void *devPtrS1 = nullptr;
+  void *devPtrS2 = nullptr;
   void *devPtrBias = nullptr;
   size_t bias_b = 0;
   size_t bias_h = 0;
@@ -1525,14 +1623,34 @@ void fused_attn_arbitrary_seqlen_fwd(
   size_t i = 0;
   if (Aux_CTX_Tensors->size == 0) {
     const auto cudnn_runtime_version = cudnnGetVersion();
-    Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
-    output_S->data.dptr = nullptr;
-    if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
-      output_S->data.shape = {max_tokens_q, num_attn_heads, 1};
+    if (return_max_logit) {
+      Tensor *output_Max = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_Max->data.dptr = nullptr;
+      if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_Max->data.shape = {max_tokens_q, num_attn_heads, 1};
+      } else {
+        output_Max->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      }
+      output_Max->data.dtype = DType::kFloat32;
+      Tensor *output_Sum_Exp = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_Sum_Exp->data.dptr = nullptr;
+      if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_Sum_Exp->data.shape = {max_tokens_q, num_attn_heads, 1};
+      } else {
+        output_Sum_Exp->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      }
+      output_Sum_Exp->data.dtype = DType::kFloat32;
     } else {
-      output_S->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      output_S->data.dptr = nullptr;
+      if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+        output_S->data.shape = {max_tokens_q, num_attn_heads, 1};
+      } else {
+        output_S->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
+      }
+      output_S->data.dtype = DType::kFloat32;
     }
-    output_S->data.dtype = DType::kFloat32;
+
     Tensor *output_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
     output_rng_state->data.dptr = nullptr;
     output_rng_state->data.shape = {2};
@@ -1554,8 +1672,15 @@ void fused_attn_arbitrary_seqlen_fwd(
 
     Aux_CTX_Tensors->size = i;
   } else if (Aux_CTX_Tensors->size >= 2) {
-    Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
-    devPtrS = output_S->data.dptr;
+    if (return_max_logit) {
+      Tensor *output_Max = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS1 = output_Max->data.dptr;
+      Tensor *output_Sum_Exp = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS2 = output_Sum_Exp->data.dptr;
+    } else {
+      Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
+      devPtrS1 = output_S->data.dptr;
+    }
     Tensor *output_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
     output_rng_state->data.dptr = rng_state->data.dptr;
     if ((bias_type != NVTE_NO_BIAS) && (bias_type != NVTE_ALIBI)) {
@@ -1580,11 +1705,12 @@ void fused_attn_arbitrary_seqlen_fwd(
       batch, num_attn_heads, num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v,
       max_batch_size, max_tokens_q, max_tokens_kv, num_pages_k, num_pages_v, page_size_k,
       page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, bias_b, bias_h, is_training,
-      attn_scale, p_dropout, qkv_layout, bias_type, mask_type, softmax_type, window_size_left,
-      window_size_right, devPtrQ, devPtrK, devPtrV, devPtrBias, devPtrSoftmaxOffset, devPtrS,
-      devPtrO, devPtrDropoutSeed, devPtrDropoutOffset, devPtrCuSeqlensQ, devPtrCuSeqlensKV,
-      devPtrPageTableK, devPtrPageTableV, devPtrSeqOffsetsQ, devPtrSeqOffsetsKV,
-      get_cudnn_fe_dtype(QKV_type), workspace->data.dptr, &workspace_size, stream, handle);
+      return_max_logit, attn_scale, p_dropout, qkv_layout, bias_type, mask_type, softmax_type,
+      window_size_left, window_size_right, devPtrQ, devPtrK, devPtrV, devPtrBias,
+      devPtrSoftmaxOffset, devPtrS1, devPtrS2, devPtrO, devPtrDropoutSeed, devPtrDropoutOffset,
+      devPtrCuSeqlensQ, devPtrCuSeqlensKV, devPtrPageTableK, devPtrPageTableV, devPtrSeqOffsetsQ,
+      devPtrSeqOffsetsKV, get_cudnn_fe_dtype(QKV_type), workspace->data.dptr, &workspace_size,
+      stream, handle);
 
   if (workspace_size > 0) {
     if (workspace->data.dptr == nullptr) {
diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h
index b9658b0530..a3181c6295 100644
--- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h
+++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h
@@ -20,12 +20,13 @@ namespace transformer_engine {
 #if (CUDNN_VERSION >= 8900)
 void fused_attn_arbitrary_seqlen_fwd_qkvpacked(
     size_t batch, size_t num_attn_heads, size_t max_seqlen, size_t head_dim, size_t num_tokens,
-    bool is_training, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout,
-    NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
-    int64_t window_size_left, int64_t window_size_right, const Tensor *input_QKV,
-    const Tensor *input_Bias, const Tensor *input_SoftmaxOffset, Tensor *output_O,
-    NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens, const Tensor *cu_seqlens_padded,
-    const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle);
+    bool is_training, bool return_max_logit, float attn_scale, float p_dropout,
+    NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
+    NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
+    const Tensor *input_QKV, const Tensor *input_Bias, const Tensor *input_SoftmaxOffset,
+    Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens,
+    const Tensor *cu_seqlens_padded, const Tensor *rng_state, Tensor *workspace,
+    cudaStream_t stream, cudnnHandle_t handle);
 
 void fused_attn_arbitrary_seqlen_bwd_qkvpacked(
     size_t batch, size_t num_attn_heads, size_t max_seqlen, size_t head_dim, size_t num_tokens,
@@ -41,14 +42,15 @@ void fused_attn_arbitrary_seqlen_fwd_kvpacked(
     size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
     size_t max_seqlen_kv, size_t head_dim, size_t num_tokens_q, size_t num_tokens_kv,
     size_t num_pages_k, size_t num_pages_v, size_t page_size_k, size_t page_size_v,
-    size_t max_pages_per_seq_k, size_t max_pages_per_seq_v, bool is_training, float attn_scale,
-    float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type,
-    NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
-    const Tensor *input_Q, const Tensor *input_KV, const Tensor *input_Bias,
-    const Tensor *input_SoftmaxOffset, Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors,
-    const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *cu_seqlens_q_padded,
-    const Tensor *cu_seqlens_kv_padded, const Tensor *page_table_k, const Tensor *page_table_v,
-    const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle);
+    size_t max_pages_per_seq_k, size_t max_pages_per_seq_v, bool is_training, bool return_max_logit,
+    float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
+    NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
+    int64_t window_size_right, const Tensor *input_Q, const Tensor *input_KV,
+    const Tensor *input_Bias, const Tensor *input_SoftmaxOffset, Tensor *output_O,
+    NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv,
+    const Tensor *cu_seqlens_q_padded, const Tensor *cu_seqlens_kv_padded,
+    const Tensor *page_table_k, const Tensor *page_table_v, const Tensor *rng_state,
+    Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle);
 
 void fused_attn_arbitrary_seqlen_bwd_kvpacked(
     size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
@@ -68,14 +70,14 @@ void fused_attn_arbitrary_seqlen_fwd(
     size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, size_t num_tokens_q,
     size_t num_tokens_kv, size_t num_pages_k, size_t num_pages_v, size_t page_size_k,
     size_t page_size_v, size_t max_pages_per_seq_k, size_t max_pages_per_seq_v, bool is_training,
-    float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
-    NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
-    int64_t window_size_right, const Tensor *input_Q, const Tensor *input_K, const Tensor *input_V,
-    const Tensor *input_Bias, const Tensor *input_SoftmaxOffset, Tensor *output_O,
-    NVTETensorPack *Aux_CTX_Tensors, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv,
-    const Tensor *cu_seqlens_q_padded, const Tensor *cu_seqlens_kv_padded,
-    const Tensor *page_table_k, const Tensor *page_table_v, const Tensor *rng_state,
-    Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle);
+    bool return_max_logit, float attn_scale, float p_dropout, NVTE_QKV_Layout qkv_layout,
+    NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
+    int64_t window_size_left, int64_t window_size_right, const Tensor *input_Q,
+    const Tensor *input_K, const Tensor *input_V, const Tensor *input_Bias,
+    const Tensor *input_SoftmaxOffset, Tensor *output_O, NVTETensorPack *Aux_CTX_Tensors,
+    const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *cu_seqlens_q_padded,
+    const Tensor *cu_seqlens_kv_padded, const Tensor *page_table_k, const Tensor *page_table_v,
+    const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle);
 
 void fused_attn_arbitrary_seqlen_bwd(
     size_t batch, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu
index 21c544491a..7b85be972c 100644
--- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu
+++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu
@@ -1710,7 +1710,8 @@ void fused_attn_fp8_fwd_impl_v1(
                                qkv_tensor_type,
                                o_tensor_type,
                                cudnn_frontend::DataType_t::NOT_SET,
-                               cudnn_frontend::DataType_t::NOT_SET};
+                               cudnn_frontend::DataType_t::NOT_SET,
+                               false};
 
     namespace fe = cudnn_frontend;
     using graph_and_tensors =
@@ -2038,7 +2039,8 @@ void fused_attn_fp8_bwd_impl_v1(
                                qkv_tensor_type,
                                o_tensor_type,
                                do_tensor_type,
-                               dqkv_tensor_type};
+                               dqkv_tensor_type,
+                               false};
 
     namespace fe = cudnn_frontend;
     using graph_and_tensors =
diff --git a/transformer_engine/common/fused_attn/utils.h b/transformer_engine/common/fused_attn/utils.h
index f03774f8ed..72047a73f2 100644
--- a/transformer_engine/common/fused_attn/utils.h
+++ b/transformer_engine/common/fused_attn/utils.h
@@ -115,20 +115,21 @@ struct FADescriptor_v1 {
   cudnn_frontend::DataType_t o_tensor_type;
   cudnn_frontend::DataType_t do_tensor_type;
   cudnn_frontend::DataType_t dqkv_tensor_type;
+  bool generate_max_sum_exp;
 
   bool operator<(const FADescriptor_v1 &rhs) const {
     return std::tie(b, h, hg, s_q, s_kv, d_qk, d_v, num_pages_k, num_pages_v, page_size_k,
                     page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, bias_b, bias_h,
                     attnScale, isTraining, dropoutProbability, layout, mask_type, softmax_type,
                     window_size_left, window_size_right, deterministic, bias_type, qkv_tensor_type,
-                    o_tensor_type, do_tensor_type, dqkv_tensor_type) <
+                    o_tensor_type, do_tensor_type, dqkv_tensor_type, generate_max_sum_exp) <
            std::tie(rhs.b, rhs.h, rhs.hg, rhs.s_q, rhs.s_kv, rhs.d_qk, rhs.d_v, rhs.num_pages_k,
                     rhs.num_pages_v, rhs.page_size_k, rhs.page_size_v, rhs.max_pages_per_seq_k,
                     rhs.max_pages_per_seq_v, rhs.bias_b, rhs.bias_h, rhs.attnScale, rhs.isTraining,
                     rhs.dropoutProbability, rhs.layout, rhs.mask_type, rhs.softmax_type,
                     rhs.window_size_left, rhs.window_size_right, rhs.deterministic, rhs.bias_type,
                     rhs.qkv_tensor_type, rhs.o_tensor_type, rhs.do_tensor_type,
-                    rhs.dqkv_tensor_type);
+                    rhs.dqkv_tensor_type, rhs.generate_max_sum_exp);
   }
 };
 
diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h
index a150978c4a..518fad20de 100644
--- a/transformer_engine/common/include/transformer_engine/fused_attn.h
+++ b/transformer_engine/common/include/transformer_engine/fused_attn.h
@@ -190,29 +190,30 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout);
 
 /*! \brief Get fused attention backend based on input parameters.
  *
- *  \param[in]     is_training       Whether the model is in training mode.
- *  \param[in]     q_dtype           The data type of Tensor Q.
- *  \param[in]     kv_dtype          The data type of Tensors K, V.
- *  \param[in]     qkv_layout        The layout of Tensors Q, K, V.
- *  \param[in]     bias_type         The attention bias type.
- *  \param[in]     attn_mask_type    The attention mask type.
- *  \param[in]     softmax_type      The attention softmax type.
- *  \param[in]     dropout           The dropout probability.
- *  \param[in]     num_attn_heads    The number of heads in Q.
- *  \param[in]     num_gqa_groups    The number of heads in K, V.
- *  \param[in]     max_seqlen_q      The sequence length of Q.
- *  \param[in]     max_seqlen_kv     The sequence length of K, V.
- *  \param[in]     head_dim_qk       The head dimension of Q, K.
- *  \param[in]     head_dim_v        The head dimension of V.
- *  \param[in]     window_size_left  Sliding window size (the left half).
- *  \param[in]     window_size_right Sliding window size (the right half).
+ *  \param[in]     is_training         Whether the model is in training mode.
+ *  \param[in]     q_dtype             The data type of Tensor Q.
+ *  \param[in]     kv_dtype            The data type of Tensors K, V.
+ *  \param[in]     qkv_layout          The layout of Tensors Q, K, V.
+ *  \param[in]     bias_type           The attention bias type.
+ *  \param[in]     attn_mask_type      The attention mask type.
+ *  \param[in]     softmax_type        The attention softmax type.
+ *  \param[in]     dropout             The dropout probability.
+ *  \param[in]     num_attn_heads      The number of heads in Q.
+ *  \param[in]     num_gqa_groups      The number of heads in K, V.
+ *  \param[in]     max_seqlen_q        The sequence length of Q.
+ *  \param[in]     max_seqlen_kv       The sequence length of K, V.
+ *  \param[in]     head_dim_qk         The head dimension of Q, K.
+ *  \param[in]     head_dim_v          The head dimension of V.
+ *  \param[in]     window_size_left    Sliding window size (the left half).
+ *  \param[in]     window_size_right   Sliding window size (the right half).
+ *  \param[in]     return_max_logit    Whether to produce Max and Sum_Exp, or Stats.
  */
 NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
     bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTE_QKV_Layout qkv_layout,
     NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type,
     float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
     size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left,
-    int64_t window_size_right);
+    int64_t window_size_right, bool return_max_logit);
 
 /*! \brief Compute dot product attention with packed QKV input.
  *
@@ -255,6 +256,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
  *  \param[in]     max_seqlen               Max sequence length used for computing,
  *                                          it may be >= max(seqlen_i) for i=0,...batch_size-1.
  *  \param[in]     is_training              Whether this is in training mode or inference.
+ *  \param[in]     return_max_logit         Whether to produce Max and Sum_Exp, or Stats.
  *  \param[in]     attn_scale               Scaling factor for Q * K.T.
  *  \param[in]     dropout                  Dropout probability.
  *  \param[in]     qkv_layout               QKV tensor's layout.
@@ -266,13 +268,16 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
  *  \param[in]     workspace                Workspace tensor.
  *  \param[in]     stream                   CUDA stream used for this operation.
  */
-void nvte_fused_attn_fwd_qkvpacked(
-    const NVTETensor QKV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S,
-    NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens,
-    const NVTETensor cu_seqlens_padded, const NVTETensor rng_state, size_t max_seqlen,
-    bool is_training, float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout,
-    NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type,
-    int64_t window_size_left, int64_t window_size_right, NVTETensor workspace, cudaStream_t stream);
+void nvte_fused_attn_fwd_qkvpacked(const NVTETensor QKV, const NVTETensor Bias,
+                                   const NVTETensor SoftmaxOffset, NVTETensor S, NVTETensor O,
+                                   NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens,
+                                   const NVTETensor cu_seqlens_padded, const NVTETensor rng_state,
+                                   size_t max_seqlen, bool is_training, bool return_max_logit,
+                                   float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout,
+                                   NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
+                                   NVTE_Softmax_Type softmax_type, int64_t window_size_left,
+                                   int64_t window_size_right, NVTETensor workspace,
+                                   cudaStream_t stream);
 
 /*! \brief Compute the backward of the dot product attention with packed QKV input.
  *
@@ -381,6 +386,7 @@ void nvte_fused_attn_bwd_qkvpacked(const NVTETensor QKV, const NVTETensor O, con
  *  \param[in]     max_seqlen_kv             Max sequence length used for computing for KV.
  *                                           it may be >= max(seqlen_kv_i) for i=0,...batch_size-1.
  *  \param[in]     is_training               Whether this is in training mode or inference.
+ *  \param[in]     return_max_logit          Whether to produce Max and Sum_Exp, or Stats.
  *  \param[in]     attn_scale                Scaling factor for Q * K.T.
  *  \param[in]     dropout                   Dropout probability.
  *  \param[in]     qkv_layout                QKV tensor's layout.
@@ -399,7 +405,7 @@ void nvte_fused_attn_fwd_kvpacked(
     const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded,
     const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k,
     const NVTETensor page_table_v, const NVTETensor rng_state, size_t max_seqlen_q,
-    size_t max_seqlen_kv, bool is_training, float attn_scale, float dropout,
+    size_t max_seqlen_kv, bool is_training, bool return_max_logit, float attn_scale, float dropout,
     NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
     NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
     NVTETensor workspace, cudaStream_t stream);
@@ -520,6 +526,7 @@ void nvte_fused_attn_bwd_kvpacked(
  *  \param[in]     max_seqlen_kv             Max sequence length used for computing for K and V.
  *                                           it may be >= max(seqlen_kv_i) for i=0,...batch_size-1.
  *  \param[in]     is_training               Whether this is in training mode or inference.
+ *  \param[in]     return_max_logit          Whether to produce Max and Sum_Exp, or Stats.
  *  \param[in]     attn_scale                Scaling factor for Q * K.T.
  *  \param[in]     dropout                   Dropout probability.
  *  \param[in]     qkv_layout                QKV tensors' layout.
@@ -531,18 +538,16 @@ void nvte_fused_attn_bwd_kvpacked(
  *  \param[in]     workspace                 Workspace tensor.
  *  \param[in]     stream                    CUDA stream used for this operation.
  */
-void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETensor V,
-                         const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S,
-                         NVTETensor O, NVTETensorPack *Aux_CTX_Tensors,
-                         const NVTETensor cu_seqlens_q, const NVTETensor cu_seqlens_kv,
-                         const NVTETensor cu_seqlens_q_padded,
-                         const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k,
-                         const NVTETensor page_table_v, const NVTETensor rng_state,
-                         size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training,
-                         float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout,
-                         NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
-                         NVTE_Softmax_Type softmax_type, int64_t window_size_left,
-                         int64_t window_size_right, NVTETensor workspace, cudaStream_t stream);
+void nvte_fused_attn_fwd(
+    const NVTETensor Q, const NVTETensor K, const NVTETensor V, const NVTETensor Bias,
+    const NVTETensor SoftmaxOffset, NVTETensor S, NVTETensor O, NVTETensorPack *Aux_CTX_Tensors,
+    const NVTETensor cu_seqlens_q, const NVTETensor cu_seqlens_kv,
+    const NVTETensor cu_seqlens_q_padded, const NVTETensor cu_seqlens_kv_padded,
+    const NVTETensor page_table_k, const NVTETensor page_table_v, const NVTETensor rng_state,
+    size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training, bool return_max_logit,
+    float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
+    NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
+    int64_t window_size_right, NVTETensor workspace, cudaStream_t stream);
 
 /*! \brief Compute the backward of the dot product attention with separate Q, K and V.
  *
diff --git a/transformer_engine/jax/csrc/extensions/attention.cpp b/transformer_engine/jax/csrc/extensions/attention.cpp
index 9277569e11..ffc0706fe7 100644
--- a/transformer_engine/jax/csrc/extensions/attention.cpp
+++ b/transformer_engine/jax/csrc/extensions/attention.cpp
@@ -22,7 +22,8 @@ NVTE_Fused_Attn_Backend GetFusedAttnBackend(bool is_training, DType q_dtype, DTy
   auto backend = nvte_get_fused_attn_backend(
       is_training, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), qkv_layout,
       bias_type, mask_type, softmax_type, dropout_probability, q_attn_heads, kv_attn_heads,
-      q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right);
+      q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right,
+      false);
   return backend;
 }
 
@@ -179,17 +180,18 @@ pybind11::tuple GetFusedAttnForwardWorkspaceSizes(
           qkv_tensor.data(), bias_tensor.data(), dummy_softmax_offset_tensor.data(),
           s_tensor.data(), o_tensor.data(), &aux_output_tensors, q_cu_seqlens_tensor.data(),
           ragged_offset_tensor.data(), dummy_rng_state_tensor.data(), q_max_seqlen, is_training,
-          scaling_factor, dropout_probability, qkv_layout, bias_type, mask_type, softmax_type,
-          window_size_left, window_size_right, query_workspace_tensor.data(), nullptr);
+          false, scaling_factor, dropout_probability, qkv_layout, bias_type, mask_type,
+          softmax_type, window_size_left, window_size_right, query_workspace_tensor.data(),
+          nullptr);
     } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) {
       nvte_fused_attn_fwd_kvpacked(
           q_tensor.data(), kv_tensor.data(), bias_tensor.data(), dummy_softmax_offset_tensor.data(),
           s_tensor.data(), o_tensor.data(), &aux_output_tensors, q_cu_seqlens_tensor.data(),
           kv_cu_seqlens_tensor.data(), ragged_offset_tensor.data(), ragged_offset_tensor.data(),
           dummy_page_table_tensor.data(), dummy_page_table_tensor.data(),
-          dummy_rng_state_tensor.data(), q_max_seqlen, kv_max_seqlen, is_training, scaling_factor,
-          dropout_probability, qkv_layout, bias_type, mask_type, softmax_type, window_size_left,
-          window_size_right, query_workspace_tensor.data(), nullptr);
+          dummy_rng_state_tensor.data(), q_max_seqlen, kv_max_seqlen, is_training, false,
+          scaling_factor, dropout_probability, qkv_layout, bias_type, mask_type, softmax_type,
+          window_size_left, window_size_right, query_workspace_tensor.data(), nullptr);
     } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_HD_HD) {
       nvte_fused_attn_fwd(
           q_tensor.data(), k_tensor.data(), v_tensor.data(), bias_tensor.data(),
@@ -197,8 +199,8 @@ pybind11::tuple GetFusedAttnForwardWorkspaceSizes(
           q_cu_seqlens_tensor.data(), kv_cu_seqlens_tensor.data(), ragged_offset_tensor.data(),
           ragged_offset_tensor.data(), dummy_page_table_tensor.data(),
           dummy_page_table_tensor.data(), dummy_rng_state_tensor.data(), q_max_seqlen,
-          kv_max_seqlen, is_training, scaling_factor, dropout_probability, qkv_layout, bias_type,
-          mask_type, softmax_type, window_size_left, window_size_right,
+          kv_max_seqlen, is_training, false, scaling_factor, dropout_probability, qkv_layout,
+          bias_type, mask_type, softmax_type, window_size_left, window_size_right,
           query_workspace_tensor.data(), nullptr);
     } else {
       NVTE_ERROR("Unsupported QKVLayout.");
@@ -276,7 +278,8 @@ static void FusedAttnForwardImpl(
   auto backend = nvte_get_fused_attn_backend(
       is_training, static_cast<NVTEDType>(dtype), static_cast<NVTEDType>(dtype), qkv_layout,
       bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups,
-      q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right);
+      q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right,
+      false);
   nvte_populate_rng_state_async(rng_state, seed, q_max_seqlen, kv_max_seqlen, backend, stream);
 
   /* Auxiliary tensors (to be propagated to the backward pass later) */
@@ -294,7 +297,7 @@ static void FusedAttnForwardImpl(
     nvte_fused_attn_fwd_qkvpacked(
         qkv_tensor.data(), bias_tensor.data(), dummy_softmax_offset_tensor.data(), s_tensor.data(),
         o_tensor.data(), &aux_output_tensors, q_cu_seqlens_tensor.data(),
-        q_seq_offsets_tensor.data(), rng_state_tensor.data(), q_max_seqlen, is_training,
+        q_seq_offsets_tensor.data(), rng_state_tensor.data(), q_max_seqlen, is_training, false,
         scaling_factor, dropout_probability, qkv_layout, bias_type, mask_type, softmax_type,
         window_size_left, window_size_right, workspace_tensor.data(), stream);
   } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) {
@@ -308,8 +311,8 @@ static void FusedAttnForwardImpl(
         s_tensor.data(), o_tensor.data(), &aux_output_tensors, q_cu_seqlens_tensor.data(),
         kv_cu_seqlens_tensor.data(), q_seq_offsets_tensor.data(), k_seq_offsets_tensor.data(),
         dummy_page_table_tensor.data(), dummy_page_table_tensor.data(), rng_state_tensor.data(),
-        q_max_seqlen, kv_max_seqlen, is_training, scaling_factor, dropout_probability, qkv_layout,
-        bias_type, mask_type, softmax_type, window_size_left, window_size_right,
+        q_max_seqlen, kv_max_seqlen, is_training, false, scaling_factor, dropout_probability,
+        qkv_layout, bias_type, mask_type, softmax_type, window_size_left, window_size_right,
         workspace_tensor.data(), stream);
   } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_HD_HD) {
     auto q_shape = std::vector<size_t>{input_batch * q_max_seqlen, attn_heads, qk_head_dim};
@@ -323,7 +326,7 @@ static void FusedAttnForwardImpl(
         dummy_softmax_offset_tensor.data(), s_tensor.data(), o_tensor.data(), &aux_output_tensors,
         q_cu_seqlens_tensor.data(), kv_cu_seqlens_tensor.data(), q_seq_offsets_tensor.data(),
         k_seq_offsets_tensor.data(), dummy_page_table_tensor.data(), dummy_page_table_tensor.data(),
-        rng_state_tensor.data(), q_max_seqlen, kv_max_seqlen, is_training, scaling_factor,
+        rng_state_tensor.data(), q_max_seqlen, kv_max_seqlen, is_training, false, scaling_factor,
         dropout_probability, qkv_layout, bias_type, mask_type, softmax_type, window_size_left,
         window_size_right, workspace_tensor.data(), stream);
   } else {
@@ -542,7 +545,8 @@ static void FusedAttnBackwardImpl(
   auto backend = nvte_get_fused_attn_backend(
       is_training, static_cast<NVTEDType>(dtype), static_cast<NVTEDType>(dtype), qkv_layout,
       bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups,
-      q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right);
+      q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right,
+      false);
   PrepareFusedAttnBackwardAuxTensors(&aux_input_tensors, input_batch, bias_batch, attn_heads,
                                      bias_heads, q_max_seqlen, kv_max_seqlen, dtype, backend,
                                      softmax_aux, rng_state, bias);
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/backends.py b/transformer_engine/pytorch/attention/dot_product_attention/backends.py
index 6dfe0d31b3..95558e30da 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/backends.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/backends.py
@@ -58,6 +58,8 @@
     combine_and_quantize,
     combine_and_dequantize,
     print_quantizers,
+    ConvertTHDtoBSHD,
+    ConvertBSHDtoTHD,
 )
 from transformer_engine.pytorch.attention.dot_product_attention.utils import (
     AttentionLogging as attn_log,
@@ -201,6 +203,7 @@ def __init__(
         attention_dropout_ctx: Optional[Callable] = nullcontext,
         layer_number: Optional[int] = None,
         softmax_type: str = "vanilla",
+        return_max_logit: Optional[bool] = False,
     ) -> None:
         super().__init__()
 
@@ -209,6 +212,7 @@ def __init__(
         self.attention_dropout_ctx = attention_dropout_ctx
         self.layer_number = layer_number
         self.softmax_type = softmax_type
+        self.return_max_logit = return_max_logit
 
         def mask_func(x, y):
             return (
@@ -217,6 +221,7 @@ def mask_func(x, y):
                 else attention_mask_func(x, y)
             )
 
+        self.mask_func = mask_func
         self.scale_mask_softmax = FusedScaleMaskSoftmax(mask_func)
 
         # Dropout. Note that for a single iteration, this layer will generate
@@ -238,6 +243,8 @@ def forward(
         qkv_layout: str = "sbh3d",
         cu_seqlens_q: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
         cu_seqlens_kv: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
+        max_seqlen_q: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
+        max_seqlen_kv: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
         attn_mask_type: str = "causal",
         attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
         window_size: Optional[Tuple[int, int]] = None,
@@ -261,6 +268,9 @@ def forward(
         if inference_params is not None and inference_params.is_paged:
             key_layer, value_layer = inference_params.convert_paged_to_nonpaged(self.layer_number)
 
+        # convert to sbhd
+        # training: bshd, thd
+        # inference: bshd, sbhd_2bshd, thd_2bshd
         if qkv_format == "bshd":
             # convert to sbhd and use sbhd implementation for now
             query_layer, key_layer, value_layer = [
@@ -269,9 +279,8 @@ def forward(
         if qkv_format == "sbhd_2bshd":
             key_layer, value_layer = [x.transpose(0, 1) for x in [key_layer, value_layer]]
 
-        total_tokens, batch_size = None, None
         if qkv_format == "thd_2bshd":
-            total_tokens, batch_size = query_layer.shape[0], key_layer.shape[0]
+            batch_size = key_layer.shape[0]
             query_layer = tex.convert_thd_to_bshd(
                 query_layer,
                 cu_seqlens_q,
@@ -281,6 +290,26 @@ def forward(
             query_layer, key_layer, value_layer = [
                 x.transpose(0, 1) for x in [query_layer, key_layer, value_layer]
             ]
+        if qkv_format == "thd":
+            assert cu_seqlens_q is not None and cu_seqlens_kv is not None
+            assert max_seqlen_q is not None and max_seqlen_kv is not None
+            query_layer = ConvertTHDtoBSHD.apply(
+                query_layer,
+                cu_seqlens_q,
+                max_seqlen_q,
+            )
+            key_layer, value_layer = [
+                ConvertTHDtoBSHD.apply(
+                    x,
+                    cu_seqlens_kv,
+                    max_seqlen_kv,
+                )
+                for x in [key_layer, value_layer]
+            ]
+            query_layer, key_layer, value_layer = [
+                x.transpose(0, 1).contiguous() for x in [query_layer, key_layer, value_layer]
+            ]
+
         batch_size, max_seqlen_q, max_seqlen_kv = (
             query_layer.shape[1],
             query_layer.shape[0],
@@ -426,6 +455,15 @@ def forward(
                 matmul_result, None, None, dP_quantizer, "dP_quantizer", None
             )
 
+        # max attention score
+        max_logit = None
+        if self.return_max_logit:
+            # matmul_result [b, np, sq, dk], max_logit [np]
+            max_logit = matmul_result
+            if attn_mask_type != "no_mask":
+                max_logit = self.mask_func(matmul_result, attention_mask)
+            max_logit = torch.amax(max_logit, dim=(0, 2, 3))
+
         # add attention sink to the last column: [b, np, sq, sk+1]
         if self.softmax_type != "vanilla":
             matmul_result = torch.cat(
@@ -506,14 +544,13 @@ def forward(
             context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
 
             # [b, sq, np, hn] --> [tq, np, hn]
-            context_layer = tex.convert_bshd_to_thd(
+            context_layer = ConvertBSHDtoTHD.apply(
                 context_layer,
                 cu_seqlens_q,
-                total_tokens,
             )
 
             # [tq, np, hn] --> [tq, hp]
-            context_layer = context_layer.view(total_tokens, -1)
+            context_layer = context_layer.view(context_layer.shape[0], -1)
 
         if fp8:
             # quantize and dequantize O to emulate FP8
@@ -529,6 +566,9 @@ def forward(
             if fp8_output:
                 context_layer = O_quantizer(context_layer)
 
+        if self.return_max_logit:
+            return context_layer, max_logit
+
         return context_layer
 
 
@@ -1067,6 +1107,7 @@ def forward(
         softmax_offset,
         fp8_output,
         layer_number,
+        return_max_logit,
     ):
         # pylint: disable=missing-function-docstring
 
@@ -1102,6 +1143,7 @@ def forward(
         # FP8 attention:       torch.float16 or torch.bfloat16
         out_nominal_dtype = q.dtype
 
+        max_logit = None
         if fp8:
             fused_attention_backend = FusedAttnBackend["FP8"]
 
@@ -1129,7 +1171,7 @@ def forward(
             # DelayedScaling:       Float8Tensor; dtype = torch.float16 or torch.bfloat16
             #                                     fp8_dtype = tex.DType.kFloat8E4M3
             # Float8CurrentScaling: torch.Tensor; dtype = torch.float16 or torch.bfloat16
-            out_, aux_ctx_tensors = fused_attn_fwd(
+            out_, aux_ctx_tensors, *_ = fused_attn_fwd(
                 is_training,
                 max_seqlen_q,
                 max_seqlen_kv,
@@ -1205,7 +1247,7 @@ def forward(
                 qkvo_tensors = (q, k, v, out)
         else:
             # q, k, v, out_: torch.Tensor; dtype = torch.float16 or torch.bfloat16
-            out_, aux_ctx_tensors = fused_attn_fwd(
+            out_, aux_ctx_tensors, *max_logit = fused_attn_fwd(
                 is_training,
                 max_seqlen_q,
                 max_seqlen_kv,
@@ -1233,6 +1275,7 @@ def forward(
                 window_size,
                 rng_gen,
                 softmax_offset,
+                return_max_logit,
             )
             out = out_
             out_ret = out_
@@ -1327,10 +1370,12 @@ def forward(
         ctx.use_FAv2_bwd = use_FAv2_bwd
         ctx.deterministic = deterministic
 
+        if return_max_logit:
+            return out_ret, *max_logit
         return out_ret
 
     @staticmethod
-    def backward(ctx, d_out):
+    def backward(ctx, d_out, *_args):
         # pylint: disable=missing-function-docstring
 
         # d_out is expected to be in FP8 if is_output_fp8=True,
@@ -1574,6 +1619,7 @@ def backward(ctx, d_out):
             d_softmax_offset,
             None,
             None,
+            None,
         )
 
 
@@ -1614,6 +1660,7 @@ def __init__(
         layer_number: Optional[int] = None,
         deterministic: bool = False,
         softmax_type: str = "vanilla",
+        return_max_logit: Optional[bool] = False,
     ) -> None:
         super().__init__()
 
@@ -1627,6 +1674,7 @@ def __init__(
         self.layer_number = 1 if layer_number is None else layer_number
         self.deterministic = deterministic
         self.softmax_type = softmax_type
+        self.return_max_logit = return_max_logit
 
         def remove_extra_states_check(self, incompatible_keys):  # pylint: disable=unused-argument
             """
@@ -1846,6 +1894,7 @@ def forward(
                     softmax_offset=softmax_offset,
                     fp8_output=fp8_output,
                     layer_number=self.layer_number,
+                    return_max_logit=self.return_max_logit,
                 )
         else:
             with self.attention_dropout_ctx():
@@ -1881,7 +1930,11 @@ def forward(
                     softmax_offset,
                     fp8_output,
                     self.layer_number,
+                    self.return_max_logit,
                 )
 
+        if self.return_max_logit:
+            # ...hd -> ...(hd)
+            return output[0].view(*output[0].shape[:-2], -1), output[1]
         # ...hd -> ...(hd)
         return output.view(*output.shape[:-2], -1)
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
index a474cb809a..a503147be8 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
@@ -617,6 +617,7 @@ def cp_p2p_fwd_fused_attn(
     rank,
     step,
     cp_size,
+    return_max_logit,
     q_part,
     k_part,
     v_part,
@@ -693,7 +694,7 @@ def cp_p2p_fwd_fused_attn(
         fp8_meta_kwargs["s_quantizer"] = S_quantizer_per_step
         fp8_meta_kwargs["o_quantizer"] = O_quantizer_per_step
 
-    out_per_step, aux_ctx_tensors = fused_attn_fwd(
+    out_per_step, aux_ctx_tensors, *max_logit = fused_attn_fwd(
         is_training,
         max_seqlen_q_,
         max_seqlen_kv_,
@@ -713,6 +714,7 @@ def cp_p2p_fwd_fused_attn(
         cu_seqlens_q_padded=cu_seqlens_q_padded_,
         cu_seqlens_kv_padded=cu_seqlens_kv_padded_,
         **fp8_meta_kwargs,
+        return_max_logit=return_max_logit,
     )
 
     if fp8:
@@ -721,7 +723,9 @@ def cp_p2p_fwd_fused_attn(
         softmax_lse_per_step, rng_states, *rest = aux_ctx_tensors
         attn_bias = rest[0] if len(rest) > 0 else None
 
-    return out_per_step, softmax_lse_per_step, rng_states, attn_bias
+    if return_max_logit:
+        return out_per_step, softmax_lse_per_step, rng_states, attn_bias, *max_logit
+    return out_per_step, softmax_lse_per_step, rng_states, attn_bias, None
 
 
 def cp_p2p_fwd_flash_attn(
@@ -1086,6 +1090,7 @@ def forward(
         attn_bias,
         deterministic,
         use_fused_attention,
+        return_max_logit,
         fp8,
         fp8_meta,
         cp_group,
@@ -1156,6 +1161,8 @@ def forward(
         amax_per_step = None
         S_quantizer_per_step = [None for _ in range(cp_size)]
         O_quantizer_per_step = [None for _ in range(cp_size)]
+        max_logit_per_step = [None for _ in range(cp_size)]
+        max_logit = None
 
         assert isinstance(k, q.__class__) and isinstance(
             v, q.__class__
@@ -1244,6 +1251,10 @@ def forward(
             q_f16 = q
             if use_fused_attention:
                 fused_attn_backend = FusedAttnBackend["F16_arbitrary_seqlen"]
+            if return_max_logit:
+                max_logit_per_step = [
+                    torch.empty(q.shape[-2], dtype=q.dtype, device=q.device) for _ in range(cp_size)
+                ]
 
         # split qkv to two halves and prepare for load balancing
         assert qkv_format == "thd" or (
@@ -1418,6 +1429,7 @@ def forward(
                             rank,
                             i,
                             cp_size,
+                            return_max_logit,
                         ]
                     else:
                         flash_attn_inputs = [
@@ -1462,6 +1474,7 @@ def forward(
                                     softmax_lse_per_step[i],
                                     rng_states[i],
                                     attn_biases[i],
+                                    max_logit_per_step[i],
                                 ) = cp_p2p_fwd_fused_attn(
                                     *fused_attn_inputs, *prepare_outputs, section
                                 )
@@ -1488,6 +1501,7 @@ def forward(
                                     softmax_lse_per_step[i],
                                     rng_states[i],
                                     attn_biases[i],
+                                    max_logit_per_step[i],
                                 ) = cp_p2p_fwd_fused_attn(
                                     *fused_attn_inputs, *prepare_outputs, section
                                 )
@@ -1514,6 +1528,7 @@ def forward(
                                     softmax_lse_per_step[i],
                                     rng_states[i],
                                     attn_biases[i],
+                                    max_logit_per_step[i],
                                 ) = cp_p2p_fwd_fused_attn(
                                     *fused_attn_inputs, *prepare_outputs, section
                                 )
@@ -1541,6 +1556,7 @@ def forward(
                                 softmax_lse_per_step[i],
                                 rng_states[i],
                                 attn_biases[i],
+                                max_logit_per_step[i],
                             ) = cp_p2p_fwd_fused_attn(*fused_attn_inputs, *prepare_outputs, section)
                         else:
                             out_per_step[i], softmax_lse_per_step[i], rng_states[i] = (
@@ -1600,11 +1616,20 @@ def forward(
                                 softmax_lse.view(*softmax_lse.shape[:-1], 2, -1),
                                 softmax_lse_per_step[i - 1],
                             )
+                    if return_max_logit:
+                        if i == 1:
+                            max_logit = torch.clone(max_logit_per_step[0])
+                        else:
+                            max_logit = torch.maximum(max_logit, max_logit_per_step[i - 1])
 
                 if i < cp_size:
                     flash_attn_streams[(i - 1) % 2].record_event(fwd_results_correction_done)
 
         torch.cuda.current_stream().wait_stream(flash_attn_streams[1])
+        if return_max_logit:
+            torch.distributed.all_reduce(
+                max_logit, op=torch.distributed.ReduceOp.MAX, group=cp_group
+            )
 
         second_half_lse_seqlen = None
         if causal and rank < (cp_size - 1):
@@ -1682,6 +1707,10 @@ def forward(
                 elif qkv_format == "sbhd":
                     # [s*b, h, d] -> [s, b, h, d]
                     out = out.view(-1, ctx.batch_size, *out.shape[-2:])
+            if return_max_logit:
+                max_logit = flash_attn_a2a_communicate_softmax_offset(
+                    max_logit, 0, cp_size_a2a, cp_group_a2a, cp_stream, False
+                )
         elif not use_fused_attention:
             out = out.view(-1, *out.shape[-2:])
 
@@ -1811,10 +1840,12 @@ def forward(
 
         nvtx_range_pop(f"{nvtx_label}")
 
+        if return_max_logit:
+            return out_ret, max_logit
         return out_ret
 
     @staticmethod
-    def backward(ctx, dout):
+    def backward(ctx, dout, *_args):
         # pylint: disable=missing-function-docstring
 
         # add NVTX range
@@ -2522,6 +2553,7 @@ def backward(ctx, dout):
             None,
             None,
             None,
+            None,
         )
 
 
@@ -2577,6 +2609,7 @@ def forward(
         attn_bias,
         deterministic,
         use_fused_attention,
+        return_max_logit,
         window_size,
         cp_group,
         cp_stream,
@@ -2682,6 +2715,8 @@ def forward(
         softmax_lse_per_step = [None, None]
         rng_states = [None, None]
         out = torch.empty_like(q)
+        max_logit_per_step = [None, None]
+        max_logit = None
 
         for i in range(len(local_seq_chunk_ids) + 1):
             if i < len(local_seq_chunk_ids):
@@ -2712,7 +2747,11 @@ def forward(
                     # [s_range, b, h, d] -> [b, s_range, h, d] or [s_range, b, h, d]
                     k_, v_ = [x.movedim(0, seq_dim).contiguous() for x in [k_, v_]]
                     if use_fused_attention:
-                        out_per_step[i], [softmax_lse_per_step[i], rng_states[i]] = fused_attn_fwd(
+                        (
+                            out_per_step[i],
+                            [softmax_lse_per_step[i], rng_states[i]],
+                            *max_logit_,
+                        ) = fused_attn_fwd(
                             is_training,
                             max_seqlen_q,
                             max_seqlen_kv_,
@@ -2732,7 +2771,10 @@ def forward(
                             cu_seqlens_q_padded=cu_seqlens_q_padded,
                             cu_seqlens_kv_padded=cu_seqlens_kv_per_step[i],
                             window_size=window_size_per_step[i],
+                            return_max_logit=return_max_logit,
                         )
+                        if return_max_logit:
+                            max_logit_per_step[i] = max_logit_[0]
                     else:
                         fa_forward_args_thd = get_fa_args(
                             True,
@@ -2767,14 +2809,22 @@ def forward(
                             if not use_flash_attn_3:
                                 rng_states[i] = fa_outputs[3]
 
+            if return_max_logit and i == 0:
+                max_logit = torch.clone(max_logit_per_step[0])
             if i > 0:
                 with torch.cuda.stream(flash_attn_streams[i - 1]):
                     if qkv_format == "bshd":
                         out[:, i - 1].copy_(out_per_step[i - 1])
                     elif qkv_format == "sbhd":
                         out[i - 1].copy_(out_per_step[i - 1])
+                if return_max_logit:
+                    max_logit = torch.maximum(max_logit, max_logit_per_step[i - 1])
 
         torch.cuda.current_stream().wait_stream(cp_stream)
+        if return_max_logit:
+            torch.distributed.all_reduce(
+                max_logit, op=torch.distributed.ReduceOp.MAX, group=cp_group
+            )
 
         if use_fused_attention:
             if qkv_format == "bshd":
@@ -2811,10 +2861,12 @@ def forward(
         ctx.use_fused_attention = use_fused_attention
         ctx.use_flash_attn_3 = use_flash_attn_3
         nvtx_range_pop("transformer_engine.AttnFuncWithCPAndKVAllGather.forward")
+        if return_max_logit:
+            return out, max_logit
         return out
 
     @staticmethod
-    def backward(ctx, dout):
+    def backward(ctx, dout, *_args):
         # pylint: disable=missing-function-docstring
         nvtx_range_push("transformer_engine.AttnFuncWithCPAndKVAllGather.backward")
         cp_size = get_distributed_world_size(ctx.cp_group)
@@ -3035,6 +3087,7 @@ def backward(ctx, dout):
             None,
             None,
             None,
+            None,
         )
 
 
@@ -3065,6 +3118,7 @@ def forward(
         attn_bias,
         deterministic,
         use_fused_attention,
+        return_max_logit,
         window_size,
         fp8,
         fp8_meta,
@@ -3158,6 +3212,7 @@ def forward(
             fp8_recipe = fp8_meta["local_recipes"][0]
         fwd_nominal_dtype = q.dtype
         fused_attn_backend = None
+        max_logit = None
 
         QKV_quantizer, O_quantizer, S_quantizer, dQKV_quantizer, dO_quantizer, dP_quantizer = (
             dpa_utils.get_attention_quantizers(fp8, quantizers)
@@ -3203,7 +3258,7 @@ def forward(
                     Float8Tensor.make_like(x, data=y, dtype=fwd_nominal_dtype)
                     for x, y in zip([q_fp8, k_fp8, v_fp8], [q_part, k_part, v_part])
                 ]
-            out_, aux_ctx_tensors = fused_attn_fwd(
+            out_, aux_ctx_tensors, *max_logit = fused_attn_fwd(
                 is_training,
                 max_seqlen_q,
                 max_seqlen_kv,
@@ -3226,6 +3281,7 @@ def forward(
                 **fp8_meta_kwargs,
                 softmax_type=softmax_type,
                 softmax_offset=softmax_offset,
+                return_max_logit=return_max_logit,
             )
             if isinstance(out_, Float8Tensor):
                 out_fp8 = out_
@@ -3276,6 +3332,10 @@ def forward(
         out_ = flash_attn_a2a_communicate(
             out_, chunk_ids_for_a2a, seq_dim, cp_size, cp_group, cp_stream, False
         )
+        if return_max_logit:
+            max_logit = flash_attn_a2a_communicate_softmax_offset(
+                *max_logit, 0, cp_size, cp_group, cp_stream, False
+            )
 
         if use_fused_attention:
             if qkv_format == "bshd":
@@ -3362,10 +3422,12 @@ def forward(
             ctx.S_quantizer = S_quantizer.copy()
             ctx.S_quantizer.scale = S_quantizer.scale.clone()
         nvtx_range_pop("transformer_engine.AttnFuncWithCPAndQKVOA2A.forward")
+        if return_max_logit:
+            return out_ret, max_logit
         return out_ret
 
     @staticmethod
-    def backward(ctx, dout):
+    def backward(ctx, dout, *_args):
         # pylint: disable=missing-function-docstring
         nvtx_range_push("transformer_engine.AttnFuncWithCPAndQKVOA2A.backward")
         cp_size = get_distributed_world_size(ctx.cp_group)
@@ -3599,6 +3661,7 @@ def backward(ctx, dout):
             None,
             None,
             None,
+            None,
             d_softmax_offset,
             None,
         )
@@ -3637,6 +3700,7 @@ def attn_forward_func_with_cp(
     softmax_offset=None,
     fp8_output=False,
     layer_number=1,
+    return_max_logit=False,
 ) -> torch.Tensor:
     """
     Attention implementation with context parallelism (CP). CP partitions tensors along the sequence
@@ -3784,6 +3848,7 @@ def attn_forward_func_with_cp(
         attn_bias,
         deterministic,
         use_fused_attention,
+        return_max_logit,
     ]
 
     if cp_comm_type in ["p2p", "a2a+p2p"]:
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index 6d9ce9a522..0d1c0b0c05 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -255,6 +255,12 @@ class DotProductAttention(TransformerEngineBaseModule):
                  where alpha is a learnable parameter in shape [h].
                  'off-by-one' and 'learnable' softmax types are also called sink attention
                  ('zero sink' and 'learnable sink').
+    return_max_logit: Optional[bool], default = `False`
+                     If true, returns the maximum attention score that can be used in a Muon optimizer to
+                     rescale the Q and K projection weights (see `Muon is Scalable for LLM Training
+                     <https://arxiv.org/pdf/2502.16982>`_).
+                     max_logit = max(S), where S = mask(Q*K^T*softmax_scale + bias) in shape [b, h, s_q, s_kv],
+                     and max_logit is in shape [h].
 
     Parallelism parameters
     ----------------------
@@ -311,6 +317,7 @@ def __init__(
         cp_comm_type: str = "p2p",
         softmax_scale: Optional[float] = None,
         softmax_type: str = "vanilla",
+        return_max_logit: Optional[bool] = False,
     ) -> None:
         super().__init__()
 
@@ -394,6 +401,7 @@ def __init__(
 
         self.attention_type = attention_type
         self.attention_dropout = attention_dropout
+        self.return_max_logit = return_max_logit
 
         self.softmax_type = softmax_type
         if self.softmax_type == "vanilla":
@@ -431,6 +439,7 @@ def __init__(
             deterministic=self.deterministic,
             **attn_kwargs,
             softmax_type=self.softmax_type,
+            return_max_logit=self.return_max_logit,
         )
 
         self.unfused_attention = UnfusedDotProductAttention(
@@ -439,6 +448,7 @@ def __init__(
             **attn_kwargs,
             layer_number=layer_number,
             softmax_type=self.softmax_type,
+            return_max_logit=self.return_max_logit,
         )
 
         def remove_extra_states_check(self, incompatible_keys):  # pylint: disable=unused-argument
@@ -1303,6 +1313,7 @@ def forward(
                 fp8_meta=self.fp8_meta,
                 inference_params=inference_params,
                 softmax_type=self.softmax_type,
+                return_max_logit=self.return_max_logit,
             )
             global _attention_backends
             if is_in_onnx_export_mode():
@@ -1502,6 +1513,8 @@ def forward(
                         qkv_layout=qkv_layout,
                         cu_seqlens_q=cu_seqlens_q,
                         cu_seqlens_kv=cu_seqlens_kv,
+                        max_seqlen_q=max_seqlen_q,
+                        max_seqlen_kv=max_seqlen_kv,
                         attn_mask_type=attn_mask_type,
                         attention_mask=attention_mask,
                         window_size=window_size,
@@ -1523,6 +1536,8 @@ def forward(
                     qkv_layout=qkv_layout,
                     cu_seqlens_q=cu_seqlens_q,
                     cu_seqlens_kv=cu_seqlens_kv,
+                    max_seqlen_q=max_seqlen_q,
+                    max_seqlen_kv=max_seqlen_kv,
                     attn_mask_type=attn_mask_type,
                     attention_mask=attention_mask,
                     window_size=window_size,
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
index b45edc716d..50b00f2ceb 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -229,6 +229,8 @@ class AttentionParams:
         Inference-related parameters. See InferenceParams for details.
     softmax_type: str, default = "vanilla"
         The type of softmax operation. See DotProductAttention for details.
+    return_max_logit: bool, default = `False`
+        Whether to output max_logit.
     """
 
     qkv_type: Union[torch.Tensor, Float8Tensor] = torch.Tensor
@@ -257,6 +259,7 @@ class AttentionParams:
     fp8_meta: Union[Dict[str, Any], None] = None
     inference_params: Optional[InferenceParams] = None
     softmax_type: str = "vanilla"
+    return_max_logit: bool = False
 
     def __eq__(self, other):
         """
@@ -330,6 +333,7 @@ def get_attention_backend(
     fp8_meta = attention_params.fp8_meta
     inference_params = attention_params.inference_params
     softmax_type = attention_params.softmax_type
+    return_max_logit = attention_params.return_max_logit
 
     # Run config
     logger = logging.getLogger("DotProductAttention")
@@ -477,6 +481,20 @@ def get_attention_backend(
                 logger.debug("Disabling FusedAttention for FP8 current scaling with cuDNN < 9.14.0")
                 use_fused_attention = False
 
+    # Filter: Return max_logit
+    if return_max_logit:
+        if use_flash_attention:
+            use_flash_attention = False
+            logger.debug("Disabling FlashAttention for max_logit")
+        if use_fused_attention and qkv_format == "thd":
+            use_fused_attention = False
+            logger.debug("Disabling FusedAttention for max_logit with qkv_format = thd")
+        if fp8 and fp8_meta["recipe"].fp8_dpa:
+            use_flash_attention = False
+            use_fused_attention = False
+            use_unfused_attention = False
+            logger.debug("Disabling all backends for max_logit with FP8 attention")
+
     # Filter: KV cache
     # backend  | precision      |    KV cache     | architecture | qkv_format    | page_size
     # ---------------------------------------------------------------------------------------
@@ -913,6 +931,7 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt
             head_dim_v,
             window_size[0],
             window_size[1],
+            return_max_logit,
         )
         if fused_attention_backend == FusedAttnBackend["No_Backend"]:
             logger.debug("Disabling FusedAttention as no backend supports the provided input")
@@ -1649,6 +1668,78 @@ def backward(ctx, grad_output):
         return None, None, _pack_tensor(indices, grad_output)
 
 
+class ConvertTHDtoBSHD(torch.autograd.Function):
+    """
+    Convert a tensor from qkv_format = thd to qkv_format = bshd.
+    """
+
+    @staticmethod
+    def forward(ctx, thd_tensor, cu_seqlens, max_seqlen):
+        # pylint: disable=missing-function-docstring
+        batch_size = cu_seqlens.shape[0] - 1
+        if not thd_tensor.is_contiguous():
+            thd_tensor = thd_tensor.contiguous()
+        bshd_tensor = tex.convert_thd_to_bshd(
+            thd_tensor,
+            cu_seqlens,
+            batch_size,
+            max_seqlen,
+        )
+        ctx.save_for_backward(cu_seqlens)
+        ctx.num_tokens = thd_tensor.shape[0]
+        return bshd_tensor
+
+    @staticmethod
+    def backward(ctx, bshd_tensor):
+        # pylint: disable=missing-function-docstring
+        (cu_seqlens,) = ctx.saved_tensors
+        if not bshd_tensor.is_contiguous():
+            bshd_tensor = bshd_tensor.contiguous()
+        thd_tensor = tex.convert_bshd_to_thd(
+            bshd_tensor,
+            cu_seqlens,
+            ctx.num_tokens,
+        )
+        return thd_tensor, None, None
+
+
+class ConvertBSHDtoTHD(torch.autograd.Function):
+    """
+    Convert a tensor from qkv_format = bshd to qkv_format = thd.
+    """
+
+    @staticmethod
+    def forward(ctx, bshd_tensor, cu_seqlens):
+        # pylint: disable=missing-function-docstring
+        num_tokens = cu_seqlens[-1]
+        max_seqlen = bshd_tensor.shape[1]
+        if not bshd_tensor.is_contiguous():
+            bshd_tensor = bshd_tensor.contiguous()
+        thd_tensor = tex.convert_bshd_to_thd(
+            bshd_tensor,
+            cu_seqlens,
+            num_tokens,
+        )
+        ctx.save_for_backward(cu_seqlens)
+        ctx.max_seqlen = max_seqlen
+        return thd_tensor
+
+    @staticmethod
+    def backward(ctx, thd_tensor):
+        # pylint: disable=missing-function-docstring
+        (cu_seqlens,) = ctx.saved_tensors
+        batch_size = cu_seqlens.shape[0] - 1
+        if not thd_tensor.is_contiguous():
+            thd_tensor = thd_tensor.contiguous()
+        bshd_tensor = tex.convert_thd_to_bshd(
+            thd_tensor,
+            cu_seqlens,
+            batch_size,
+            ctx.max_seqlen,
+        )
+        return bshd_tensor, None
+
+
 def get_qkv_format(
     qkv_layout: str = "bshd_bshd_bshd",
     inference_params: InferenceParams = None,
diff --git a/transformer_engine/pytorch/cpp_extensions/fused_attn.py b/transformer_engine/pytorch/cpp_extensions/fused_attn.py
index 94a12c4a09..690e9f9869 100644
--- a/transformer_engine/pytorch/cpp_extensions/fused_attn.py
+++ b/transformer_engine/pytorch/cpp_extensions/fused_attn.py
@@ -139,6 +139,7 @@ def fused_attn_fwd(
     window_size: Tuple[int, int] = (-1, -1),
     rng_gen: torch.Generator = None,
     softmax_offset: torch.Tensor = None,
+    return_max_logit: bool = False,
 ) -> Tuple[Union[torch.Tensor, None], ...]:
     """Fused Attention FWD for separate QKV input.
 
@@ -216,6 +217,8 @@ def fused_attn_fwd(
     softmax_offset: torch.Tensor, default = None
                 softmax offset tensor in shape [1, h_q, 1, 1].
                 See softmax_type in DotProductAttention for details.
+    return_max_logit: bool, default = False
+                      whether to return the maximum attention score
 
     Returns
     ----------
@@ -246,6 +249,7 @@ def fused_attn_fwd(
                 rng_state: torch.Tensor, optional, if backend is not F16_max512_seqlen
                     state of the random number generator;
                     [seed, offset], dtype uint64
+    max_logit: if return_max_logit = True, shape [h] and same data type as O; otherwise None
     """
 
     if attn_scale is None:
@@ -315,8 +319,22 @@ def fused_attn_fwd(
         softmax_offset,
         rng_gen,
         rng_elts_per_thread,
+        return_max_logit,
     )
 
+    if return_max_logit:
+        qkv_format = qkv_layout.replace("3", "").replace("2", "").split("_")[0]
+        # thd:  output_tensors: out [tq, h, d],    Max [tq, h, 1],    Sum_Exp [tq, h, 1]
+        # bshd: output_tensors: out [b, sq, h, d], Max [b, h, sq, 1], Sum_Exp [b, h, sq, 1]
+        # sbhd: output_tensors: out [sq, b, h, d], Max [b, h, sq, 1], Sum_Exp [b, h, sq, 1]
+        stats = output_tensors[1] + torch.log(output_tensors[2])
+        amax_dims = (0, 2) if qkv_format == "thd" else (0, 2, 3)
+        # Max -> max_logit [h]
+        max_logit = torch.amax(output_tensors[1], dim=amax_dims).to(dtype=output_tensors[0].dtype)
+        aux_ctx_tensors = [stats]
+        aux_ctx_tensors.extend(output_tensors[3:])
+        return output_tensors[0], aux_ctx_tensors, max_logit
+
     # out, aux_ctx_tensors
     return output_tensors[0], output_tensors[1:]
 
diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h
index d86a96959c..79fb798422 100644
--- a/transformer_engine/pytorch/csrc/extensions.h
+++ b/transformer_engine/pytorch/csrc/extensions.h
@@ -76,7 +76,7 @@ NVTE_Fused_Attn_Backend get_fused_attn_backend(
     NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type,
     float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
     size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left,
-    int64_t window_size_right);
+    int64_t window_size_right, bool return_max_logit);
 
 std::pair<TensorWrapper, py::object> quantizer_helper(py::handle quantizer,
                                                       const std::vector<size_t> &shape, DType dtype,
@@ -94,7 +94,7 @@ std::vector<py::object> fused_attn_fwd(
     const std::optional<at::Tensor> page_table_k, const std::optional<at::Tensor> page_table_v,
     py::handle s_quantizer, py::handle o_quantizer, const std::optional<at::Tensor> Bias,
     const std::optional<at::Tensor> SoftmaxOffset, const std::optional<at::Generator> rng_gen,
-    size_t rng_elts_per_thread);
+    size_t rng_elts_per_thread, bool return_max_logit);
 
 std::vector<py::object> fused_attn_bwd(
     size_t max_seqlen_q, size_t max_seqlen_kv, float attn_scale, float p_dropout, bool set_zero,
diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp
index 344bc4ab0b..f66c8aa619 100644
--- a/transformer_engine/pytorch/csrc/extensions/attention.cpp
+++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp
@@ -45,11 +45,12 @@ NVTE_Fused_Attn_Backend get_fused_attn_backend(
     NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type,
     float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
     size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left,
-    int64_t window_size_right) {
+    int64_t window_size_right, bool return_max_logit) {
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       is_training, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), qkv_layout,
       bias_type, attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups,
-      max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right);
+      max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right,
+      return_max_logit);
   return fused_attention_backend;
 }
 
@@ -106,7 +107,7 @@ std::vector<py::object> fused_attn_fwd(
     const std::optional<at::Tensor> page_table_k, const std::optional<at::Tensor> page_table_v,
     py::handle s_quantizer, py::handle o_quantizer, const std::optional<at::Tensor> Bias,
     const std::optional<at::Tensor> SoftmaxOffset, const std::optional<at::Generator> rng_gen,
-    size_t rng_elts_per_thread) {
+    size_t rng_elts_per_thread, bool return_max_logit) {
   auto none = py::none();
 
   // create QKV tensor wrappers
@@ -228,8 +229,9 @@ std::vector<py::object> fused_attn_fwd(
         te_O.data(), &nvte_aux_tensor_pack, te_cu_seqlens_q.data(), te_cu_seqlens_kv.data(),
         te_cu_seqlens_q_padded.data(), te_cu_seqlens_kv_padded.data(), te_page_table_k.data(),
         te_page_table_v.data(), te_rng_state.data(), max_seqlen_q, max_seqlen_kv, is_training,
-        attn_scale, p_dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size[0],
-        window_size[1], workspace.data(), at::cuda::getCurrentCUDAStream());
+        return_max_logit, attn_scale, p_dropout, qkv_layout, bias_type, attn_mask_type,
+        softmax_type, window_size[0], window_size[1], workspace.data(),
+        at::cuda::getCurrentCUDAStream());
   });
 
   // allocate memory for workspace and auxiliary output tensors
@@ -249,7 +251,9 @@ std::vector<py::object> fused_attn_fwd(
   };
   // allocate memory for nvte_aux_tensor_pack.tensors
   // f16_max512   : S [b, h, sq, skv]
-  // f16_arbitrary: S [b, h, sq, 1], rng_state [2], (optional) Bias [1, h, sq, skv], (optional) SoftmaxOffset [1, h, 1, 1]
+  // f16_arbitrary:
+  // return_max_logit=false: S [b, h, sq, 1], rng_state [2], (optional) Bias [1, h, sq, skv], (optional) SoftmaxOffset [1, h, 1, 1]
+  // return_max_logit=true: Max [b, h, sq, 1], Sum_Exp [b, h, sq, 1], rng_state [2], (optional) Bias [1, h, sq, skv], (optional) SoftmaxOffset [1, h, 1, 1]
   // fp8          : M [b, h, sq, 1], ZInv [b, h, sq, 1], rng_state [2]
   size_t i = 0;
   at::Tensor output_tensor;
@@ -258,8 +262,8 @@ std::vector<py::object> fused_attn_fwd(
       allocateSpace(nvte_shape_to_vector(nvte_tensor_shape(nvte_aux_tensor_pack.tensors[i])),
                     static_cast<DType>(nvte_tensor_type(nvte_aux_tensor_pack.tensors[i])), false);
   set_tensor_param(i++, output_tensor);
-  // fp8 has an additional softmax stats tensor, ZInv
-  if (qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) {
+  // fp8 has an additional softmax stats tensor, ZInv; return_max_logit=true has an additional Sum_Exp tensor
+  if (return_max_logit || qkv_type == DType::kFloat8E4M3 || qkv_type == DType::kFloat8E5M2) {
     output_tensor =
         allocateSpace(nvte_shape_to_vector(nvte_tensor_shape(nvte_aux_tensor_pack.tensors[i])),
                       static_cast<DType>(nvte_tensor_type(nvte_aux_tensor_pack.tensors[i])), false);
@@ -285,8 +289,9 @@ std::vector<py::object> fused_attn_fwd(
         te_O.data(), &nvte_aux_tensor_pack, te_cu_seqlens_q.data(), te_cu_seqlens_kv.data(),
         te_cu_seqlens_q_padded.data(), te_cu_seqlens_kv_padded.data(), te_page_table_k.data(),
         te_page_table_v.data(), te_rng_state.data(), max_seqlen_q, max_seqlen_kv, is_training,
-        attn_scale, p_dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, window_size[0],
-        window_size[1], workspace.data(), at::cuda::getCurrentCUDAStream());
+        return_max_logit, attn_scale, p_dropout, qkv_layout, bias_type, attn_mask_type,
+        softmax_type, window_size[0], window_size[1], workspace.data(),
+        at::cuda::getCurrentCUDAStream());
   });
 
   // destroy tensor wrappers, but not allocated memory

From fa71964f70e54848a4ba1d6ebf52e90cb5f80b04 Mon Sep 17 00:00:00 2001
From: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Date: Mon, 20 Oct 2025 16:28:23 -0400
Subject: [PATCH 08/59] [PyTorch] Fix CI failures due to deterministic
 attention backend (#2288)

* Fix CI failures due to deterministic attention

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* some more cleanup

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix debug test

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
---
 qa/L0_pytorch_debug_unittest/test.sh          |  2 +-
 qa/L0_pytorch_unittest/test.sh                |  4 +--
 tests/pytorch/test_numerics.py                | 30 +------------------
 .../attention/dot_product_attention/utils.py  |  2 +-
 4 files changed, 5 insertions(+), 33 deletions(-)

diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index 7f19dda670..9980ccfb05 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -32,6 +32,6 @@ pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/
 
 # standard sanity and numerics tests with initialized debug
 NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1
-NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
+NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
 
 exit $FAIL
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index cdf0df8887..b23ce3b6cf 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -27,8 +27,8 @@ pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
+PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
+PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py || test_fail "test_jit.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4 || test_fail "test_nvfp4"
diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py
index bef076a385..35698b819c 100644
--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -43,11 +43,10 @@
 )
 from transformer_engine.pytorch import checkpoint as te_checkpoint
 from transformer_engine.pytorch.cpp_extensions import general_gemm, general_grouped_gemm
-from transformer_engine.pytorch.cpp_extensions.fused_attn import FusedAttnBackend
 from transformer_engine.pytorch.module.base import get_multi_stream_cublas_workspace, get_workspace
 from transformer_engine.common import recipe
 import transformer_engine_torch as tex
-from utils import ModelConfig, reset_rng_states, get_available_attention_backends
+from utils import ModelConfig, reset_rng_states
 
 
 # Only run FP8 tests on supported devices.
@@ -130,23 +129,6 @@
     use_cutlass_grouped_gemm.append(True)
 
 
-def is_fused_attn_available(
-    config: ModelConfig,
-    dtype: torch.dtype,
-    qkv_layout="bshd_bshd_bshd",
-    is_training=True,
-    deterministic=False,
-):
-    _, _, fused_attn_backends = get_available_attention_backends(
-        config,
-        qkv_dtype=dtype,
-        qkv_layout=qkv_layout,
-        is_training=is_training,
-        deterministic=deterministic,
-    )
-    return FusedAttnBackend["F16_arbitrary_seqlen"] in fused_attn_backends
-
-
 def get_causal_attn_mask(sq: int) -> torch.Tensor:
     return torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool()
 
@@ -853,8 +835,6 @@ def _test_e2e_checkpointing(bs, dtype, config, checkpoint=False, steps=10, path=
 @pytest.mark.parametrize("model", ["126m"])
 def test_gpt_checkpointing(dtype, bs, model):
     config = model_configs[model]
-    if not is_fused_attn_available(config, dtype, deterministic=True):
-        pytest.skip("No attention backend available.")
     outputs = _test_e2e_checkpointing(bs, dtype, config, checkpoint=False)
     outputs_checkpoint = _test_e2e_checkpointing(bs, dtype, config, checkpoint=True)
 
@@ -901,10 +881,6 @@ def _test_e2e_gpt_accuracy(block, bs, dtype, config):
 @pytest.mark.parametrize("parallel_attention_mlp", all_boolean)
 def test_gpt_accuracy(dtype, bs, model, parallel_attention_mlp):
     config = model_configs[model]
-    if not is_fused_attn_available(
-        config, dtype, qkv_layout="sb3hd", is_training=True, deterministic=True
-    ):
-        pytest.skip("No attention backend available.")
 
     te_gpt = TransformerLayer(
         hidden_size=config.hidden_size,
@@ -1016,10 +992,6 @@ def _test_mha_accuracy(block, bs, dtype, config, mask_type, te=True):
 @pytest.mark.parametrize("mask_type", mask_types)
 def test_mha_accuracy(dtype, bs, model, mask_type):
     config = model_configs[model]
-    if not is_fused_attn_available(
-        config, dtype, qkv_layout="sb3hd", is_training=True, deterministic=True
-    ):
-        pytest.skip("No attention backend available.")
 
     te_mha = MultiheadAttention(
         config.hidden_size,
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
index 50b00f2ceb..bb17f66e06 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -1002,7 +1002,7 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt
             logger.debug("Disabling FusedAttention for determinism reasons with post_scale_bias")
             use_fused_attention = False
             fused_attention_backend = None
-        if is_training and device_compute_capability >= (10, 0) and cudnn_version <= (9, 14, 0):
+        if is_training and device_compute_capability >= (10, 0):
             logger.debug("Disabling FusedAttention for determinism reasons on Blackwell")
             use_fused_attention = False
             fused_attention_backend = None

From fe9b150939a180cc0db7c7b028a9ce55aeb38f58 Mon Sep 17 00:00:00 2001
From: Kshitij Lakhani <33047503+KshitijLakhani@users.noreply.github.com>
Date: Thu, 30 Oct 2025 10:27:42 -0700
Subject: [PATCH 09/59] [JAX] Fix: Skip determinism tests for bprop for all sm
 >=100 (#2315)

* Fix: Skip determinism tests for bprop for all sm >=100

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>

* Add username to TODO

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>

* Assert in fused attn bwd pass for sm100+

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 tests/jax/test_fused_attn.py                       | 6 +++---
 transformer_engine/jax/cpp_extensions/attention.py | 7 +++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py
index 5b814cb99f..a5d73d9605 100644
--- a/tests/jax/test_fused_attn.py
+++ b/tests/jax/test_fused_attn.py
@@ -378,14 +378,14 @@ def _check_configs(self):
             pytest.skip(
                 "seqlen_q > seqlen_kv is not supported with sliding window attention in cuDNN"
             )
-
+        # TODO(KshitijLakhani): Set the upper limit for skipping this test when cuDNN adds support
         if (
-            get_device_compute_capability(0) == 100
+            get_device_compute_capability(0) >= 100
             and self.dropout_prob == 0.1
             and self.attn_bias_type is not AttnBiasType.NO_BIAS
         ):
             pytest.skip(
-                "For sm100, bprop kernel support for dropout + determinism (bias) is not supported"
+                "For sm100+, bprop kernel support for dropout + determinism (bias) is not supported"
             )
         # Test the MLA case where head dims for qk differ from head dims for v, only if the tensors
         # are provided in BSHD_BSHD_BSHD or THD_THD_THD formats
diff --git a/transformer_engine/jax/cpp_extensions/attention.py b/transformer_engine/jax/cpp_extensions/attention.py
index db2537c38f..c0cb6cda1f 100644
--- a/transformer_engine/jax/cpp_extensions/attention.py
+++ b/transformer_engine/jax/cpp_extensions/attention.py
@@ -2739,10 +2739,13 @@ def fused_attn_bwd(
         assert bias is None
         bias = jnp.zeros(0, dtype=qkv[0].dtype)
 
-    if 100 in get_all_device_compute_capability():
+    # TODO(KshitijLakhani): Add a check for cuDNN version when determinism does get supported on
+    # sm100+
+    compute_capabilities = get_all_device_compute_capability()
+    if any(x >= 100 for x in compute_capabilities):
         assert not (
             attn_bias_type != AttnBiasType.NO_BIAS and dropout_probability != 0
-        ), "For sm100, bprop kernel support for dropout + determinism (bias) is not supported"
+        ), "For sm100+, bprop kernel support for dropout + determinism (bias) is not supported"
 
     fused_config = _FusedAttnConfig(
         attn_bias_type=attn_bias_type,

From 0acd0e7dbe9458273901a90714d507c01495a2e6 Mon Sep 17 00:00:00 2001
From: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Date: Thu, 30 Oct 2025 15:32:37 -0400
Subject: [PATCH 10/59] [PyTorch] Fix attention backend and tests for `sm120`
 (#2320)

* Fix attention backend and tests for sm120

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Disable MLA only for backward

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
---
 tests/pytorch/attention/test_attention.py     | 22 +++++++-----
 .../attention/dot_product_attention/utils.py  | 35 +++++++++++++++++++
 2 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/tests/pytorch/attention/test_attention.py b/tests/pytorch/attention/test_attention.py
index 63b877e68f..c23f289547 100644
--- a/tests/pytorch/attention/test_attention.py
+++ b/tests/pytorch/attention/test_attention.py
@@ -60,8 +60,16 @@
     get_available_attention_backends,
 )
 
-# Check if hardware supports FP8
+# Check if hardware supports FP8 attention.
 fp8_available, reason_for_no_fp8 = is_fp8_available(return_reason=True)
+fp8_attn_available, reason_for_no_fp8_attn = fp8_available, reason_for_no_fp8
+device_compute_capability = get_device_compute_capability()
+if fp8_available and (device_compute_capability < (9, 0) or device_compute_capability >= (12, 0)):
+    fp8_attn_available = False
+    reason_for_no_fp8_attn = (
+        "FP8 attention is not supported for compute capability ="
+        f" sm{device_compute_capability[0] * 10 + device_compute_capability[1]}"
+    )
 
 # Reset RNG seed and states
 seed = 1234
@@ -1572,8 +1580,7 @@ def _run_transformer_layer(
 }
 
 
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.skipif(get_device_compute_capability() < (9, 0), reason="FP8 tests require Hopper.")
+@pytest.mark.skipif(not fp8_attn_available, reason=reason_for_no_fp8_attn)
 @pytest.mark.skipif(get_cudnn_version() < (9, 3, 0), reason="cuDNN 9.3.0+ is required.")
 @pytest.mark.parametrize("model", ["large"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@@ -1735,8 +1742,7 @@ def get_model(dtype, config):
 
 
 @pytest.mark.skipif(get_cudnn_version() < (9, 2, 1), reason="cuDNN 9.2.1+ is required.")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.skipif(get_device_compute_capability() < (9, 0), reason="FP8 tests require Hopper+.")
+@pytest.mark.skipif(not fp8_attn_available, reason=reason_for_no_fp8_attn)
 @pytest.mark.parametrize("dtype", param_types_fp8_vs_f16)
 @pytest.mark.parametrize("model", model_configs_fp8_vs_f16.keys())
 @pytest.mark.parametrize("qkv_format", qkv_format_fp8_vs_f16)
@@ -1972,8 +1978,7 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
 
 
 @pytest.mark.skipif(get_cudnn_version() < (9, 2, 1), reason="cuDNN 9.2.1+ is required.")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.skipif(get_device_compute_capability() < (9, 0), reason="FP8 tests require Hopper+.")
+@pytest.mark.skipif(not fp8_attn_available, reason=reason_for_no_fp8_attn)
 @pytest.mark.parametrize("dtype", param_types_fp8_vs_f16)
 @pytest.mark.parametrize("model", model_configs_fp8_vs_f16.keys())
 @pytest.mark.parametrize("qkv_layout", qkv_layout_fp8_vs_f16)
@@ -2301,8 +2306,7 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
     ),
     reason=f"""cuDNN {"8.9.3" if cudnn_frontend_version == 0 else "9.2.1"}+ is required.""",
 )
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.skipif(get_device_compute_capability() < (9, 0), reason="FP8 tests require Hopper+.")
+@pytest.mark.skipif(not fp8_attn_available, reason=reason_for_no_fp8_attn)
 @pytest.mark.parametrize("dtype", param_types_fp8)
 @pytest.mark.parametrize("model", models_v1 if cudnn_frontend_version == 1 else models_v0)
 def test_custom_mha_fp8_vs_f16(dtype, model):
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
index bb17f66e06..feabfabac7 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -481,6 +481,20 @@ def get_attention_backend(
                 logger.debug("Disabling FusedAttention for FP8 current scaling with cuDNN < 9.14.0")
                 use_fused_attention = False
 
+        if device_compute_capability == (12, 0):
+            if use_flash_attention:
+                logger.debug(
+                    "Disabling FlashAttention as FP8 is not supported"
+                    " for compute capability = sm120"
+                )
+            if use_fused_attention:
+                logger.debug(
+                    "Disabling FusedAttention as FP8 is not supported"
+                    " for compute capability = sm120"
+                )
+            use_flash_attention = False
+            use_fused_attention = False
+
     # Filter: Return max_logit
     if return_max_logit:
         if use_flash_attention:
@@ -560,6 +574,20 @@ def get_attention_backend(
                 qkv_layout,
             )
             use_fused_attention = False
+        if (
+            device_compute_capability == (12, 0)
+            and (head_dim_qk > 128 or head_dim_qk % 8 != 0)
+            and is_training
+        ):
+            if use_fused_attention:
+                logger.debug(
+                    "Disabling FusedAttention as MLA for backward pass is not supported for compute"
+                    " capability = sm120 for a head_dim_qk > 128 or head_dim_qk %%8 != 0. Found:"
+                    " head_dim_qk = %s",
+                    head_dim_qk,
+                )
+            use_fused_attention = False
+
     if use_flash_attention_2 and (
         head_dim_qk > 256
         or head_dim_qk % 8 != 0
@@ -629,6 +657,13 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt
                     "padding between sequences, i.e. [a, a, PAD, b, b, b, PAD, c, PAD]"
                 )
             use_flash_attention = False
+        if device_compute_capability == (12, 0):
+            if use_fused_attention:
+                logger.debug(
+                    "Disabling FusedAttention as qkv_format = thd is"
+                    " not supported for compute capability = sm120"
+                )
+            use_fused_attention = False
 
     # Filter: Dropout
     if attention_dropout != 0.0 and use_flash_attention_3:

From 9cc089a25c045ca319bccf2113170137e3ca0d20 Mon Sep 17 00:00:00 2001
From: Kshitij Lakhani <33047503+KshitijLakhani@users.noreply.github.com>
Date: Thu, 30 Oct 2025 15:50:16 -0700
Subject: [PATCH 11/59] [PyT] Bump the min version expected to supported FP8
 current scaling determinism on Blackwell (#2316)

* Bump the min version expected to supported FP8 cs det on Blackwell

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>

* Disable fused attn for cudnn < 9.14 for FP8 CS. Disable fused attn for cudnn < 9.18 for FP8 deterministic CS

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../attention/dot_product_attention/utils.py   | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
index feabfabac7..6bcc9f25da 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -477,9 +477,21 @@ def get_attention_backend(
             if device_compute_capability < (10, 0):
                 logger.debug("Disabling FusedAttention for FP8 current scaling on arch < sm100")
                 use_fused_attention = False
-            elif cudnn_version < (9, 14, 0):
-                logger.debug("Disabling FusedAttention for FP8 current scaling with cuDNN < 9.14.0")
-                use_fused_attention = False
+            # TODO(cyanguwa): Modify the min cuDNN version supporting FP8 current scaling
+            # determinism for Blackwell
+            else:
+                if cudnn_version < (9, 14, 0):
+                    logger.debug(
+                        "Disabling FusedAttention for FP8 current scaling with cuDNN < 9.14.0"
+                    )
+                    use_fused_attention = False
+                else:
+                    if deterministic and cudnn_version < (9, 18, 0):
+                        logger.debug(
+                            "Disabling FusedAttention for FP8 current scaling requiring determinism"
+                            " with cuDNN < 9.18.0"
+                        )
+                        use_fused_attention = False
 
         if device_compute_capability == (12, 0):
             if use_flash_attention:

From 70f536662ae10a62a54f4ed1ba92e3314c5cfd69 Mon Sep 17 00:00:00 2001
From: jberchtold-nvidia <158520091+jberchtold-nvidia@users.noreply.github.com>
Date: Thu, 30 Oct 2025 16:45:44 -0700
Subject: [PATCH 12/59] [JAX] Ensure JAX reference impl uses an accurate
 backend in our tests (#2322)

Ensure JAX reference impl uses an accurate backend

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>
---
 qa/L1_jax_distributed_unittest/test.sh | 3 ++-
 qa/L2_jax_distributed_unittest/test.sh | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/qa/L1_jax_distributed_unittest/test.sh b/qa/L1_jax_distributed_unittest/test.sh
index 270f0df15e..42b70a28e0 100644
--- a/qa/L1_jax_distributed_unittest/test.sh
+++ b/qa/L1_jax_distributed_unittest/test.sh
@@ -8,5 +8,6 @@ set -xe
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
-NVTE_JAX_UNITTEST_LEVEL="L1" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/jax/test_distributed_*
+# Use --xla_gpu_enable_triton_gemm=false to ensure the reference JAX implementation we are using is accurate.
+XLA_FLAGS="$XLA_FLAGS --xla_gpu_enable_triton_gemm=false" NVTE_JAX_UNITTEST_LEVEL="L1" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/jax/test_distributed_*
 SCRIPT_NAME=$TE_PATH/tests/jax/test_multi_process_distributed_grouped_gemm.py bash $TE_PATH/tests/jax/multi_process_launch.sh
diff --git a/qa/L2_jax_distributed_unittest/test.sh b/qa/L2_jax_distributed_unittest/test.sh
index 0b73726502..de5624a596 100644
--- a/qa/L2_jax_distributed_unittest/test.sh
+++ b/qa/L2_jax_distributed_unittest/test.sh
@@ -8,4 +8,5 @@ set -xe
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
-NVTE_JAX_UNITTEST_LEVEL="L2" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/jax/test_distributed_*
+# Use --xla_gpu_enable_triton_gemm=false to ensure the reference JAX implementation we are using is accurate.
+XLA_FLAGS="$XLA_FLAGS --xla_gpu_enable_triton_gemm=false" NVTE_JAX_UNITTEST_LEVEL="L2" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/jax/test_distributed_*

From bae9d3acdabeb37dbd3717c4435791f390adc594 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Thu, 11 Dec 2025 20:00:30 +0800
Subject: [PATCH 13/59] [Version] Reset to TransformerEngine v2.9 (#5)

# Description

Add the FlagOS multi-chip backend for TransformerEngine

Fixes # (issue)

## Type of change

- [ ] Documentation change (change only to the documentation, either a
fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Infra/Build change
- [ ] Code refactoring

## Changes

Please list the changes introduced in this PR:

- Change A
- Change B

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes

---------

Co-authored-by: zhaoyinglia <ylzhao@baai.ac.cn>
---
 .../dot_product_attention/backends.py         | 436 ++++++++++++++++++
 transformer_engine/plugins/backend.py         | 166 +++++++
 transformer_engine/plugins/backend_fl.py      |  40 ++
 transformer_engine/plugins/backend_native.py  |  43 ++
 .../plugins/cpp_extensions/__init__.py        |   9 +
 .../plugins/cpp_extensions/fused_adam.py      |  80 ++++
 .../plugins/cpp_extensions/gemm.py            | 109 +++++
 .../cpp_extensions/multi_tensor_apply.py      |  23 +
 .../plugins/cpp_extensions/rmsnorm.py         |  55 +++
 transformer_engine/plugins/import_utils.py    | 113 +++++
 transformer_engine/plugins/logger.py          |  49 ++
 transformer_engine/plugins/module/_common.py  |  36 ++
 transformer_engine/plugins/register.py        | 144 ++++++
 .../dot_product_attention.py                  |   3 +-
 .../pytorch/module/layernorm_linear.py        |  14 +-
 transformer_engine/pytorch/module/linear.py   |   8 +-
 .../pytorch/ops/basic/rmsnorm.py              |   8 +-
 .../pytorch/optimizers/__init__.py            |   5 +-
 .../pytorch/optimizers/fused_adam.py          |   9 +-
 19 files changed, 1332 insertions(+), 18 deletions(-)
 create mode 100644 transformer_engine/plugins/attention/dot_product_attention/backends.py
 create mode 100644 transformer_engine/plugins/backend.py
 create mode 100644 transformer_engine/plugins/backend_fl.py
 create mode 100644 transformer_engine/plugins/backend_native.py
 create mode 100644 transformer_engine/plugins/cpp_extensions/__init__.py
 create mode 100644 transformer_engine/plugins/cpp_extensions/fused_adam.py
 create mode 100644 transformer_engine/plugins/cpp_extensions/gemm.py
 create mode 100644 transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py
 create mode 100644 transformer_engine/plugins/cpp_extensions/rmsnorm.py
 create mode 100644 transformer_engine/plugins/import_utils.py
 create mode 100644 transformer_engine/plugins/logger.py
 create mode 100644 transformer_engine/plugins/module/_common.py
 create mode 100644 transformer_engine/plugins/register.py

diff --git a/transformer_engine/plugins/attention/dot_product_attention/backends.py b/transformer_engine/plugins/attention/dot_product_attention/backends.py
new file mode 100644
index 0000000000..3c9ca43a1e
--- /dev/null
+++ b/transformer_engine/plugins/attention/dot_product_attention/backends.py
@@ -0,0 +1,436 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Attention Backends."""
+from contextlib import nullcontext
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import warnings
+from packaging.version import Version as PkgVersion
+
+import torch
+from transformer_engine.pytorch.utils import (
+    get_device_compute_capability,
+)
+from transformer_engine.pytorch.utils import (
+    nvtx_range_push,
+    nvtx_range_pop,
+)
+from transformer_engine.pytorch.quantized_tensor import (
+    prepare_for_saving,
+    restore_from_saved,
+)
+from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor
+from transformer_engine.pytorch.constants import (
+    TE_DType,
+    QKVLayouts,
+    dist_group_type,
+)
+from transformer_engine.pytorch.distributed import get_distributed_world_size
+from transformer_engine.pytorch.jit import no_torch_dynamo
+from transformer_engine.pytorch.attention.inference import InferenceParams
+from transformer_engine.pytorch.cpu_offload import (
+    is_cpu_offload_enabled,
+    start_offload,
+    mark_activation_offload,
+    NVTE_CPU_OFFLOAD_V1,
+)
+from transformer_engine.pytorch.cpu_offload_v1 import is_current_layer_offloaded
+
+# Import attention utils
+import transformer_engine.pytorch.attention.dot_product_attention.utils as dpa_utils
+
+from ...import_utils import have_flag_gems
+
+HAVE_FLAG_GEMS = have_flag_gems()
+
+if HAVE_FLAG_GEMS:
+    import flag_gems
+
+
+class AttnFuncFL(torch.autograd.Function):
+    """FusedAttention forward and backward implementation"""
+
+    @staticmethod
+    def forward(
+        ctx,
+        is_training,
+        max_seqlen_q,
+        max_seqlen_kv,
+        cu_seqlens_q,
+        cu_seqlens_kv,
+        page_table_k,
+        page_table_v,
+        q,
+        k,
+        v,
+        attn_scale,
+        dropout_p,
+        qkv_layout,
+        attn_mask_type,
+        window_size,
+        rng_gen,
+        deterministic,
+        layer_number,
+    ):
+        # pylint: disable=missing-function-docstring
+        # add NVTX range
+        nvtx_label = "transformer_engine.AttnFuncFL.forward"
+        nvtx_range_push(f"{nvtx_label}")
+
+        if is_cpu_offload_enabled():
+            start_offload(q, k, v, offload_base_tensor=True)
+
+
+        # input types are inferred from the real data while output types are controlled by fp8_output
+        # fp8_output should be set upstream as (DPA.fp8 and DPA.fp8_meta["recipe"].fp8_mha)
+        assert isinstance(k, q.__class__) and isinstance(
+            v, q.__class__
+        ), "q, k, v must be of the same class, e.g. torch.Tensor or Float8Tensor."
+
+        # get nominal data type for out
+        # FP16/BF16 attention: torch.float16 or torch.bfloat16
+        # FP8 attention:       torch.float16 or torch.bfloat16
+        out_nominal_dtype = q.dtype
+
+        max_logit = None
+
+        is_causal = attn_mask_type == 'causal'
+        q_permuted = q.permute(1, 2, 0, 3) #[s, b, n_h, h] -> [b, n_h, s, h]
+        k_permuted = k.permute(1, 2, 0, 3)
+        v_permuted = v.permute(1, 2, 0, 3)
+        (out_permuted, m) = flag_gems.scaled_dot_product_attention_forward(
+            q_permuted,
+            k_permuted,
+            v_permuted,
+            attn_mask=None,
+            dropout_p=dropout_p,
+            is_causal=is_causal,
+            scale=attn_scale,
+            enable_gqa=True,
+        )
+        out = out_permuted.permute(2, 0, 1, 3) # [b, n_h, s, h] -> [s, b, n_h, h]
+        aux_ctx_tensors = [out_permuted, m]
+        max_logit = None
+
+        out_ret = out
+        qkvo_tensors = (q_permuted, k_permuted, v_permuted, out_permuted)
+
+        nvtx_range_pop(f"{nvtx_label}")
+
+        # assume fwd and bwd always use the same high precision, i.e. torch.float16 or torch.bfloat16
+        # used when some tensors are base tensors and loose the "dtype" attribute
+        ctx.nominal_dtype = out_nominal_dtype
+
+        if is_cpu_offload_enabled() and NVTE_CPU_OFFLOAD_V1:
+            tensor_list = [q, k, v, out]
+
+            mark_activation_offload(*tensor_list)
+            mark_activation_offload(*aux_ctx_tensors)
+
+        tensors_to_save, tensor_objects = prepare_for_saving(
+            *qkvo_tensors,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            *aux_ctx_tensors,
+        )
+        ctx.save_for_backward(*tensors_to_save)
+        ctx.tensor_objects = tensor_objects
+
+        ctx.layer_number = layer_number
+
+        ctx.max_seqlen_q = max_seqlen_q
+        ctx.max_seqlen_kv = max_seqlen_kv
+        ctx.attn_scale = attn_scale
+        ctx.dropout_p = dropout_p
+        ctx.is_causal = is_causal
+
+        if NVTE_CPU_OFFLOAD_V1:
+            # If interleaved tensor is offloaded, reloaded tensor will be
+            # non-interleaved, so we need to modify the QKV layout
+            # for backward
+            if is_current_layer_offloaded() and is_cpu_offload_enabled():
+                reload_layout = ""
+                split_list = qkv_layout.split("_")
+                for split in split_list:
+                    temp_layout = ""
+                    rep_count = 1
+                    for s in split:
+                        if s.isalpha():
+                            temp_layout = temp_layout + s
+                        else:
+                            rep_count = int(s)
+                    for _ in range(rep_count):
+                        reload_layout = reload_layout + temp_layout + "_"
+                ctx.qkv_layout = reload_layout[:-1]
+            else:
+                ctx.qkv_layout = qkv_layout
+        else:
+            ctx.qkv_layout = qkv_layout
+
+        ctx.attn_mask_type = attn_mask_type
+        ctx.window_size = window_size
+        ctx.deterministic = deterministic
+
+        return out_ret
+
+    @staticmethod
+    def backward(ctx, d_out, *_args):
+        # pylint: disable=missing-function-docstring
+
+        # d_out is expected to be in FP8 if is_output_fp8=True,
+        # but in the case it's not, convert it to FP8 before any operation
+        d_out = d_out.contiguous()
+        (
+            q_permuted,
+            k_permuted,
+            v_permuted,
+            out_permuted,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            *other_tensors,
+        ) = restore_from_saved(ctx.tensor_objects, ctx.saved_tensors)
+
+        aux_ctx_tensors = other_tensors
+
+        if not aux_ctx_tensors[0].is_contiguous():
+            aux_ctx_tensors[0] = aux_ctx_tensors[0].contiguous()
+        if not aux_ctx_tensors[1].is_contiguous():
+            aux_ctx_tensors[1] = aux_ctx_tensors[1].contiguous()
+        out_permuted, m = aux_ctx_tensors
+        rest = [None]
+
+        with torch.cuda.nvtx.range("AttnFuncFL.backward"):
+            # get nominal data type of dq, dk, dv
+            # FP16/BF16 attention: torch.float16 or torch.bfloat16
+            # FP8 attention:       torch.float16 or torch.bfloat16
+            dqkv_nominal_dtype = ctx.nominal_dtype
+
+            dqkv_te_dtype = TE_DType[d_out.dtype]
+
+            q_permuted, k_permuted, v_permuted, m = map(lambda x: x.contiguous() if not x.is_contiguous() else x, (q_permuted, k_permuted, v_permuted, m))
+            d_out_permuted = d_out.permute(1, 2, 0, 3).contiguous() # [s, b, n_h, h] -> [b, n_h, s, h]
+            dq_permuted, dk_permuted, dv_permuted = flag_gems.scaled_dot_product_attention_backward(
+                d_out_permuted,
+                q_permuted,
+                k_permuted,
+                v_permuted,
+                out_permuted,
+                m,
+                attn_mask=None,
+                dropout_p=ctx.dropout_p,
+                is_causal=ctx.is_causal,
+                scale=ctx.attn_scale,
+                enable_gqa=True,
+            )
+            dq = dq_permuted.permute(2, 0, 1, 3)
+            dk = dk_permuted.permute(2, 0, 1, 3)
+            dv = dv_permuted.permute(2, 0, 1, 3)
+            rest = None
+
+        return (
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            dq,
+            dk,
+            dv,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+class FlashAttentionFL(torch.nn.Module):
+    """Dot product attention
+    """
+
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = nullcontext,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.softmax_scale = softmax_scale
+        self.attention_dropout = attention_dropout
+        self.attention_dropout_ctx = attention_dropout_ctx
+        self.attention_type = attention_type
+        self.use_FAv2_bwd = os.getenv(
+            "NVTE_FUSED_ATTN_USE_FAv2_BWD", "0"
+        ) == "1" and get_device_compute_capability() == (9, 0)
+        self.layer_number = 1 if layer_number is None else layer_number
+        self.deterministic = deterministic
+
+        def remove_extra_states_check(self, incompatible_keys):  # pylint: disable=unused-argument
+            """
+            Temporarily remove fused_attention._extra_state as a missing key
+            or an unexpected key when loading Transformer Engine checkpoints.
+            Please store FP8 metadata as DotProductAttention's _extra_state,
+            rather than FusedAttention's _extra_state. This hook will be
+            phased out in Transformer Engine 2.0.
+            """
+            for key in incompatible_keys.missing_keys:
+                if "fused_attention._extra_state" in key:
+                    incompatible_keys.missing_keys.remove(key)
+            for key in incompatible_keys.unexpected_keys:
+                if "fused_attention._extra_state" in key:
+                    incompatible_keys.unexpected_keys.remove(key)
+                    warnings.warn(
+                        "fused_attention._extra_state is not loaded from checkpoint. Please map "
+                        "FusedAttention's _extra_state to DotProductAttention's _extra_state."
+                    )
+
+        self.register_load_state_dict_post_hook(remove_extra_states_check)
+
+    @no_torch_dynamo()
+    def forward(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Union[dist_group_type, List[dist_group_type]]] = None,
+        cp_global_ranks: List[int] = None,
+        cp_stream: torch.cuda.Stream = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers=None,
+        inference_params: Optional[InferenceParams] = None,
+        flash_attention_backend: Optional[PkgVersion] = PkgVersion("0"),
+        fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
+    ) -> torch.Tensor:
+        assert HAVE_FLAG_GEMS, "GEMS is not installed"
+        assert all(
+            x.dtype in [torch.float16, torch.bfloat16] or isinstance(x, Float8Tensor)
+            for x in [query_layer, key_layer, value_layer]
+        ), "FLAttention only supports FP16 and BF16 data types, or Float8Tensors."
+        assert (
+            query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
+        ), "FLAttention only supports CUDA tensors."
+        assert (
+            qkv_layout in QKVLayouts
+        ), f"FLAttention does not support qkv_layout = {qkv_layout}!"
+
+        cp_size = 1
+        if isinstance(cp_group, dist_group_type):
+            cp_size = get_distributed_world_size(cp_group)
+        elif isinstance(cp_group, list):
+            for group in cp_group:
+                cp_size *= get_distributed_world_size(group)
+        context_parallel = cp_size > 1
+        assert not context_parallel, "FLAttention do not support context parallel now"
+
+        # get q_format and kv_format for training and inference
+        qkv_format, q_format, kv_format = dpa_utils.get_qkv_format(qkv_layout, inference_params)
+
+        # cuDNN can work with 0-length sequences in the batch for both bshd/sbhd and thd formats
+        # however, for bshd/sbhd, q/k/v tensors need to have the same batch size as indicated by
+        # cu_seqlens, whereas thd does not have this requirement
+        # e.g. if q_format = bshd, and q.shape = [3, 1, 16, 64], we should have k.shape[0] =
+        # v.shape[0] = q.shape[0], and cu_seqlens_q.shape = cu_seqlens_kv.shape = [4]
+        if q_format in ["bshd", "sbhd"] or kv_format in ["bshd", "sbhd"]:
+            batch_size = query_layer.shape[0] if q_format == "bshd" else query_layer.shape[1]
+            cu_seqlens_q = cu_seqlens_q[: batch_size + 1]
+            cu_seqlens_kv = cu_seqlens_kv[: batch_size + 1]
+
+        page_table = None
+        if inference_params is None:
+            if qkv_format in ["sbhd", "bshd"]:
+                if qkv_format == "sbhd":
+                    batch_size = query_layer.shape[1]
+                    max_seqlen_q = query_layer.shape[0]
+                    max_seqlen_kv = key_layer.shape[0]
+                if qkv_format == "bshd":
+                    batch_size = query_layer.shape[0]
+                    max_seqlen_q = query_layer.shape[1]
+                    max_seqlen_kv = key_layer.shape[1]
+                max_seqlen_q *= cp_size
+                max_seqlen_kv *= cp_size
+                if "padding" in attn_mask_type:
+                    assert (
+                        not context_parallel
+                    ), "Padding mask not supported with context parallelism!"
+                    if cu_seqlens_q is None or cu_seqlens_kv is None:
+                        if attention_mask is None:
+                            raise RuntimeError(
+                                "Please provide attention_mask or cu_seqlens for padding!"
+                            )
+                        if self.attention_type == "self":
+                            cu_seqlens_q = dpa_utils.get_cu_seqlens(attention_mask)
+                            cu_seqlens_kv = cu_seqlens_q
+                        else:
+                            cu_seqlens_q = dpa_utils.get_cu_seqlens(attention_mask[0])
+                            cu_seqlens_kv = dpa_utils.get_cu_seqlens(attention_mask[1])
+                else:
+                    if cu_seqlens_q is None:
+                        cu_seqlens_q = dpa_utils.get_full_cu_seqlens(
+                            batch_size,
+                            max_seqlen_q,
+                            query_layer.device,
+                        )
+                    if cu_seqlens_kv is None:
+                        cu_seqlens_kv = dpa_utils.get_full_cu_seqlens(
+                            batch_size,
+                            max_seqlen_kv,
+                            key_layer.device,
+                        )
+            if qkv_format == "thd":
+                assert (
+                    max_seqlen_q is not None
+                    and max_seqlen_kv is not None
+                    and cu_seqlens_q is not None
+                    and cu_seqlens_kv is not None
+                ), "max_seqlen_q/kv and cu_seqlens_q/kv can not be None when qkv_format is thd!"
+        elif inference_params.is_paged:
+            page_table = inference_params.cache_manager.page_table
+
+        with self.attention_dropout_ctx():
+            _attn_impl = AttnFuncFL
+            output = _attn_impl.apply(
+                self.training,
+                max_seqlen_q,
+                max_seqlen_kv,
+                cu_seqlens_q,
+                cu_seqlens_kv,
+                page_table,
+                page_table,
+                query_layer,
+                key_layer,
+                value_layer,
+                self.softmax_scale,
+                self.attention_dropout if self.training else 0.0,
+                qkv_layout,
+                attn_mask_type,
+                window_size,
+                None,  # rng_gen
+                self.deterministic,
+                self.layer_number,
+            )
+
+        # ...hd -> ...(hd)
+        return output.view(*output.shape[:-2], -1)
diff --git a/transformer_engine/plugins/backend.py b/transformer_engine/plugins/backend.py
new file mode 100644
index 0000000000..812093f86c
--- /dev/null
+++ b/transformer_engine/plugins/backend.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from .register import get_backend, get_selected_backend, register_backend
+from .logger import get_logger
+logger = get_logger()
+
+from .import_utils import have_flag_gems
+
+HAVE_FLAG_GEMS = have_flag_gems()
+
+class BackendDispatch:
+    """
+    Transformer Engine Backend that routes operations to appropriate implementations.
+    
+    Uses caching to avoid repeated flag checks and backend lookups for the same operation.
+    """
+    
+    def __init__(self):
+        """Initialize the backend with an empty implementation cache."""
+        # Cache for operation implementations: {operation: impl}
+        self._impl_cache: Dict[str, Any] = {}
+    
+    def _get_impl(self, operation: str):
+        """
+        Get the implementation for an operation based on flags.
+        Falls back to native if the selected backend doesn't have the operation.
+        Uses caching to avoid repeated lookups.
+        
+        Args:
+            operation: Name of the operation (e.g., "gemm", "rmsnorm_fwd")
+        
+        Returns:
+            The implementation function/class to use
+        
+        Raises:
+            RuntimeError: If native backend doesn't have the operation
+        """
+        # Check cache first
+        if operation in self._impl_cache:
+            return self._impl_cache[operation]
+        
+        # Get selected backend based on global environment variable
+        selected_backend = get_selected_backend()
+        native_backend = get_backend("native")
+        
+        # Try to get implementation from selected backend, fallback to native if not found
+        impl = selected_backend.get(operation)
+        if impl is None:
+            logger.debug(
+                f"Backend '{selected_backend.name}' doesn't have '{operation}', "
+                f"falling back to native"
+            )
+            impl = native_backend.get(operation)
+            if impl is None:
+                raise RuntimeError(
+                    f"Operation '{operation}' is not registered in native backend. "
+                    f"Available operations: {sorted(native_backend._implementations.keys())}"
+                )
+        
+        # Cache the implementation for future use
+        logger.info(f"Backend '{selected_backend.name}' use implementation of '{operation}' for training")
+        self._impl_cache[operation] = impl
+        
+        return impl
+    
+    def clear_cache(self):
+        """Clear the implementation cache. Useful if flags change at runtime."""
+        self._impl_cache.clear()
+        logger.debug("Cleared implementation cache")
+
+    def gemm(self, *args, **kwargs):
+        """GEMM operation with automatic fallback to native."""
+        impl = self._get_impl("gemm")
+        try:
+            return impl(*args, **kwargs)
+        except Exception as e:
+            logger.warning(f"GEMM implementation failed, falling back to native: {e}")
+            native_backend = get_backend("native")
+            return native_backend.get("gemm")(*args, **kwargs)
+    
+    def apply_normalization(self, *args, **kwargs):
+        """Apply normalization with automatic fallback to native."""
+        impl = self._get_impl("apply_normalization")
+        try:
+            return impl(*args, **kwargs)
+        except Exception as e:
+            logger.warning(f"Apply Normalization implementation failed, falling back to native: {e}")
+            native_backend = get_backend("native")
+            return native_backend.get("apply_normalization")(*args, **kwargs)
+    
+    def rmsnorm_fwd(self, *args, **kwargs):
+        """RMSNorm forward pass with automatic fallback to native."""
+        impl = self._get_impl("rmsnorm_fwd")
+        try:
+            return impl(*args, **kwargs)
+        except Exception as e:
+            logger.warning(f"RmsNorm FWD implementation failed, falling back to native: {e}")
+            native_backend = get_backend("native")
+            return native_backend.get("rmsnorm_fwd")(*args, **kwargs)
+    
+    def rmsnorm_bwd(self, *args, **kwargs):
+        """RMSNorm backward pass with automatic fallback to native."""
+        impl = self._get_impl("rmsnorm_bwd")
+        try:
+            return impl(*args, **kwargs)
+        except Exception as e:
+            logger.warning(f"RmsNorm BWD implementation failed, falling back to native: {e}")
+            native_backend = get_backend("native")
+            trimmed_args = args[:-1]  # cut eps
+            return native_backend.get("rmsnorm_bwd")(*trimmed_args, **kwargs)
+    
+    def multi_tensor_adam(self):
+        """Multi-tensor Adam optimizer with automatic fallback to native."""
+        impl = self._get_impl("adam")
+        try:
+            return impl
+        except Exception as e:
+            logger.warning(f"Adam implementation failed, falling back to native: {e}")
+            native_backend = get_backend("native")
+            return native_backend.get("adam")
+    
+    def flash_attention(self, *args, **kwargs):
+        """Flash Attention with automatic fallback to native."""
+        impl = self._get_impl("flash_attention")
+        try:
+            return impl(*args, **kwargs)
+        except Exception as e:
+            logger.warning(f"Flash Attention implementation failed, falling back to native: {e}")
+            native_backend = get_backend("native")
+            return native_backend.get("flash_attention")(*args, **kwargs)
+
+
+# Backend initialization state
+_backends_initialized = False
+_backend_instance = None
+
+def _initialize_backends():
+    """
+    Initialize all backend registrations.
+    This function is called automatically on first use.
+    """
+    global _backends_initialized, _backend_instance
+    
+    if _backends_initialized:
+        return
+    
+    from .backend_native import register_backend_native
+    register_backend_native()
+    if HAVE_FLAG_GEMS:
+        from .backend_fl import register_backend_fl
+        register_backend_fl()
+    
+    _backend_instance = BackendDispatch()
+    _backends_initialized = True
+    
+    logger.info("Backend system initialized successfully")
+
+# Create backend instance on module import
+_initialize_backends()
+backend = _backend_instance
diff --git a/transformer_engine/plugins/backend_fl.py b/transformer_engine/plugins/backend_fl.py
new file mode 100644
index 0000000000..fb73dff8e8
--- /dev/null
+++ b/transformer_engine/plugins/backend_fl.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from .import_utils import safety_import
+from .register import register_backend
+from .logger import get_logger
+logger = get_logger()
+
+
+### GEMM
+general_gemm_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'general_gemm_fl')
+### RMSNORM
+apply_normalization_fl = safety_import('transformer_engine.plugins.module._common', 'apply_normalization_fl')
+rmsnorm_bwd_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'rmsnorm_bwd_fl')
+rmsnorm_fwd_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'rmsnorm_fwd_fl')
+### AdamW
+multi_tensor_adam_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'multi_tensor_adam_fl')
+### Flash-Attn
+# Use lazy=True to avoid circular imports
+FlashAttentionFL = safety_import(
+    'transformer_engine.plugins.attention.dot_product_attention.backends',
+    'FlashAttentionFL',
+    lazy=True
+)
+
+def register_backend_fl():
+    # Register TE-FL backend
+    register_backend("te_fl", {
+        "gemm": general_gemm_fl,
+        "apply_normalization": apply_normalization_fl,
+        "rmsnorm_fwd": rmsnorm_fwd_fl,
+        "rmsnorm_bwd": rmsnorm_bwd_fl,
+        "adam": multi_tensor_adam_fl,
+        "flash_attention": FlashAttentionFL,
+    })
diff --git a/transformer_engine/plugins/backend_native.py b/transformer_engine/plugins/backend_native.py
new file mode 100644
index 0000000000..b9a4f5b13a
--- /dev/null
+++ b/transformer_engine/plugins/backend_native.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from .import_utils import safety_import
+from .register import register_backend
+from .logger import get_logger
+logger = get_logger()
+
+
+### GEMM
+general_gemm_native = safety_import('transformer_engine.pytorch.cpp_extensions', 'general_gemm')
+### RMSNORM
+apply_normalization_native = safety_import('transformer_engine.pytorch.module._common', 'apply_normalization')
+rmsnorm_bwd_native = safety_import('transformer_engine_torch', 'rmsnorm_bwd')
+rmsnorm_fwd_native = safety_import('transformer_engine_torch', 'rmsnorm_fwd')
+### AdamW
+multi_tensor_adam_native = safety_import('transformer_engine_torch', 'multi_tensor_adam')
+### Flash-Attn
+# Use lazy=True to avoid circular imports
+FlashAttentionNative = safety_import(
+    'transformer_engine.pytorch.attention.dot_product_attention.backends',
+    'FlashAttention',
+    lazy=True
+)
+
+# Register native backend
+def register_backend_native():
+    # Note: native_rmsnorm_bwd doesn't take eps as the last argument, so we wrap it
+    def rmsnorm_bwd_native_wrapper(*args, **kwargs):
+        return rmsnorm_bwd_native(*args[:-1], **kwargs)
+    register_backend("native", {
+        "gemm": general_gemm_native,
+        "apply_normalization": apply_normalization_native,
+        "rmsnorm_fwd": rmsnorm_fwd_native,
+        "rmsnorm_bwd": rmsnorm_bwd_native_wrapper,
+        "adam": multi_tensor_adam_native,
+        "flash_attention": FlashAttentionNative,
+    })
diff --git a/transformer_engine/plugins/cpp_extensions/__init__.py b/transformer_engine/plugins/cpp_extensions/__init__.py
new file mode 100644
index 0000000000..286672141c
--- /dev/null
+++ b/transformer_engine/plugins/cpp_extensions/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2022-2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Python interface for c++ extensions"""
+from .gemm import *
+from .rmsnorm import *
+from .fused_adam import *
+from .multi_tensor_apply import *
diff --git a/transformer_engine/plugins/cpp_extensions/fused_adam.py b/transformer_engine/plugins/cpp_extensions/fused_adam.py
new file mode 100644
index 0000000000..d7c9a09baa
--- /dev/null
+++ b/transformer_engine/plugins/cpp_extensions/fused_adam.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from itertools import chain
+from typing import Optional, List, Union
+import warnings
+import os
+
+import torch
+
+def multi_tensor_adam_fl(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    lr: float,
+    beta1: float,
+    beta2: float,
+    eps: float,
+    step: int,
+    mode: int,
+    bias_correction: int,
+    weight_decay: float,
+    inv_scale: Optional[float] = 1.0,
+    out_dtype: Optional[torch.dtype] = None,
+) -> None:
+
+    num_lists = len(tensor_lists)
+    assert num_lists in [4, 5], f"Expected 4 or 5 tensor lists, got {num_lists}"
+
+    num_tensors = len(tensor_lists[0])
+    assert num_tensors > 0, "No tensors provided"
+
+    for i, lst in enumerate(tensor_lists):
+        assert len(lst) == num_tensors, f"List {i} has {len(lst)} tensors, expected {num_tensors}"
+
+    bias_correction1 = 1.0
+    bias_correction2 = 1.0
+    if bias_correction == 1:
+        bias_correction1 = 1 - beta1 ** step
+        bias_correction2 = 1 - beta2 ** step
+
+    is_adamw = (mode == 1)
+
+    for i in range(num_tensors):
+        g = tensor_lists[0][i]  # grad
+        p = tensor_lists[1][i]  # param
+        m = tensor_lists[2][i]  #
+        v = tensor_lists[3][i]  #
+        p_master = tensor_lists[4][i] if num_lists == 5 else None
+
+        if not g.is_contiguous():
+            g = g.contiguous()
+
+        if inv_scale is not None and inv_scale != 1.0:
+            g = g * inv_scale
+
+        m.mul_(beta1).add_(g, alpha=1 - beta1)
+        # v.mul_(beta2).addcmul_(g, g, value=1 - beta2)
+        v.mul_(beta2).add_(g.mul(g).mul_(1 - beta2))
+
+        m_corr = m.clone()
+        v_corr = v.clone()
+        if bias_correction == 1:
+            m_corr = m_corr / bias_correction1
+            v_corr = v_corr / bias_correction2
+
+        update = m_corr / (v_corr.sqrt() + eps)
+
+        if is_adamw:
+            p.data.mul_(1 - lr * weight_decay)
+        else:
+            update.add_(p, alpha=weight_decay)
+
+        p.data.add_(update, alpha=-lr)
+
+        if p_master is not None:
+            p_master.data.copy_(p.data)
+            out_dtype = p_master.dtype if out_dtype is None else out_dtype
+            p.data = p.data.to(out_dtype)
diff --git a/transformer_engine/plugins/cpp_extensions/gemm.py b/transformer_engine/plugins/cpp_extensions/gemm.py
new file mode 100644
index 0000000000..e0310dd902
--- /dev/null
+++ b/transformer_engine/plugins/cpp_extensions/gemm.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Iterable, Optional, Tuple, Union, List
+import os
+import functools
+import torch
+import transformer_engine_torch as tex
+from transformer_engine.pytorch.constants import TE_DType
+
+from transformer_engine.pytorch.quantized_tensor import Quantizer
+
+from ..import_utils import have_flag_gems
+
+HAVE_FLAG_GEMS = have_flag_gems()
+if HAVE_FLAG_GEMS:
+    import flag_gems
+
+__all__ = [
+    "general_gemm_fl",
+]
+
+
+def validate_gemm_scale(scale: Optional[float], required: bool) -> float:
+    """Validate whether a GEMM scaling factor is consistent with its usage"""
+    if required:
+        return scale if scale is not None else 1.0
+    if scale not in (0.0, None):
+        raise ValueError("scale must be zero")
+    return 0.0
+
+
+def general_gemm_fl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    out_dtype: Optional[torch.dtype] = None,
+    quantization_params: Optional[Quantizer] = None,
+    gelu: bool = False,
+    gelu_in: torch.Tensor = None,
+    alpha: float = 1.0,
+    beta: Optional[float] = None,
+    accumulate: bool = False,
+    layout: str = "TN",
+    out: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    use_split_accumulator: bool = False,
+    grad: bool = False,
+    ub: Union[tex.CommOverlap, tex.CommOverlapP2P] = None,
+    ub_type: tex.CommOverlapType = None,
+    extra_output: Optional[torch.Tensor] = None,
+    bulk_overlap: bool = False,
+) -> Iterable[Optional[torch.Tensor]]:
+
+    assert HAVE_FLAG_GEMS, "Triton-Based General Gemm needs FlagGems"
+    assert not gelu and gelu_in is None, "Triton-Based General Gemm do not support gelu now"
+    assert ub is None and ub_type is None, "Triton-Based General Gemm do not support ub comm in kernels"
+    assert quantization_params is None, "Triton-Based General Gemm do not support quantization now"
+    assert bias is None, "Triton-Based General Gemm do not support bias now"
+    assert layout in ("TN", "NN", "NT"), f"GEMM layout {layout} not supported."
+
+    transa = layout[0] == "T"
+    transb = layout[1] == "T"
+
+    alpha = validate_gemm_scale(alpha, True)
+    beta = validate_gemm_scale(beta, accumulate)
+
+    if out is not None:
+        if not out.is_contiguous():
+            raise ValueError("Output tensor is not contiguous.")
+
+    # Use bfloat16 as default bias_dtype
+    bias_dtype = TE_DType[torch.bfloat16 if bias is None else bias.dtype]
+
+    s = -1
+    b = -1
+    orig_A_shape = A.shape
+    orig_B_shape = B.shape
+    shape_a_changed = False
+    shape_b_changed = False
+
+    if A.ndim == 3:
+        A = A.view(-1, A.shape[-1])
+        shape_a_changed = True
+
+    if B.ndim == 3:
+        s, b, _ = B.shape
+        B = B.view(-1, B.shape[-1])
+        shape_b_changed = True
+
+    A_comp = A.T if transa else A
+    B_comp = B.T if transb else B
+
+    out1 = flag_gems.mm(B_comp, A_comp)
+
+    if shape_b_changed:
+        out1 = out1.view(s, b, -1)
+
+    if out_dtype is not None and out1.dtype != out_dtype:
+        out1 = out1.to(out_dtype)
+
+    bias_grad = None
+    gelu_input = None
+    extra_output = None
+    if out is not None:
+        out.add_(out1)
+        return out, bias_grad, gelu_input, extra_output
+    else:
+        return out1, bias_grad, gelu_input, extra_output
diff --git a/transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py b/transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py
new file mode 100644
index 0000000000..6373b999a8
--- /dev/null
+++ b/transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import torch
+from torch.distributed._tensor import DTensor
+
+
+def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *args):
+    """
+    Computes l2 norm for a list of contiguous tensors
+    works as a drop-in replacement for amp_C.multi_tensor_l2norm
+    """
+    l2 = [[(torch.norm(tensor)) for tensor in tensor_list] for tensor_list in tensor_lists]
+    l2_reduced = torch.norm(torch.tensor(l2))
+    l2_cuda = torch.tensor([float(l2_reduced)], dtype=torch.float, device="cuda")
+    return l2_cuda, None
+
+
+def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
+    """Works as a drop-in replacement for amp_C.multi_tensor_scale."""
+    for src, dst in zip(tensor_lists[0], tensor_lists[1]):
+        dst.copy_(src * scale)
diff --git a/transformer_engine/plugins/cpp_extensions/rmsnorm.py b/transformer_engine/plugins/cpp_extensions/rmsnorm.py
new file mode 100644
index 0000000000..af8b3bf096
--- /dev/null
+++ b/transformer_engine/plugins/cpp_extensions/rmsnorm.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from ..import_utils import safety_import, have_flag_gems
+
+### RMSNORM
+HAVE_FLAG_GEMS = have_flag_gems()
+
+if HAVE_FLAG_GEMS:
+    import flag_gems
+
+def rmsnorm_fwd_fl(
+    input,
+    weight,
+    eps,
+    ln_out,
+    quantizer,
+    odtype,
+    sm_margin,
+    zero_centered_gamma,
+):
+    assert HAVE_FLAG_GEMS, "GEMS is not installed"
+    y, rstdevs = flag_gems.rms_norm_forward(
+        input,
+        [input.shape[-1]],
+        weight,
+        eps,
+    )
+    return y, None, rstdevs
+
+
+def rmsnorm_bwd_fl(
+    dy,
+    x,
+    rsigma,
+    gamma,
+    sm_margin,
+    zero_centered_gamma,
+    eps,
+):
+    assert HAVE_FLAG_GEMS, "GEMS is not installed"
+    dx, dw = flag_gems.rms_norm_backward(
+        dy,
+        x,
+        rsigma,
+        [x.shape[-1]],
+        gamma,
+        eps,
+    )
+    return dx, dw
diff --git a/transformer_engine/plugins/import_utils.py b/transformer_engine/plugins/import_utils.py
new file mode 100644
index 0000000000..76a8dd8846
--- /dev/null
+++ b/transformer_engine/plugins/import_utils.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import importlib
+from typing import Any, Optional
+
+from .logger import get_logger
+
+logger = get_logger()
+
+# Safety import cache to avoid circular imports and improve performance
+_import_cache: dict[str, Any] = {}
+
+# Cache for HAVE_FLAG_GEMS check to avoid repeated imports
+_HAVE_FLAG_GEMS_CACHE: Optional[bool] = None
+
+
+class _LazyImport:
+    """Lazy import proxy that defers actual import until first use."""
+    
+    def __init__(self, module_path: str, name: Optional[str] = None):
+        self._module_path = module_path
+        self._name = name
+        self._cache_key = f"{module_path}.{name}" if name else module_path
+        self._imported = None
+    
+    def _import(self):
+        """Perform the actual import."""
+        if self._imported is None:
+            if self._cache_key in _import_cache:
+                self._imported = _import_cache[self._cache_key]
+            else:
+                module = importlib.import_module(self._module_path)
+                if self._name:
+                    self._imported = getattr(module, self._name)
+                else:
+                    self._imported = module
+                _import_cache[self._cache_key] = self._imported
+        return self._imported
+    
+    def __getattr__(self, name: str) -> Any:
+        """Delegate attribute access to the imported object."""
+        return getattr(self._import(), name)
+    
+    def __call__(self, *args, **kwargs) -> Any:
+        """Allow calling if the imported object is callable."""
+        return self._import()(*args, **kwargs)
+    
+    def __repr__(self) -> str:
+        """String representation."""
+        if self._imported is None:
+            return f"<LazyImport: {self._cache_key} (not loaded)>"
+        return repr(self._imported)
+
+
+def safety_import(module_path: str, name: Optional[str] = None, lazy: bool = False) -> Any:
+    """
+    Safely import a module or attribute with lazy loading and caching.
+    
+    This function helps avoid circular imports by deferring imports until
+    they are actually needed, and caches the result for performance.
+    
+    Args:
+        module_path: Full module path
+        name: Optional attribute name to import from the module (e.g., 'FLAttention')
+              If None, returns the module itself.
+        lazy: If True, returns a lazy proxy that defers import until first use.
+              If False (default), imports immediately but caches the result.
+              Use lazy=True when there's a risk of circular imports.
+    
+    Returns:
+        The imported module or attribute (or a lazy proxy if lazy=True).
+    """
+    cache_key = f"{module_path}.{name}" if name else module_path
+    
+    if lazy:
+        # Return lazy proxy that defers import
+        return _LazyImport(module_path, name)
+    
+    # Immediate import with caching
+    if cache_key not in _import_cache:
+        module = importlib.import_module(module_path)
+        if name:
+            _import_cache[cache_key] = getattr(module, name)
+        else:
+            _import_cache[cache_key] = module
+    
+    return _import_cache[cache_key]
+
+
+def have_flag_gems() -> bool:
+    """
+    Check if flag_gems is installed and available.
+    
+    This function caches the result to avoid repeated import attempts.
+    On first check, logs whether flag_gems is available.
+    
+    Returns:
+        True if flag_gems is available, False otherwise.
+    """
+    global _HAVE_FLAG_GEMS_CACHE
+    
+    if _HAVE_FLAG_GEMS_CACHE is None:
+        try:
+            import flag_gems
+            _HAVE_FLAG_GEMS_CACHE = True
+            logger.info("flag_gems is available. FL backend implementations can be used.")
+        except ImportError:
+            _HAVE_FLAG_GEMS_CACHE = False
+            logger.info("flag_gems is not installed. Only native backend implementations will be used.")
+    
+    return _HAVE_FLAG_GEMS_CACHE
diff --git a/transformer_engine/plugins/logger.py b/transformer_engine/plugins/logger.py
new file mode 100644
index 0000000000..83a577024f
--- /dev/null
+++ b/transformer_engine/plugins/logger.py
@@ -0,0 +1,49 @@
+import logging
+import sys
+import os
+
+
+class Logger:
+    def __init__(self, name, level=logging.INFO):
+        self.logger = logging.getLogger(name)
+        self.logger.setLevel(level)
+        self.logger.propagate = False
+
+        # Clear existing handlers
+        for handler in self.logger.handlers[:]:
+            self.logger.removeHandler(handler)
+
+        formatter = logging.Formatter(
+            "[%(asctime)s %(name)s %(filename)s:%(lineno)d %(levelname)s] %(message)s"
+        )
+
+        stream_handler = logging.StreamHandler(sys.stdout)
+        stream_handler.setFormatter(formatter)
+
+        self.logger.addHandler(stream_handler)
+
+    def info(self, message):
+        self.logger.info(message)
+
+    def warning(self, message):
+        self.logger.warning(message)
+
+    def error(self, message):
+        self.logger.error(message)
+
+    def critical(self, message):
+        self.logger.critical(message)
+
+    def debug(self, message):
+        self.logger.debug(message)
+
+
+GLOBAL_LOGGER = None
+
+
+def get_logger():
+    global GLOBAL_LOGGER
+    if GLOBAL_LOGGER is None:
+        level = os.getenv("TEFL_LOG_LEVEL", "INFO").upper()
+        GLOBAL_LOGGER = Logger("TE-FL", level)
+    return GLOBAL_LOGGER
diff --git a/transformer_engine/plugins/module/_common.py b/transformer_engine/plugins/module/_common.py
new file mode 100644
index 0000000000..9c1a70c796
--- /dev/null
+++ b/transformer_engine/plugins/module/_common.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from ..import_utils import safety_import
+
+### RMSNORM
+rmsnorm_fwd_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'rmsnorm_fwd_fl')
+
+def apply_normalization_fl(
+    inputmat: torch.Tensor,
+    ln_out: torch.Tensor,
+    ln_weight: torch.Tensor,
+    ln_bias: Union[torch.Tensor, None],
+    eps: float,
+    output_quantizer,
+    output_dtype,
+    normalization: str,
+    fwd_ln_sm_margin: int,
+    zero_centered_gamma: bool,
+):
+    normalization_func = rmsnorm_fwd_fl
+    return normalization_func(
+        inputmat,
+        ln_weight,
+        eps,
+        ln_out,
+        output_quantizer,
+        output_dtype,
+        fwd_ln_sm_margin,
+        zero_centered_gamma,
+    )
diff --git a/transformer_engine/plugins/register.py b/transformer_engine/plugins/register.py
new file mode 100644
index 0000000000..b92e8617ee
--- /dev/null
+++ b/transformer_engine/plugins/register.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Backend registry for managing multiple backend implementations."""
+import os
+from typing import Any, Dict, Optional
+
+from .logger import get_logger
+logger = get_logger()
+
+
+class Backend:
+    """
+    A backend that can register and provide implementations for various operations.
+    
+    Each backend can register its own implementations for operations like gemm,
+    rmsnorm_fwd, etc. If an operation is not registered, it will fallback to
+    the native backend.
+    
+    Usage:
+        backend = Backend("my_backend")
+        backend.register("gemm", my_gemm_function)
+        backend.register("rmsnorm_fwd", my_rmsnorm_fwd)
+        
+        # Use the backend
+        result = backend.gemm(...)
+    """
+    
+    def __init__(self, name: str):
+        """
+        Initialize a backend.
+        
+        Args:
+            name: Name of the backend (e.g., "native", "te_fl", "custom")
+        """
+        self.name = name
+        self._implementations: Dict[str, Any] = {}
+    
+    def register(self, operation: str, implementation: Any) -> None:
+        """
+        Register an implementation for an operation.
+        
+        Args:
+            operation: Name of the operation (e.g., "gemm", "rmsnorm_fwd")
+            implementation: Function or class to register
+        """
+        self._implementations[operation] = implementation
+        logger.info(f"Backend '{self.name}' registered implementation for '{operation}'")
+    
+    def has(self, operation: str) -> bool:
+        """Check if this backend has an implementation for the operation."""
+        return operation in self._implementations
+    
+    def get(self, operation: str, default: Optional[Any] = None) -> Optional[Any]:
+        """Get the implementation for an operation, or return default if not found."""
+        return self._implementations.get(operation, default)
+    
+    def __getattr__(self, operation: str) -> Any:
+        """
+        Allow accessing operations as attributes (e.g., backend.gemm).
+        Returns the registered implementation if available.
+        """
+        if operation.startswith("_") or operation in ("name", "register", "has", "get"):
+            return super().__getattribute__(operation)
+        
+        if operation in self._implementations:
+            return self._implementations[operation]
+        
+        raise AttributeError(
+            f"Backend '{self.name}' does not have implementation for '{operation}'. "
+            f"Available operations: {list(self._implementations.keys())}"
+        )
+
+
+def get_selected_backend() -> Backend:
+    """
+    Get the selected backend instance based on global environment variable.
+    No longer depends on operation-specific flags.
+    
+    Returns:
+        Backend instance to use
+    """
+    global_flag = os.environ.get("USE_TRANSFORMER_ENGINE_FL", "0")
+    if global_flag.lower() in ("1", "true", "yes", "on"):
+        backend_name = "te_fl"
+    else:
+        backend_name = "native"
+    return get_backend(backend_name)
+
+
+# Global backends registry
+_backends: Dict[str, Backend] = {}
+
+
+def get_backend(name: str) -> Backend:
+    """
+    Get a backend by name. Creates it if it doesn't exist.
+    
+    Args:
+        name: Name of the backend
+    
+    Returns:
+        Backend instance
+    """
+    if name not in _backends:
+        _backends[name] = Backend(name)
+    return _backends[name]
+
+
+def register_backend(backend_name: str, implementations: Dict[str, Any]):
+    """
+    Register backend implementations.
+    
+    Args:
+        backend_name: Name of the backend (e.g., "native", "te_fl", "custom")
+        implementations: Dictionary mapping operation names to their implementations.
+                        Example: {"gemm": native_gemm, "flash_attention": native_flash_attn}
+    
+    Usage:
+        # Register native backend
+        register_backend("native", {
+            "gemm": gemm_native,
+            "rmsnorm_fwd": rmsnorm_fwd_native,
+            "flash_attention": flash_attn_native,
+        })
+        
+        # Register TE-FL backend
+        register_backend("te_fl", {
+            "gemm": gemm_fl,
+            "rmsnorm_fwd": rmsnorm_fwd_fl,
+            "flash_attention": flash_attn_fl,
+        })
+        
+        # Register custom backend
+        register_backend("custom", {
+            "gemm": custom_gemm,
+            "custom_op": custom_function,
+        })
+    """
+    backend = get_backend(backend_name)
+    
+    for operation, implementation in implementations.items():
+        backend.register(operation, implementation)
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index 0d1c0b0c05..70cba8444b 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -61,6 +61,7 @@
     FlashAttention,
 )
 
+from transformer_engine.plugins.backend import backend
 
 # Setup Attention Logging
 attn_log.setup_logging()
@@ -422,7 +423,7 @@ def __init__(
             "attention_dropout_ctx": attention_dropout_ctx,
         }
 
-        self.flash_attention = FlashAttention(
+        self.flash_attention = backend.flash_attention(
             softmax_scale,
             attention_type=attention_type,
             layer_number=layer_number,
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
index 05f2e9cde4..c660f422ad 100644
--- a/transformer_engine/pytorch/module/layernorm_linear.py
+++ b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -75,6 +75,8 @@
     general_gemm,
 )
 
+from transformer_engine.plugins.backend import backend
+
 __all__ = ["LayerNormLinear"]
 
 
@@ -205,7 +207,7 @@ def forward(
 
         # Apply normalization
         nvtx_range_push(f"{nvtx_label}.norm")
-        ln_out, mu, rsigma = apply_normalization(
+        ln_out, mu, rsigma = backend.apply_normalization(
             inputmat,
             None,  # ln_out
             ln_weight,
@@ -341,7 +343,7 @@ def forward(
         # Note: y = x * w^T
         # ------------------------------------------------------
         nvtx_range_push(f"{nvtx_label}.gemm")
-        gemm_out, *_, reduce_scatter_out = general_gemm(
+        gemm_out, *_, reduce_scatter_out = backend.gemm(
             weightmat,
             ln_out_total,
             get_workspace(),
@@ -507,6 +509,7 @@ def forward(
                     FP8GlobalStateManager.IS_FIRST_FP8_MODULE = _first_fp8_module
             ctx.wgrad_store = wgrad_store
             ctx.debug = debug
+            ctx.eps = eps
 
         # ------------------------------------------------------
         # Cached state for backward pass is ready...
@@ -714,7 +717,7 @@ def backward(
             # dgrad GEMM
             # Note: dx = dy * w
             nvtx_range_push(f"{nvtx_label}.dgrad_gemm")
-            gemm_out, *_, reduce_scatter_out = general_gemm(
+            gemm_out, *_, reduce_scatter_out = backend.gemm(
                 weight,
                 grad_output,
                 get_workspace(),
@@ -878,7 +881,7 @@ def wgrad_gemm(
 
                     """
                     nvtx_range_push(f"{nvtx_label}.wgrad_gemm")
-                    dw, db, *_ = general_gemm(x, dy, **wgrad_gemm_kwargs)
+                    dw, db, *_ = backend.gemm(x, dy, **wgrad_gemm_kwargs)
                     nvtx_range_pop(f"{nvtx_label}.wgrad_gemm")
                     return dw, db
 
@@ -963,13 +966,14 @@ def wgrad_gemm(
                 )
                 dgrad = dgrad.reshape(inputmat.size())
             elif ctx.normalization == "RMSNorm":
-                dgrad, dgamma = tex.rmsnorm_bwd(
+                dgrad, dgamma = backend.rmsnorm_bwd(
                     dgrad,
                     inputmat,
                     rsigma,
                     ln_weight,
                     ctx.bwd_ln_sm_margin,
                     ctx.zero_centered_gamma,
+                    ctx.eps,
                 )
                 dgrad = dgrad.reshape(inputmat.size())
                 dbeta = None
diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index 3069c21d9f..0b715c7a72 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -71,6 +71,8 @@
 from ..cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...debug.pytorch.debug_state import TEDebugState
 
+from transformer_engine.plugins.backend import backend
+
 __all__ = ["Linear"]
 
 
@@ -306,7 +308,7 @@ def forward(
         # Note: y = x * w^T
         # ------------------------------------------------------
         nvtx_range_push(f"{nvtx_label}.gemm")
-        gemm_out, *_, reduce_scatter_out = general_gemm(
+        gemm_out, *_, reduce_scatter_out = backend.gemm(
             weightmat,
             inputmat_total,
             get_workspace(),
@@ -709,7 +711,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
                 # Note: dx = dy * w
 
                 nvtx_range_push(f"{nvtx_label}.dgrad_gemm")
-                gemm_out, *_, reduce_scatter_out = general_gemm(
+                gemm_out, *_, reduce_scatter_out = backend.gemm(
                     weight_fp8,
                     grad_output,
                     get_workspace(),
@@ -872,7 +874,7 @@ def wgrad_gemm(
 
                     """
                     nvtx_range_push(f"{nvtx_label}.wgrad_gemm")
-                    dw, db, *_ = general_gemm(x, dy, **wgrad_gemm_kwargs)
+                    dw, db, *_ = backend.gemm(x, dy, **wgrad_gemm_kwargs)
                     nvtx_range_pop(f"{nvtx_label}.wgrad_gemm")
                     return dw, db
 
diff --git a/transformer_engine/pytorch/ops/basic/rmsnorm.py b/transformer_engine/pytorch/ops/basic/rmsnorm.py
index 8c3f029747..5054b5ea8c 100644
--- a/transformer_engine/pytorch/ops/basic/rmsnorm.py
+++ b/transformer_engine/pytorch/ops/basic/rmsnorm.py
@@ -26,6 +26,8 @@
 from ..op import BasicOperation, OperationContext
 from .._common import maybe_autocast_dtype, maybe_dequantize
 
+from transformer_engine.plugins.backend import backend
+
 
 class RMSNorm(BasicOperation):
     r"""Root Mean Square Layer Normalization
@@ -184,7 +186,7 @@ def op_forward(
 
         # Compute RMSNorm
         sm_margin = self._sm_margins["forward" if ctx.requires_grad else "inference"]
-        y, _, rstdevs = rmsnorm_fwd(
+        y, _, rstdevs = backend.rmsnorm_fwd(
             x,
             w,
             self.eps,
@@ -224,14 +226,14 @@ def op_backward(
         dy = maybe_dequantize(grad_output.contiguous(), dtype).view(x.size())
         w = maybe_dequantize(self.weight, dtype).view((inner_dim,))
 
-        # Compute RMSNorm backward pass
-        dx, dw = rmsnorm_bwd(
+        dx, dw = backend.rmsnorm_bwd(
             dy,
             x,
             rstdevs,
             w,
             self._sm_margins["backward"],
             self.zero_centered_gamma,
+            self.eps,
         )
 
         # Clear saved tensors if possible
diff --git a/transformer_engine/pytorch/optimizers/__init__.py b/transformer_engine/pytorch/optimizers/__init__.py
index c76f75743d..6d44a8a6e5 100644
--- a/transformer_engine/pytorch/optimizers/__init__.py
+++ b/transformer_engine/pytorch/optimizers/__init__.py
@@ -4,8 +4,6 @@
 
 """Fused optimizers and multi-tensor kernels."""
 from transformer_engine_torch import (
-    multi_tensor_scale,
-    multi_tensor_l2norm,
     multi_tensor_unscale_l2norm,
     multi_tensor_adam,
     multi_tensor_adam_fp8,
@@ -16,3 +14,6 @@
 from .fused_adam import FusedAdam
 from .fused_sgd import FusedSGD
 from .multi_tensor_apply import MultiTensorApply, multi_tensor_applier
+
+from transformer_engine.plugins.cpp_extensions import multi_tensor_l2_norm_fl as multi_tensor_l2norm
+from transformer_engine.plugins.cpp_extensions import multi_tensor_scale_fl as multi_tensor_scale
diff --git a/transformer_engine/pytorch/optimizers/fused_adam.py b/transformer_engine/pytorch/optimizers/fused_adam.py
index 18f7e2031a..10fd480476 100644
--- a/transformer_engine/pytorch/optimizers/fused_adam.py
+++ b/transformer_engine/pytorch/optimizers/fused_adam.py
@@ -15,6 +15,7 @@
 from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer
 from .multi_tensor_apply import multi_tensor_applier
 
+from transformer_engine.plugins.backend import backend
 
 def get_fp8_meta(fp8_tensor):
     """FP8 metadata getter."""
@@ -711,7 +712,7 @@ def apply_multi_tensor_adam(adam_func, tensor_lists, inv_scale=None, out_dtype=N
                             self.multi_tensor_adam_param_remainder, tensor_lists
                         )
                     else:
-                        apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
+                        apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
                 if len(p_fp8_model) > 0:
                     tensor_lists = [
                         g_of_fp8_model,
@@ -731,14 +732,14 @@ def apply_multi_tensor_adam(adam_func, tensor_lists, inv_scale=None, out_dtype=N
                         m_of_f32_model,
                         v_of_f32_model,
                     ]
-                    apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
+                    apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
             else:  # self.master_weights=False and self.capturable=False
                 if len(p_f16_model) > 0:
                     tensor_lists = [g_of_f16_model, p_f16_model, m_of_f16_model, v_of_f16_model]
-                    apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
+                    apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
                 if len(p_f32_model) > 0:
                     tensor_lists = [g_of_f32_model, p_f32_model, m_of_f32_model, v_of_f32_model]
-                    apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
+                    apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
 
             # Scaling
             for name in ["exp_avg", "exp_avg_sq", "master_param"]:

From e13e38a2e6e2f5ef10048002d2f38a8d0f116c74 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Thu, 11 Dec 2025 20:59:19 +0800
Subject: [PATCH 14/59] Fix import bugs (#6)

# Description

Fix import bugs

Fixes # (issue)

## Type of change

- [ ] Documentation change (change only to the documentation, either a
fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Infra/Build change
- [ ] Code refactoring

## Changes

Please list the changes introduced in this PR:

- Change A
- Change B

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes
---
 .../dot_product_attention/backends.py         | 53 ++++---------------
 .../plugins/cpp_extensions/gemm.py            |  3 +-
 2 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/transformer_engine/plugins/attention/dot_product_attention/backends.py b/transformer_engine/plugins/attention/dot_product_attention/backends.py
index 3c9ca43a1e..2da6dee026 100644
--- a/transformer_engine/plugins/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugins/attention/dot_product_attention/backends.py
@@ -13,11 +13,9 @@
 from transformer_engine.pytorch.utils import (
     get_device_compute_capability,
 )
-from transformer_engine.pytorch.utils import (
-    nvtx_range_push,
-    nvtx_range_pop,
-)
-from transformer_engine.pytorch.quantized_tensor import (
+from transformer_engine.pytorch.utils import nvtx_range_push, nvtx_range_pop
+
+from transformer_engine.pytorch.tensor.quantized_tensor import (
     prepare_for_saving,
     restore_from_saved,
 )
@@ -27,16 +25,10 @@
     QKVLayouts,
     dist_group_type,
 )
+
 from transformer_engine.pytorch.distributed import get_distributed_world_size
 from transformer_engine.pytorch.jit import no_torch_dynamo
 from transformer_engine.pytorch.attention.inference import InferenceParams
-from transformer_engine.pytorch.cpu_offload import (
-    is_cpu_offload_enabled,
-    start_offload,
-    mark_activation_offload,
-    NVTE_CPU_OFFLOAD_V1,
-)
-from transformer_engine.pytorch.cpu_offload_v1 import is_current_layer_offloaded
 
 # Import attention utils
 import transformer_engine.pytorch.attention.dot_product_attention.utils as dpa_utils
@@ -79,10 +71,6 @@ def forward(
         nvtx_label = "transformer_engine.AttnFuncFL.forward"
         nvtx_range_push(f"{nvtx_label}")
 
-        if is_cpu_offload_enabled():
-            start_offload(q, k, v, offload_base_tensor=True)
-
-
         # input types are inferred from the real data while output types are controlled by fp8_output
         # fp8_output should be set upstream as (DPA.fp8 and DPA.fp8_meta["recipe"].fp8_mha)
         assert isinstance(k, q.__class__) and isinstance(
@@ -112,8 +100,6 @@ def forward(
         )
         out = out_permuted.permute(2, 0, 1, 3) # [b, n_h, s, h] -> [s, b, n_h, h]
         aux_ctx_tensors = [out_permuted, m]
-        max_logit = None
-
         out_ret = out
         qkvo_tensors = (q_permuted, k_permuted, v_permuted, out_permuted)
 
@@ -123,7 +109,12 @@ def forward(
         # used when some tensors are base tensors and loose the "dtype" attribute
         ctx.nominal_dtype = out_nominal_dtype
 
-        if is_cpu_offload_enabled() and NVTE_CPU_OFFLOAD_V1:
+        from transformer_engine.pytorch.cpu_offload import (
+            CPUOffloadEnabled,
+            mark_activation_offload,
+        )
+
+        if CPUOffloadEnabled:
             tensor_list = [q, k, v, out]
 
             mark_activation_offload(*tensor_list)
@@ -146,29 +137,7 @@ def forward(
         ctx.dropout_p = dropout_p
         ctx.is_causal = is_causal
 
-        if NVTE_CPU_OFFLOAD_V1:
-            # If interleaved tensor is offloaded, reloaded tensor will be
-            # non-interleaved, so we need to modify the QKV layout
-            # for backward
-            if is_current_layer_offloaded() and is_cpu_offload_enabled():
-                reload_layout = ""
-                split_list = qkv_layout.split("_")
-                for split in split_list:
-                    temp_layout = ""
-                    rep_count = 1
-                    for s in split:
-                        if s.isalpha():
-                            temp_layout = temp_layout + s
-                        else:
-                            rep_count = int(s)
-                    for _ in range(rep_count):
-                        reload_layout = reload_layout + temp_layout + "_"
-                ctx.qkv_layout = reload_layout[:-1]
-            else:
-                ctx.qkv_layout = qkv_layout
-        else:
-            ctx.qkv_layout = qkv_layout
-
+        ctx.qkv_layout = qkv_layout
         ctx.attn_mask_type = attn_mask_type
         ctx.window_size = window_size
         ctx.deterministic = deterministic
diff --git a/transformer_engine/plugins/cpp_extensions/gemm.py b/transformer_engine/plugins/cpp_extensions/gemm.py
index e0310dd902..50f150e3db 100644
--- a/transformer_engine/plugins/cpp_extensions/gemm.py
+++ b/transformer_engine/plugins/cpp_extensions/gemm.py
@@ -9,7 +9,7 @@
 import transformer_engine_torch as tex
 from transformer_engine.pytorch.constants import TE_DType
 
-from transformer_engine.pytorch.quantized_tensor import Quantizer
+from transformer_engine.pytorch.tensor.quantized_tensor import Quantizer
 
 from ..import_utils import have_flag_gems
 
@@ -34,6 +34,7 @@ def validate_gemm_scale(scale: Optional[float], required: bool) -> float:
 def general_gemm_fl(
     A: torch.Tensor,
     B: torch.Tensor,
+    workspace: torch.Tensor,
     out_dtype: Optional[torch.dtype] = None,
     quantization_params: Optional[Quantizer] = None,
     gelu: bool = False,

From ef41367b28c7bf6c207ab8667c2dcb9f0acf9011 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Wed, 17 Dec 2025 15:27:32 +0800
Subject: [PATCH 15/59] Fix flash-attention fallback failures (#7)

# Description

Please include a brief summary of the changes, relevant motivation and
context.

Fixes # (issue)

## Type of change

- [ ] Documentation change (change only to the documentation, either a
fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Infra/Build change
- [ ] Code refactoring

## Changes

Please list the changes introduced in this PR:

- Change A
- Change B

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes
---
 .../dot_product_attention/backends.py         |  7 ++--
 transformer_engine/plugins/backend.py         | 34 ++++++++++++++++---
 .../plugins/cpp_extensions/gemm.py            |  3 ++
 transformer_engine/plugins/module/_common.py  |  2 ++
 .../dot_product_attention.py                  |  5 +--
 5 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/transformer_engine/plugins/attention/dot_product_attention/backends.py b/transformer_engine/plugins/attention/dot_product_attention/backends.py
index 2da6dee026..8c7ae47864 100644
--- a/transformer_engine/plugins/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugins/attention/dot_product_attention/backends.py
@@ -291,9 +291,12 @@ def forward(
         inference_params: Optional[InferenceParams] = None,
         flash_attention_backend: Optional[PkgVersion] = PkgVersion("0"),
         fp8_output: bool = False,
-        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
-        assert HAVE_FLAG_GEMS, "GEMS is not installed"
+        assert HAVE_FLAG_GEMS, "FlagGems is not installed"
+        assert window_size == (-1, 0), "Triton-Based FlashAttention do not support sliding windows now"
+        assert not fp8, "Triton-Based FlashAttention do not support fp8 now"
+        assert attn_mask_type == "causal", "Triton-Based FlashAttention do not support padding mask now"
+
         assert all(
             x.dtype in [torch.float16, torch.bfloat16] or isinstance(x, Float8Tensor)
             for x in [query_layer, key_layer, value_layer]
diff --git a/transformer_engine/plugins/backend.py b/transformer_engine/plugins/backend.py
index 812093f86c..6a3e9a589a 100644
--- a/transformer_engine/plugins/backend.py
+++ b/transformer_engine/plugins/backend.py
@@ -69,6 +69,20 @@ def _get_impl(self, operation: str):
         
         return impl
     
+    def _reset_cache_to_native(self, operation: str):
+        # Check cache first
+        if operation in self._impl_cache:
+            # Get native backend
+            native_backend = get_backend("native")
+            impl = native_backend.get(operation)
+            if impl is None:
+                raise RuntimeError(
+                    f"Operation '{operation}' is not registered in native backend. "
+                    f"Available operations: {sorted(native_backend._implementations.keys())}"
+                )
+            # Cache the implementation for future use
+            self._impl_cache[operation] = impl
+
     def clear_cache(self):
         """Clear the implementation cache. Useful if flags change at runtime."""
         self._impl_cache.clear()
@@ -81,6 +95,7 @@ def gemm(self, *args, **kwargs):
             return impl(*args, **kwargs)
         except Exception as e:
             logger.warning(f"GEMM implementation failed, falling back to native: {e}")
+            self._reset_cache_to_native("gemm")
             native_backend = get_backend("native")
             return native_backend.get("gemm")(*args, **kwargs)
     
@@ -91,6 +106,7 @@ def apply_normalization(self, *args, **kwargs):
             return impl(*args, **kwargs)
         except Exception as e:
             logger.warning(f"Apply Normalization implementation failed, falling back to native: {e}")
+            self._reset_cache_to_native("apply_normalization")
             native_backend = get_backend("native")
             return native_backend.get("apply_normalization")(*args, **kwargs)
     
@@ -101,6 +117,7 @@ def rmsnorm_fwd(self, *args, **kwargs):
             return impl(*args, **kwargs)
         except Exception as e:
             logger.warning(f"RmsNorm FWD implementation failed, falling back to native: {e}")
+            self._reset_cache_to_native("rmsnorm_fwd")
             native_backend = get_backend("native")
             return native_backend.get("rmsnorm_fwd")(*args, **kwargs)
     
@@ -111,6 +128,7 @@ def rmsnorm_bwd(self, *args, **kwargs):
             return impl(*args, **kwargs)
         except Exception as e:
             logger.warning(f"RmsNorm BWD implementation failed, falling back to native: {e}")
+            self._reset_cache_to_native("rmsnorm_bwd")
             native_backend = get_backend("native")
             trimmed_args = args[:-1]  # cut eps
             return native_backend.get("rmsnorm_bwd")(*trimmed_args, **kwargs)
@@ -122,18 +140,24 @@ def multi_tensor_adam(self):
             return impl
         except Exception as e:
             logger.warning(f"Adam implementation failed, falling back to native: {e}")
+            self._reset_cache_to_native("adam")
             native_backend = get_backend("native")
             return native_backend.get("adam")
     
     def flash_attention(self, *args, **kwargs):
         """Flash Attention with automatic fallback to native."""
-        impl = self._get_impl("flash_attention")
+        flash_attention_instance = args[0]
+        trimmed_args = args[1:]
+        native_impl = get_backend("native").get("flash_attention")
         try:
-            return impl(*args, **kwargs)
+            selected_impl = self._get_impl("flash_attention")
+            flash_attention_instance.forward = selected_impl.forward.__get__(flash_attention_instance, native_impl)
+            return flash_attention_instance(*trimmed_args, **kwargs)
         except Exception as e:
-            logger.warning(f"Flash Attention implementation failed, falling back to native: {e}")
-            native_backend = get_backend("native")
-            return native_backend.get("flash_attention")(*args, **kwargs)
+            logger.warning(f"Flash Attention Forward implementation failed, falling back to native: {e}")
+            self._reset_cache_to_native("flash_attention")
+            flash_attention_instance.forward = native_impl.forward.__get__(flash_attention_instance, native_impl)
+            return flash_attention_instance(*trimmed_args, **kwargs)
 
 
 # Backend initialization state
diff --git a/transformer_engine/plugins/cpp_extensions/gemm.py b/transformer_engine/plugins/cpp_extensions/gemm.py
index 50f150e3db..bceff8bc63 100644
--- a/transformer_engine/plugins/cpp_extensions/gemm.py
+++ b/transformer_engine/plugins/cpp_extensions/gemm.py
@@ -59,6 +59,9 @@ def general_gemm_fl(
     assert quantization_params is None, "Triton-Based General Gemm do not support quantization now"
     assert bias is None, "Triton-Based General Gemm do not support bias now"
     assert layout in ("TN", "NN", "NT"), f"GEMM layout {layout} not supported."
+    assert alpha == 1.0 and beta is None, "Triton-Based General Gemm do not support scaling with alpha and beta"
+    if accumulate:
+        assert out is not None, "When accumulate is True, 'out' must be provided"
 
     transa = layout[0] == "T"
     transb = layout[1] == "T"
diff --git a/transformer_engine/plugins/module/_common.py b/transformer_engine/plugins/module/_common.py
index 9c1a70c796..ac2cbfdf9b 100644
--- a/transformer_engine/plugins/module/_common.py
+++ b/transformer_engine/plugins/module/_common.py
@@ -23,6 +23,8 @@ def apply_normalization_fl(
     fwd_ln_sm_margin: int,
     zero_centered_gamma: bool,
 ):
+    assert normalization == "RMSNorm", "Triton-based LayerNorm is not supported in TE-FL"
+    assert ln_bias is None, "Triton-Based RMSNorm do not support bias"
     normalization_func = rmsnorm_fwd_fl
     return normalization_func(
         inputmat,
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index 70cba8444b..2d3fea8754 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -423,7 +423,7 @@ def __init__(
             "attention_dropout_ctx": attention_dropout_ctx,
         }
 
-        self.flash_attention = backend.flash_attention(
+        self.flash_attention = FlashAttention(
             softmax_scale,
             attention_type=attention_type,
             layer_number=layer_number,
@@ -1390,7 +1390,8 @@ def forward(
                         max_seqlen_kv,
                         alibi_slopes=alibi_slopes,
                     )
-                return self.flash_attention(
+                return backend.flash_attention(
+                    self.flash_attention,
                     query_layer,
                     key_layer,
                     value_layer,

From fd5f657a04d2f9239fbfbe7fe491e65d701c30a8 Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Mon, 29 Dec 2025 17:45:01 +0800
Subject: [PATCH 16/59] Multi-Backend Architecture Implementation for
 TransformerEngine-FL (#4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# TransformerEngine-FL Plugin System

## Overview

This PR implements a comprehensive multi-backend plugin system for
TransformerEngine-FL, enabling support for multiple hardware vendors
(NVIDIA, AMD, Hygon, etc.) while maintaining full API compatibility with
the original `transformer_engine_torch`.

**Core Philosophy**: A plugin-based backend system that allows hardware
vendors to easily implement their own operator optimizations while
preserving complete compatibility with the original TransformerEngine
API.

## Key Features

### Full API Compatibility
- Drop-in replacement for `transformer_engine_torch`
- Switch backends via environment variables
- Zero changes required to existing user code

### Multi-Backend Support

| Backend | Description | Implementation |
|---------|-------------|----------------|
| **FlagOS (default)** | Triton-based cross-platform implementation |
`backends/flagos/` |
| **CUDA (vendor)** | Wraps original TransformerEngine C++ extensions |
`backends/vendor/cuda/` |
| **Reference** | Pure PyTorch fallback implementation |
`backends/reference/` |

### Three-Tier Backend Selection

```
┌─────────────────────────────────────────────────────────┐
│  1. TE_FL_PER_OP (Per-operator override)    [Highest]   │
│     Example: TE_FL_PER_OP="rmsnorm_fwd=vendor:cuda"     │
├─────────────────────────────────────────────────────────┤
│  2. TE_FL_PREFER (Global preference)                    │
│     Values: flagos / vendor / reference                │
├─────────────────────────────────────────────────────────┤
│  3. Backend Priority (Intrinsic)            [Lowest]    │
│     Each implementation has a priority value            │
└─────────────────────────────────────────────────────────┘
```

## Architecture

### Directory Structure

```
transformer_engine/plugin/core/
├── __init__.py              # Public API exports
├── types.py                 # Core types: BackendImplKind, OpImpl
├── registry.py              # OpRegistry: stores all implementations
├── manager.py               # OpManager: selects and calls implementations
├── policy.py                # SelectionPolicy: backend selection rules
├── discovery.py             # Plugin auto-discovery (entry_points, env)
├── builtin_ops.py           # Registers all built-in backends
├── ops.py                   # TEFLModule: transformer_engine_torch compatible API
├── logger_manager.py        # Logging utilities
├── _module_setup.py         # Module aliasing setup
├── _build_config.py         # Build-time configuration
│
└── backends/
    ├── flagos/              # FlagOS backend (Triton-based)
    │   ├── flagos.py        # FlagOSBackend class
    │   ├── register_ops.py  # Operator registration
    │   └── impl/            # Operator implementations
    │       ├── rmsnorm.py
    │       ├── gemm.py
    │       └── ...
    │
    ├── vendor/              # Vendor backends
    │   └── cuda/            # NVIDIA CUDA backend
    │       ├── cuda.py      # CUDABackend class
    │       └── register_ops.py
    │
    └── reference/           # Reference backend (PyTorch)
        ├── reference.py     # ReferenceBackend class
        ├── register_ops.py
        └── impl/            # Pure PyTorch implementations
```

### Core Components

| File | Description |
|------|-------------|
| `types.py` | Defines `BackendImplKind` (DEFAULT/VENDOR/REFERENCE) and
`OpImpl` dataclass |
| `registry.py` | `OpRegistry` - Central storage for all operator
implementations |
| `manager.py` | `OpManager` - Handles implementation selection,
fallback, and execution |
| `policy.py` | `SelectionPolicy` - Configurable rules for backend
selection |
| `discovery.py` | Auto-discovers plugins via `entry_points` or
`TE_FL_PLUGIN_MODULES` |
| `ops.py` | `TEFLModule` - Provides `transformer_engine_torch`
compatible interface |

## Installation

### Build with CUDA support
```bash
pip install --no-build-isolation -e .
```

### Build without CUDA (FlagOS only)
```bash
TE_FL_SKIP_CUDA=1 pip install --no-build-isolation -e .
```

## Environment Variables

### Backend Selection

| Variable | Description | Values | Default |
|----------|-------------|--------|---------|
| `TE_FL_PREFER` | Preferred backend type | `flagos` / `vendor` /
`reference` | `flagos` |
| `TE_FL_PREFER_VENDOR` | Prefer vendor (legacy) | `1` / `0` | `0` |
| `TE_FL_STRICT` | Strict mode (no fallback) | `1` / `0` | `0` |

### Vendor Filtering

| Variable | Description | Example |
|----------|-------------|---------|
| `TE_FL_ALLOW_VENDORS` | Allowed vendors (whitelist) | `nvidia,amd` |
| `TE_FL_DENY_VENDORS` | Denied vendors (blacklist) | `vendor_a` |

### Per-Operator Configuration

| Variable | Description | Example |
|----------|-------------|---------|
| `TE_FL_PER_OP` | Per-operator backend ordering |
`rmsnorm_fwd=vendor:cuda\|default` |

### Plugin Discovery

| Variable | Description | Example |
|----------|-------------|---------|
| `TE_FL_PLUGIN_MODULES` | Plugin modules to load |
`my_plugin,another_plugin` |

### Build Configuration

| Variable | Description | Values | Default |
|----------|-------------|--------|---------|
| `TE_FL_SKIP_CUDA` | Skip CUDA backend | `1` / `0` | `0` |
| `CUDA_HOME` | CUDA installation path | `/usr/local/cuda` |
Auto-detected |

### Logging

| Variable | Description | Values | Default |
|----------|-------------|--------|---------|
| `TEFL_LOG_LEVEL` | Log level | `DEBUG` / `INFO` / `WARNING` / `ERROR`
| `INFO` |

## Usage Examples

### Basic Usage (No Code Changes Required)
```python
# Existing code works as-is
import transformer_engine.pytorch as te
# or
import transformer_engine_torch as te
```

### Register Custom Backend (In-tree)

```python
from transformer_engine.plugin.core import (
    OpRegistry, OpManager, OpImpl, BackendImplKind
)

# 1. Define implementation
def my_rmsnorm(input, weight, eps=1e-5, **kwargs):
    variance = input.pow(2).mean(-1, keepdim=True)
    return input * torch.rsqrt(variance + eps) * weight, torch.rsqrt(variance + eps)

# 2. Register
registry = OpRegistry()
registry.register_impl(OpImpl(
    op_name="rmsnorm_fwd",
    impl_id="vendor.mybackend",
    kind=BackendImplKind.VENDOR,
    vendor="mybackend",
    fn=my_rmsnorm,
    priority=200,
))

# 3. Call
manager = OpManager(registry)
output, rsigma = manager.call("rmsnorm_fwd", input, weight)
```

### Register Custom Backend (Out-of-tree Plugin)

Create a plugin package with `register(registry)` function:

```python
# my_vendor_plugin/__init__.py
from transformer_engine.plugin.core import OpImpl, BackendImplKind

def my_rmsnorm(input, weight, eps=1e-5, **kwargs):
    # Your implementation
    ...

def register(registry):
    """Called automatically by TE-FL"""
    registry.register_impl(OpImpl(
        op_name="rmsnorm_fwd",
        impl_id="vendor.myvendor",
        kind=BackendImplKind.VENDOR,
        vendor="myvendor",
        fn=my_rmsnorm,
        priority=200,
    ))
```

Load via environment variable:
```bash
export TE_FL_PLUGIN_MODULES=my_vendor_plugin
python your_script.py
```

## Runtime Logs

When running, you'll see logs indicating which backend is used:

```
[TE-FL manager.py:133 INFO] Registered impl_ids: ['default.flagos', 'reference.torch', 'vendor.cuda']
[TE-FL manager.py:390 INFO] Op 'rmsnorm_fwd' using 'default.flagos' (kind=default, vendor=None)
[TE-FL manager.py:395 INFO] Op 'rmsnorm_fwd' switched from 'default.flagos' to 'vendor.cuda' (kind=vendor, vendor=CUDA)
```

## Examples

See `transformer_engine/plugins/examples/` for complete working
examples:
- `example_intree.py` - In-tree backend registration
- `example_outtree.py` - Out-of-tree plugin registration

Fixes # (issue)

## Type of change

- [ ] Documentation change (change only to the documentation, either a
fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Infra/Build change
- [ ] Code refactoring

## Changes

Please list the changes introduced in this PR:

- Change A
- Change B

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes

---------

Co-authored-by: panpy <panpy@sugon.com>
---
 .gitignore                                    |    2 +
 build_tools/pytorch.py                        |    4 +-
 build_tools/utils.py                          |    8 +
 setup.py                                      |   84 +-
 transformer_engine/__init__.py                |    1 +
 transformer_engine/common/__init__.py         |  100 +-
 transformer_engine/plugin/__init__.py         |   23 +
 .../plugin/benchmarks/__init__.py             |    5 +
 .../benchmarks/benchmark_all_backends.py      |  392 +++++
 transformer_engine/plugin/core/__init__.py    |   61 +
 .../plugin/core/_build_config.py.template     |   22 +
 .../plugin/core/_module_setup.py              |   94 ++
 .../plugin/core/backends/__init__.py          |    3 +
 .../plugin/core/backends/flagos/__init__.py   |    7 +
 .../dot_product_attention/backends.py         |  166 +-
 .../plugin/core/backends/flagos/flagos.py     |  156 ++
 .../core/backends/flagos/impl}/__init__.py    |    3 +-
 .../core/backends/flagos/impl/fused_adam.py   |   77 +
 .../plugin/core/backends/flagos/impl/gemm.py  |  113 ++
 .../core/backends/flagos/impl/multi_tensor.py |   26 +
 .../core/backends/flagos/impl/rmsnorm.py      |   63 +
 .../core/backends/flagos/register_ops.py      |   54 +
 .../core/backends/reference/__init__.py       |    7 +
 .../backends/reference/flash_attention.py     |  353 +++++
 .../core/backends/reference/impl/__init__.py  |   90 ++
 .../backends/reference/impl/activation.py     |  286 ++++
 .../core/backends/reference/impl/dropout.py   |   55 +
 .../core/backends/reference/impl/gemm.py      |  128 ++
 .../backends/reference/impl/normalization.py  |   84 ++
 .../core/backends/reference/impl/optimizer.py |  203 +++
 .../core/backends/reference/impl/rmsnorm.py   |   63 +
 .../core/backends/reference/impl/softmax.py   |  134 ++
 .../core/backends/reference/reference.py      |  508 +++++++
 .../core/backends/reference/register_ops.py   |  197 +++
 .../plugin/core/backends/vendor/__init__.py   |   51 +
 .../core/backends/vendor/cuda/__init__.py     |    7 +
 .../plugin/core/backends/vendor/cuda/cuda.py  | 1104 ++++++++++++++
 .../backends/vendor/cuda/flash_attention.py   |  126 ++
 .../core/backends/vendor/cuda/register_ops.py |  202 +++
 transformer_engine/plugin/core/builtin_ops.py |   49 +
 transformer_engine/plugin/core/discovery.py   |  190 +++
 .../plugin/core/logger_manager.py             |  119 ++
 transformer_engine/plugin/core/manager.py     |  478 ++++++
 transformer_engine/plugin/core/ops.py         | 1338 +++++++++++++++++
 transformer_engine/plugin/core/policy.py      |  396 +++++
 transformer_engine/plugin/core/registry.py    |  118 ++
 transformer_engine/plugin/core/types.py       |   65 +
 transformer_engine/plugin/examples/README.md  |  181 +++
 .../plugin/examples/example_intree.py         |   75 +
 .../plugin/examples/example_outtree.py        |  121 ++
 transformer_engine/plugin/test_utils.py       |  214 +++
 transformer_engine/plugin/tests/__init__.py   |    5 +
 .../plugin/tests/run_all_tests.py             |   56 +
 .../plugin/tests/test_activations.py          |  557 +++++++
 .../plugin/tests/test_flash_attention.py      |  328 ++++
 .../plugin/tests/test_normalization.py        |  238 +++
 .../plugin/tests/test_operations.py           |  255 ++++
 .../plugin/tests/test_optimizer.py            |  313 ++++
 .../plugin/tests/test_softmax.py              |  354 +++++
 transformer_engine/plugins/backend.py         |  190 ---
 transformer_engine/plugins/backend_fl.py      |   40 -
 transformer_engine/plugins/backend_native.py  |   43 -
 .../plugins/cpp_extensions/fused_adam.py      |   80 -
 .../plugins/cpp_extensions/gemm.py            |  113 --
 .../cpp_extensions/multi_tensor_apply.py      |   23 -
 .../plugins/cpp_extensions/rmsnorm.py         |   55 -
 transformer_engine/plugins/import_utils.py    |  113 --
 transformer_engine/plugins/logger.py          |   49 -
 transformer_engine/plugins/module/_common.py  |   38 -
 transformer_engine/plugins/register.py        |  144 --
 .../dot_product_attention.py                  |   10 +-
 .../pytorch/module/layernorm_linear.py        |   11 +-
 transformer_engine/pytorch/module/linear.py   |    7 +-
 .../pytorch/ops/basic/rmsnorm.py              |    5 +-
 .../pytorch/optimizers/__init__.py            |    3 -
 .../pytorch/optimizers/fused_adam.py          |    9 +-
 transformer_engine/pytorch/setup.py           |    1 -
 77 files changed, 10395 insertions(+), 1051 deletions(-)
 create mode 100644 transformer_engine/plugin/__init__.py
 create mode 100644 transformer_engine/plugin/benchmarks/__init__.py
 create mode 100644 transformer_engine/plugin/benchmarks/benchmark_all_backends.py
 create mode 100644 transformer_engine/plugin/core/__init__.py
 create mode 100644 transformer_engine/plugin/core/_build_config.py.template
 create mode 100644 transformer_engine/plugin/core/_module_setup.py
 create mode 100644 transformer_engine/plugin/core/backends/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/flagos/__init__.py
 rename transformer_engine/{plugins => plugin/core/backends/flagos}/attention/dot_product_attention/backends.py (71%)
 create mode 100644 transformer_engine/plugin/core/backends/flagos/flagos.py
 rename transformer_engine/{plugins/cpp_extensions => plugin/core/backends/flagos/impl}/__init__.py (68%)
 create mode 100644 transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
 create mode 100644 transformer_engine/plugin/core/backends/flagos/impl/gemm.py
 create mode 100644 transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
 create mode 100644 transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
 create mode 100644 transformer_engine/plugin/core/backends/flagos/register_ops.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/flash_attention.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/activation.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/dropout.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/gemm.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/normalization.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/optimizer.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/impl/softmax.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/reference.py
 create mode 100644 transformer_engine/plugin/core/backends/reference/register_ops.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/cuda/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
 create mode 100644 transformer_engine/plugin/core/builtin_ops.py
 create mode 100644 transformer_engine/plugin/core/discovery.py
 create mode 100644 transformer_engine/plugin/core/logger_manager.py
 create mode 100644 transformer_engine/plugin/core/manager.py
 create mode 100644 transformer_engine/plugin/core/ops.py
 create mode 100644 transformer_engine/plugin/core/policy.py
 create mode 100644 transformer_engine/plugin/core/registry.py
 create mode 100644 transformer_engine/plugin/core/types.py
 create mode 100644 transformer_engine/plugin/examples/README.md
 create mode 100644 transformer_engine/plugin/examples/example_intree.py
 create mode 100644 transformer_engine/plugin/examples/example_outtree.py
 create mode 100644 transformer_engine/plugin/test_utils.py
 create mode 100644 transformer_engine/plugin/tests/__init__.py
 create mode 100644 transformer_engine/plugin/tests/run_all_tests.py
 create mode 100644 transformer_engine/plugin/tests/test_activations.py
 create mode 100644 transformer_engine/plugin/tests/test_flash_attention.py
 create mode 100644 transformer_engine/plugin/tests/test_normalization.py
 create mode 100644 transformer_engine/plugin/tests/test_operations.py
 create mode 100644 transformer_engine/plugin/tests/test_optimizer.py
 create mode 100644 transformer_engine/plugin/tests/test_softmax.py
 delete mode 100644 transformer_engine/plugins/backend.py
 delete mode 100644 transformer_engine/plugins/backend_fl.py
 delete mode 100644 transformer_engine/plugins/backend_native.py
 delete mode 100644 transformer_engine/plugins/cpp_extensions/fused_adam.py
 delete mode 100644 transformer_engine/plugins/cpp_extensions/gemm.py
 delete mode 100644 transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py
 delete mode 100644 transformer_engine/plugins/cpp_extensions/rmsnorm.py
 delete mode 100644 transformer_engine/plugins/import_utils.py
 delete mode 100644 transformer_engine/plugins/logger.py
 delete mode 100644 transformer_engine/plugins/module/_common.py
 delete mode 100644 transformer_engine/plugins/register.py

diff --git a/.gitignore b/.gitignore
index 5da08d3638..1a9a04d72d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,5 @@ compile_commands.json
 .nfs
 tensor_dumps/
 artifacts/
+# Auto-generated build configuration (specific to each environment)
+transformer_engine/plugin/core/_build_config.py
\ No newline at end of file
diff --git a/build_tools/pytorch.py b/build_tools/pytorch.py
index 3d44d8740c..e0e65c7cb9 100644
--- a/build_tools/pytorch.py
+++ b/build_tools/pytorch.py
@@ -85,8 +85,10 @@ def setup_pytorch_extension(
     include_dirs = [str(path) for path in include_dirs]
     from torch.utils.cpp_extension import CppExtension
 
+    # Use transformer_engine_torch_nv as the native NVIDIA module name
+    # This allows the plugin system to use transformer_engine_torch as the unified interface
     return CppExtension(
-        name="transformer_engine_torch",
+        name="transformer_engine_torch_nv",
         sources=[str(src) for src in sources],
         include_dirs=[str(inc) for inc in include_dirs],
         extra_compile_args={"cxx": cxx_flags},
diff --git a/build_tools/utils.py b/build_tools/utils.py
index 395b41261b..f453f029e3 100644
--- a/build_tools/utils.py
+++ b/build_tools/utils.py
@@ -251,8 +251,16 @@ def get_cuda_include_dirs() -> Tuple[str, str]:
     ]
 
 
+@functools.lru_cache(maxsize=None)
+def skip_cuda_build() -> bool:
+    """Check if CUDA build should be skipped (for AMD/ROCm or pure FL backend)"""
+    return bool(int(os.getenv("TE_FL_SKIP_CUDA", "0")))
+
+
 @functools.lru_cache(maxsize=None)
 def cuda_archs() -> str:
+    if skip_cuda_build():
+        return ""  # Return empty string when skipping CUDA build
     archs = os.getenv("NVTE_CUDA_ARCHS")
     if archs is None:
         version = cuda_version()
diff --git a/setup.py b/setup.py
index a820265c30..0da2e45abf 100644
--- a/setup.py
+++ b/setup.py
@@ -28,6 +28,9 @@
 
 
 from setuptools.command.build_ext import build_ext as BuildExtension
+from setuptools.command.install import install as InstallCommand
+from datetime import datetime
+import platform
 
 os.environ["NVTE_PROJECT_BUILDING"] = "1"
 
@@ -41,6 +44,63 @@
 archs = cuda_archs()
 
 
+def generate_build_config(skip_cuda_build):
+    """Generate build-time configuration file."""
+    config_template_path = (
+        current_file_path / "transformer_engine" / "plugin" /
+        "core" / "_build_config.py.template"
+    )
+    config_output_path = (
+        current_file_path / "transformer_engine" / "plugin" /
+        "core" / "_build_config.py"
+    )
+
+    if config_template_path.exists():
+        with open(config_template_path, 'r') as f:
+            template = f.read()
+
+        config_content = template.format(
+            skip_cuda=skip_cuda_build,
+            build_time=datetime.now().isoformat(),
+            platform=platform.platform(),
+        )
+
+        with open(config_output_path, 'w') as f:
+            f.write(config_content)
+
+        print(f"Generated build config: {config_output_path}")
+        print(f"  SKIP_CUDA_BUILD = {skip_cuda_build}")
+    else:
+        # Fallback: create minimal config if template doesn't exist
+        config_content = f"""# Auto-generated build configuration
+SKIP_CUDA_BUILD = {skip_cuda_build}
+BUILD_TIME = "{datetime.now().isoformat()}"
+BUILD_PLATFORM = "{platform.platform()}"
+"""
+        with open(config_output_path, 'w') as f:
+            f.write(config_content)
+        print(f"Generated minimal build config: {config_output_path}")
+
+
+class CustomInstall(InstallCommand):
+    """Custom install command to generate build config."""
+
+    user_options = InstallCommand.user_options + [
+        ('skip-cuda-build', None, 'Skip CUDA build'),
+    ]
+
+    def initialize_options(self):
+        super().initialize_options()
+        self.skip_cuda_build = bool(int(os.getenv("TE_FL_SKIP_CUDA", "0")))
+
+    def run(self):
+        # Run the standard install
+        super().run()
+
+        # Generate build config after installation
+        generate_build_config(self.skip_cuda_build)
+
+
 class TimedBdist(bdist_wheel):
     """Helper class to measure build time"""
 
@@ -132,6 +192,14 @@ def setup_requirements() -> Tuple[List[str], List[str]]:
     with open("README.rst", encoding="utf-8") as f:
         long_description = f.read()
 
+    # Check if we should skip CUDA build (for AMD/ROCm or pure FL backend usage)
+    skip_cuda_build = bool(int(os.getenv("TE_FL_SKIP_CUDA", "0")))
+    if skip_cuda_build:
+        print("=" * 60)
+        print("TE_FL_SKIP_CUDA=1: Skipping CUDA/native backend compilation")
+        print("Only FL (Flag-Gems/Triton) backend will be available")
+        print("=" * 60)
+
     # Settings for building top level empty package for dependency management.
     if bool(int(os.getenv("NVTE_BUILD_METAPACKAGE", "0"))):
         assert bool(
@@ -148,6 +216,13 @@ def setup_requirements() -> Tuple[List[str], List[str]]:
             "pytorch": [f"transformer_engine_torch=={__version__}"],
             "jax": [f"transformer_engine_jax=={__version__}"],
         }
+    elif skip_cuda_build:
+        # Skip CUDA build - only install Python packages for FL backend
+        install_requires, test_requires = setup_requirements()
+        ext_modules = []  # No CUDA extensions
+        package_data = {"": ["VERSION.txt"]}
+        include_package_data = True
+        extras_require = {"test": test_requires}
     else:
         install_requires, test_requires = setup_requirements()
         ext_modules = [setup_common_extension()]
@@ -177,6 +252,9 @@ def setup_requirements() -> Tuple[List[str], List[str]]:
                     )
                 )
 
+    # Generate build config before setup
+    generate_build_config(skip_cuda_build)
+
     # Configure package
     setuptools.setup(
         name="transformer_engine",
@@ -193,7 +271,11 @@ def setup_requirements() -> Tuple[List[str], List[str]]:
         long_description=long_description,
         long_description_content_type="text/x-rst",
         ext_modules=ext_modules,
-        cmdclass={"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist},
+        cmdclass={
+            "build_ext": CMakeBuildExtension,
+            "bdist_wheel": TimedBdist,
+            "install": CustomInstall,
+        },
         python_requires=f">={min_python_version_str()}",
         classifiers=["Programming Language :: Python :: 3"],
         install_requires=install_requires,
diff --git a/transformer_engine/__init__.py b/transformer_engine/__init__.py
index e51f03e3d8..c9cbe3b257 100644
--- a/transformer_engine/__init__.py
+++ b/transformer_engine/__init__.py
@@ -8,6 +8,7 @@
 
 import os
 from importlib import metadata
+
 import transformer_engine.common
 
 try:
diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index 5e1318cf86..649674a281 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -18,6 +18,31 @@
 from typing import Optional, Tuple
 
 
+def skip_cuda_build() -> bool:
+    """Check if CUDA build was skipped (FL-only mode).
+
+    First checks environment variable (for runtime override),
+    then falls back to build-time configuration.
+    """
+    # Environment variable takes precedence (allows runtime override)
+    if os.environ.get("TE_FL_SKIP_CUDA"):
+        return bool(int(os.environ.get("TE_FL_SKIP_CUDA", "0")))
+
+    # Fall back to build-time configuration
+    try:
+        from transformer_engine.plugin.core._build_config import SKIP_CUDA_BUILD
+        return SKIP_CUDA_BUILD
+    except ImportError:
+        # If build config doesn't exist, default to False
+        return False
+
+# Load plugin system - this handles module registration and backend initialization
+# The _module_setup inside core will:
+# 1. Register modules under both full and short names for relative imports
+# 2. Load all available backends (flagos, reference, vendor/cuda, etc.)
+# 3. Register transformer_engine_torch module from the selected backend
+import transformer_engine.plugin.core  # noqa: F401
+
 @functools.lru_cache(maxsize=None)
 def _is_package_installed(package) -> bool:
     """Check if the given package is installed via pip."""
@@ -146,46 +171,36 @@ def get_te_core_package_info() -> Tuple[bool, str, str]:
 @functools.lru_cache(maxsize=None)
 def load_framework_extension(framework: str) -> None:
     """
-    Load shared library with Transformer Engine framework bindings
-    and check verify correctness if installed via PyPI.
+    Load shared library with Transformer Engine framework bindings.
+
+    For PyTorch: The native module is now named transformer_engine_torch_nv,
+    and transformer_engine_torch is provided by the plugin system.
+    This function is kept for backward compatibility but does nothing for torch.
     """
 
+    # Skip loading native extensions if CUDA build was skipped (FL-only mode)
+    if skip_cuda_build():
+        return
+
     # Supported frameworks.
     assert framework in ("jax", "torch"), f"Unsupported framework {framework}"
 
-    # Name of the framework extension library.
+    # For torch: plugin system already handles transformer_engine_torch
+    # The native module is transformer_engine_torch_nv (imported by NVIDIA backend)
+    if framework == "torch":
+        return  # Nothing to do, plugin system handles this
+
+    # For jax: load the native module as before
     module_name = f"transformer_engine_{framework}"
 
-    # Name of the pip extra dependency for framework extensions from PyPI.
-    extra_dep_name = module_name
-    if framework == "torch":
-        extra_dep_name = "pytorch"
+    # Skip if already loaded
+    if module_name in sys.modules:
+        return
 
-    # Find the TE packages. The core and framework packages can only be installed via PyPI.
-    # For the `transformer-engine` package, we need to check explicity.
-    te_core_installed, te_core_package_name, te_core_version = get_te_core_package_info()
-    te_framework_installed = _is_package_installed(module_name)
     te_installed = _is_package_installed("transformer_engine")
-    te_installed_via_pypi = _is_package_installed_from_wheel("transformer_engine")
-
     assert te_installed, "Could not find `transformer_engine`."
 
-    # If the framework extension pip package is installed, it means that TE is installed via
-    # PyPI. For this case we need to make sure that the metapackage, the core lib, and framework
-    # extension are all installed via PyPI and have matching versions.
-    if te_framework_installed:
-        assert te_installed_via_pypi, "Could not find `transformer-engine` PyPI package."
-        assert te_core_installed, "Could not find TE core package `transformer-engine-cu*`."
-
-        assert version(module_name) == version("transformer-engine") == te_core_version, (
-            "Transformer Engine package version mismatch. Found"
-            f" {module_name} v{version(module_name)}, transformer-engine"
-            f" v{version('transformer-engine')}, and {te_core_package_name}"
-            f" v{te_core_version}. Install transformer-engine using "
-            f"'pip3 install --no-build-isolation transformer-engine[{extra_dep_name}]==VERSION'"
-        )
-
-    # After all checks are completed, load the shared object file.
+    # Load the shared object file for jax
     spec = importlib.util.spec_from_file_location(module_name, _get_shared_object_file(framework))
     solib = importlib.util.module_from_spec(spec)
     sys.modules[module_name] = solib
@@ -195,6 +210,10 @@ def load_framework_extension(framework: str) -> None:
 def sanity_checks_for_pypi_installation() -> None:
     """Ensure that package is installed correctly if using PyPI."""
 
+    # Skip sanity checks if CUDA build was skipped (FL-only mode)
+    if skip_cuda_build():
+        return
+
     te_core_installed, te_core_package_name, te_core_version = get_te_core_package_info()
     te_installed = _is_package_installed("transformer_engine")
     te_installed_via_pypi = _is_package_installed_from_wheel("transformer_engine")
@@ -390,13 +409,16 @@ def _load_core_library():
 
 if "NVTE_PROJECT_BUILDING" not in os.environ or bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
     sanity_checks_for_pypi_installation()
-    _CUDNN_LIB_CTYPES = _load_cudnn()
-    _NVRTC_LIB_CTYPES = _load_nvrtc()
-    _CURAND_LIB_CTYPES = _load_curand()
-    _CUBLAS_LIB_CTYPES = _load_nvidia_cuda_library("cublas")
-    _CUDART_LIB_CTYPES = _load_nvidia_cuda_library("cuda_runtime")
-    _TE_LIB_CTYPES = _load_core_library()
-
-    # Needed to find the correct headers for NVRTC kernels.
-    if not os.getenv("NVTE_CUDA_INCLUDE_DIR") and _nvidia_cudart_include_dir():
-        os.environ["NVTE_CUDA_INCLUDE_DIR"] = _nvidia_cudart_include_dir()
+
+    # Skip loading CUDA libraries if CUDA build was skipped (FL-only mode)
+    if not skip_cuda_build():
+        _CUDNN_LIB_CTYPES = _load_cudnn()
+        _NVRTC_LIB_CTYPES = _load_nvrtc()
+        _CURAND_LIB_CTYPES = _load_curand()
+        _CUBLAS_LIB_CTYPES = _load_nvidia_cuda_library("cublas")
+        _CUDART_LIB_CTYPES = _load_nvidia_cuda_library("cuda_runtime")
+        _TE_LIB_CTYPES = _load_core_library()
+
+        # Needed to find the correct headers for NVRTC kernels.
+        if not os.getenv("NVTE_CUDA_INCLUDE_DIR") and _nvidia_cudart_include_dir():
+            os.environ["NVTE_CUDA_INCLUDE_DIR"] = _nvidia_cudart_include_dir()
diff --git a/transformer_engine/plugin/__init__.py b/transformer_engine/plugin/__init__.py
new file mode 100644
index 0000000000..478f9256b2
--- /dev/null
+++ b/transformer_engine/plugin/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .core import (
+    TEFLBackendBase,
+    TEFLModule,
+    get_tefl_module as _get_tefl_module,
+    get_registry,
+)
+
+def __getattr__(name):
+    if name == "tefl":
+        return _get_tefl_module()
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+__all__ = [
+    "TEFLBackendBase",
+    "TEFLModule",
+    "get_tefl_module",
+    "get_registry",
+    "tefl",
+]
diff --git a/transformer_engine/plugin/benchmarks/__init__.py b/transformer_engine/plugin/benchmarks/__init__.py
new file mode 100644
index 0000000000..caaec47482
--- /dev/null
+++ b/transformer_engine/plugin/benchmarks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+__all__ = []
diff --git a/transformer_engine/plugin/benchmarks/benchmark_all_backends.py b/transformer_engine/plugin/benchmarks/benchmark_all_backends.py
new file mode 100644
index 0000000000..fe03096551
--- /dev/null
+++ b/transformer_engine/plugin/benchmarks/benchmark_all_backends.py
@@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+import os
+import sys
+import torch
+import time
+import numpy as np
+from datetime import datetime
+from typing import Dict, List
+
+
+from transformer_engine.plugin.test_utils import get_available_backends, get_backend
+
+
+class BenchmarkResult:
+    def __init__(self, backend_name: str, operation_name: str, shape: tuple,
+                 mean_time: float, std_time: float, min_time: float, max_time: float,
+                 gflops: float = None, bandwidth: float = None):
+        self.backend_name = backend_name
+        self.operation_name = operation_name
+        self.shape = shape
+        self.mean_time = mean_time
+        self.std_time = std_time
+        self.min_time = min_time
+        self.max_time = max_time
+        self.gflops = gflops
+        self.bandwidth = bandwidth
+
+    def __str__(self):
+        gflops_str = f"{self.gflops:.2f} GFLOPS" if self.gflops else "N/A"
+        bandwidth_str = f"{self.bandwidth:.2f} GB/s" if self.bandwidth else "N/A"
+        return (f"{self.backend_name:12s} {self.mean_time:8.4f}±{self.std_time:6.4f} ms  "
+                f"[{self.min_time:7.4f}, {self.max_time:7.4f}]  "
+                f"{gflops_str:15s} {bandwidth_str:12s}")
+
+
+def time_operation(func, warmup_iters=10, benchmark_iters=100):
+    for _ in range(warmup_iters):
+        func()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+    times = []
+    for _ in range(benchmark_iters):
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        start = time.perf_counter()
+        func()
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        end = time.perf_counter()
+        times.append((end - start) * 1000)
+
+    return {
+        'mean': np.mean(times),
+        'std': np.std(times),
+        'min': np.min(times),
+        'max': np.max(times),
+    }
+
+
+def compute_gflops(operation: str, shape: tuple, time_ms: float) -> float:
+    if operation in ['gelu', 'relu', 'silu']:
+        flops = np.prod(shape) * 5
+    elif operation == 'layernorm':
+        total_elements = np.prod(shape)
+        hidden_size = shape[-1]
+        flops = total_elements * (3 + 2 * hidden_size)
+    elif operation == 'rmsnorm':
+        total_elements = np.prod(shape)
+        hidden_size = shape[-1]
+        flops = total_elements * (2 + hidden_size)
+    elif operation == 'gemm':
+        M, N, K = shape
+        flops = 2 * M * N * K
+    else:
+        return None
+
+    return (flops / 1e9) / (time_ms / 1000)
+
+
+def compute_bandwidth(operation: str, shape: tuple, time_ms: float) -> float:
+    bytes_per_element = 4
+
+    if operation in ['gelu', 'relu', 'silu']:
+        total_bytes = np.prod(shape) * 2 * bytes_per_element
+    elif operation in ['layernorm', 'rmsnorm']:
+        total_bytes = np.prod(shape) * 5 * bytes_per_element
+    elif operation == 'gemm':
+        M, N, K = shape
+        total_bytes = (M*K + K*N + M*N) * bytes_per_element
+    else:
+        return None
+
+    return (total_bytes / 1e9) / (time_ms / 1000)
+
+
+def benchmark_activations(backends: List[str], shapes: List[tuple], device: str) -> List[BenchmarkResult]:
+    print("\n" + "="*80)
+    print("Activation Function Performance Test")
+    print("="*80)
+
+    results = []
+    operations = [
+        ('gelu', 'GELU'),
+        ('relu', 'ReLU'),
+        ('silu', 'SiLU'),
+    ]
+
+    for shape in shapes:
+        print(f"\nShape: {shape}")
+        x = torch.randn(shape, dtype=torch.float32, device=device)
+
+        for op_method, op_name in operations:
+            print(f"\n  {op_name}:")
+            print(f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+            print(f"    {'-'*85}")
+
+            for backend_name in backends:
+                backend = get_backend(backend_name)
+
+                try:
+                    func = lambda: getattr(backend, op_method)(x, None)
+                    timing = time_operation(func)
+
+                    gflops = compute_gflops(op_method, shape, timing['mean'])
+                    bandwidth = compute_bandwidth(op_method, shape, timing['mean'])
+
+                    result = BenchmarkResult(
+                        backend_name, op_method, shape,
+                        timing['mean'], timing['std'], timing['min'], timing['max'],
+                        gflops, bandwidth
+                    )
+                    results.append(result)
+                    print(f"    {result}")
+
+                except Exception as e:
+                    print(f"    {backend_name:12s} SKIPPED ({type(e).__name__}: {str(e)[:40]})")
+
+    return results
+
+
+def benchmark_normalization(backends: List[str], shapes: List[tuple], device: str) -> List[BenchmarkResult]:
+    print("\n" + "="*80)
+    print("Normalization Performance Test")
+    print("="*80)
+
+    results = []
+    eps = 1e-5
+
+    for shape in shapes:
+        print(f"\nShape: {shape}")
+        hidden_size = shape[-1]
+        x = torch.randn(shape, dtype=torch.float32, device=device)
+        weight = torch.ones(hidden_size, dtype=torch.float32, device=device)
+        bias = torch.zeros(hidden_size, dtype=torch.float32, device=device)
+
+        print(f"\n  LayerNorm forward:")
+        print(f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+        print(f"    {'-'*85}")
+
+        for backend_name in backends:
+            backend = get_backend(backend_name)
+
+            try:
+                func = lambda: backend.layernorm_fwd(x, weight, bias, eps, None, None, torch.float32, 0, False)
+                timing = time_operation(func)
+
+                gflops = compute_gflops('layernorm', shape, timing['mean'])
+                bandwidth = compute_bandwidth('layernorm', shape, timing['mean'])
+
+                result = BenchmarkResult(
+                    backend_name, 'layernorm_fwd', shape,
+                    timing['mean'], timing['std'], timing['min'], timing['max'],
+                    gflops, bandwidth
+                )
+                results.append(result)
+                print(f"    {result}")
+
+            except Exception as e:
+                print(f"    {backend_name:12s} SKIPPED ({type(e).__name__})")
+
+        print(f"\n  RMSNorm forward:")
+        print(f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+        print(f"    {'-'*85}")
+
+        for backend_name in backends:
+            backend = get_backend(backend_name)
+
+            try:
+                func = lambda: backend.rmsnorm_fwd(x, weight, eps, None, None, torch.float32, 0, False)
+                timing = time_operation(func)
+
+                gflops = compute_gflops('rmsnorm', shape, timing['mean'])
+                bandwidth = compute_bandwidth('rmsnorm', shape, timing['mean'])
+
+                result = BenchmarkResult(
+                    backend_name, 'rmsnorm_fwd', shape,
+                    timing['mean'], timing['std'], timing['min'], timing['max'],
+                    gflops, bandwidth
+                )
+                results.append(result)
+                print(f"    {result}")
+
+            except Exception as e:
+                print(f"    {backend_name:12s} SKIPPED ({type(e).__name__})")
+
+    return results
+
+
+def benchmark_gemm(backends: List[str], configs: List[tuple], device: str) -> List[BenchmarkResult]:
+    print("\n" + "="*80)
+    print("GEMM Performance Test")
+    print("="*80)
+
+    results = []
+
+    for M, N, K in configs:
+        print(f"\nConfig: M={M}, N={N}, K={K}")
+        print(f"  {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+        print(f"  {'-'*85}")
+
+        A = torch.randn(M, K, dtype=torch.float32, device=device)
+        B = torch.randn(K, N, dtype=torch.float32, device=device)
+        D = torch.empty(M, N, dtype=torch.float32, device=device)
+        workspace = torch.empty(1024, dtype=torch.uint8, device=device)
+
+        for backend_name in backends:
+            backend = get_backend(backend_name)
+
+            try:
+                func = lambda: backend.generic_gemm(
+                    A, False, B, False, D,
+                    None, torch.float32, None, None,
+                    False, None, False,
+                    workspace, 1024, False, False
+                )
+                timing = time_operation(func)
+
+                gflops = compute_gflops('gemm', (M, N, K), timing['mean'])
+                bandwidth = compute_bandwidth('gemm', (M, N, K), timing['mean'])
+
+                result = BenchmarkResult(
+                    backend_name, 'gemm', (M, N, K),
+                    timing['mean'], timing['std'], timing['min'], timing['max'],
+                    gflops, bandwidth
+                )
+                results.append(result)
+                print(f"  {result}")
+
+            except Exception as e:
+                print(f"  {backend_name:12s} SKIPPED ({type(e).__name__})")
+
+    return results
+
+
+def print_summary(all_results: List[BenchmarkResult]):
+    print("\n" + "="*80)
+    print("Performance Comparison Summary")
+    print("="*80)
+
+    from collections import defaultdict
+    by_operation = defaultdict(lambda: defaultdict(list))
+
+    for result in all_results:
+        by_operation[result.operation_name][result.backend_name].append(result)
+
+    print("\nAverage Performance (all shapes):")
+    print(f"{'Operation':<20s} {'Backend':<12s} {'Avg Time (ms)':<15s} {'Avg GFLOPS':<15s}")
+    print("-"*65)
+
+    for op_name, backends_data in sorted(by_operation.items()):
+        for backend_name, results in sorted(backends_data.items()):
+            avg_time = np.mean([r.mean_time for r in results])
+            gflops_list = [r.gflops for r in results if r.gflops is not None]
+            avg_gflops = np.mean(gflops_list) if gflops_list else None
+
+            gflops_str = f"{avg_gflops:.2f}" if avg_gflops else "N/A"
+            print(f"{op_name:<20s} {backend_name:<12s} {avg_time:<15.4f} {gflops_str:<15s}")
+
+    print("\n" + "="*80)
+    print("Fastest Backend (by operation)")
+    print("="*80)
+
+    for op_name, backends_data in sorted(by_operation.items()):
+        backend_avg_times = {}
+        for backend_name, results in backends_data.items():
+            backend_avg_times[backend_name] = np.mean([r.mean_time for r in results])
+
+        if backend_avg_times:
+            fastest = min(backend_avg_times.items(), key=lambda x: x[1])
+            print(f"{op_name:<20s} → {fastest[0]:<12s} ({fastest[1]:.4f} ms)")
+
+
+def save_results_csv(results: List[BenchmarkResult], filename: str):
+    import csv
+
+    with open(filename, 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow([
+            'Backend', 'Operation', 'Shape', 'Mean(ms)', 'Std(ms)',
+            'Min(ms)', 'Max(ms)', 'GFLOPS', 'GB/s'
+        ])
+
+        for result in results:
+            writer.writerow([
+                result.backend_name,
+                result.operation_name,
+                str(result.shape),
+                f"{result.mean_time:.4f}",
+                f"{result.std_time:.4f}",
+                f"{result.min_time:.4f}",
+                f"{result.max_time:.4f}",
+                f"{result.gflops:.2f}" if result.gflops else "N/A",
+                f"{result.bandwidth:.2f}" if result.bandwidth else "N/A",
+            ])
+
+    print(f"\nResults saved to: {filename}")
+
+
+def main():
+    print("\n" + "="*80)
+    print(" "*25 + "Multi-Backend Performance Comparison Test")
+    print("="*80)
+
+    device = "cpu"
+    if torch.cuda.is_available():
+        device = "cuda"
+        print(f"\nDevice: CUDA - {torch.cuda.get_device_name(0)}")
+        print(f"CUDA version: {torch.version.cuda}")
+    else:
+        print(f"\nDevice: CPU")
+    print(f"PyTorch version: {torch.__version__}")
+
+    backends = get_available_backends()
+    print(f"\nAvailable backends: {', '.join(backends)}")
+    print(f"Total: {len(backends)} backends")
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = f"benchmark_results_{timestamp}"
+    os.makedirs(output_dir, exist_ok=True)
+    print(f"Results will be saved to: {output_dir}/")
+
+    activation_shapes = [
+        (1024, 1024),
+        (2048, 2048),
+        (4096, 4096),
+    ]
+
+    normalization_shapes = [
+        (8, 512, 768),
+        (16, 512, 1024),
+        (32, 512, 2048),
+    ]
+
+    gemm_configs = [
+        (512, 512, 512),
+        (1024, 1024, 1024),
+        (2048, 2048, 2048),
+    ]
+
+    all_results = []
+
+    results = benchmark_activations(backends, activation_shapes, device)
+    all_results.extend(results)
+    save_results_csv(results, f"{output_dir}/activations.csv")
+
+    results = benchmark_normalization(backends, normalization_shapes, device)
+    all_results.extend(results)
+    save_results_csv(results, f"{output_dir}/normalization.csv")
+
+    results = benchmark_gemm(backends, gemm_configs, device)
+    all_results.extend(results)
+    save_results_csv(results, f"{output_dir}/gemm.csv")
+
+    print_summary(all_results)
+
+    save_results_csv(all_results, f"{output_dir}/all_results.csv")
+
+    print("\n" + "="*80)
+    print("Testing complete!")
+    print("="*80 + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugin/core/__init__.py b/transformer_engine/plugin/core/__init__.py
new file mode 100644
index 0000000000..a4d4b2a139
--- /dev/null
+++ b/transformer_engine/plugin/core/__init__.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .types import BackendImplKind, OpImpl, match_token
+
+from .ops import (
+    TEFLBackendBase,
+    TEFLModule,
+    get_tefl_module,
+    reset_tefl_module,
+    get_registry,
+    get_manager,
+    reset_registry,
+)
+
+from .logger_manager import Logger, LoggerManager
+from .policy import (
+    SelectionPolicy,
+    PolicyManager,
+    get_policy,
+    set_global_policy,
+    reset_global_policy,
+    policy_context,
+    policy_from_env,
+    get_policy_epoch,
+    bump_policy_epoch,
+    with_strict_mode,
+    with_preference,
+    with_allowed_vendors,
+    with_denied_vendors,
+    PREFER_DEFAULT,
+    PREFER_VENDOR,
+    PREFER_REFERENCE,
+    VALID_PREFER_VALUES,
+)
+
+from .manager import OpManager, get_default_manager, reset_default_manager
+from .registry import OpRegistry
+
+
+from .discovery import (
+    discover_plugin,
+    discover_from_entry_points,
+    discover_from_env_modules,
+    get_discovered_plugin,
+    clear_discovered_plugin,
+    PLUGIN_GROUP,
+    PLUGIN_MODULES_ENV,
+)
+
+# Setup module aliases BEFORE importing backends to support relative imports
+from ._module_setup import setup_module_aliases, register_as_transformer_engine_torch
+setup_module_aliases()
+
+# Import backends - this loads all available backends (flagos, reference, vendor/cuda, etc.)
+from . import backends
+
+# Register transformer_engine_torch AFTER backends are loaded
+# so that get_tefl_module() can find a registered backend
+register_as_transformer_engine_torch()
diff --git a/transformer_engine/plugin/core/_build_config.py.template b/transformer_engine/plugin/core/_build_config.py.template
new file mode 100644
index 0000000000..27b90f5080
--- /dev/null
+++ b/transformer_engine/plugin/core/_build_config.py.template
@@ -0,0 +1,22 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Build-time Configuration (Auto-generated)
+
+This file is automatically generated during package installation.
+DO NOT EDIT MANUALLY.
+
+Configuration settings are determined at build time and should not
+be changed at runtime.
+"""
+
+# Whether CUDA backend was skipped during build
+SKIP_CUDA_BUILD = {skip_cuda}
+
+# Build timestamp
+BUILD_TIME = "{build_time}"
+
+# Build platform
+BUILD_PLATFORM = "{platform}"
diff --git a/transformer_engine/plugin/core/_module_setup.py b/transformer_engine/plugin/core/_module_setup.py
new file mode 100644
index 0000000000..20ef221806
--- /dev/null
+++ b/transformer_engine/plugin/core/_module_setup.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Module setup for core plugin system.
+
+This module handles the registration of core modules in sys.modules
+with both full and short names to support relative imports in backends.
+"""
+
+import sys
+from pathlib import Path
+
+
+def setup_module_aliases():
+    """
+    Register core modules under both full and short names.
+
+    This allows backends to use relative imports like:
+        from ...ops import TEFLBackendBase
+        from ...types import OpImpl, BackendImplKind
+
+    And ensures they work correctly regardless of how the module is imported.
+    """
+    # Get the current package
+    current_package = sys.modules.get("transformer_engine.plugin.core")
+    if current_package is None:
+        return
+
+    # Register the main package under short name
+    sys.modules["core"] = current_package
+
+    # List of submodules to register
+    submodule_names = [
+        "ops",
+        "logger",
+        "types",
+        "logger_manager",
+        "policy",
+        "operator_registry",
+        "registry",
+        "discovery",
+    ]
+
+    # Register each submodule under short name
+    for name in submodule_names:
+        full_name = f"transformer_engine.plugin.core.{name}"
+        short_name = f"core.{name}"
+
+        if full_name in sys.modules and short_name not in sys.modules:
+            sys.modules[short_name] = sys.modules[full_name]
+
+    # Register backends package
+    backends_full = "transformer_engine.plugin.core.backends"
+    backends_short = "core.backends"
+    if backends_full in sys.modules and backends_short not in sys.modules:
+        sys.modules[backends_short] = sys.modules[backends_full]
+
+    # Register parent plugin package if needed
+    if "transformer_engine.plugin" not in sys.modules:
+        import types
+        plugin_dir = Path(__file__).parent.parent
+        plugin_pkg = types.ModuleType("transformer_engine.plugin")
+        plugin_pkg.__path__ = [str(plugin_dir)]
+        sys.modules["transformer_engine.plugin"] = plugin_pkg
+
+
+def register_as_transformer_engine_torch():
+    """
+    Register the tefl module as transformer_engine_torch.
+
+    This provides backward compatibility with code that expects
+    transformer_engine_torch to be available.
+    """
+    # Only register if not already present
+    if "transformer_engine_torch" in sys.modules:
+        return
+
+    try:
+        from .ops import get_tefl_module
+        tefl_module = get_tefl_module()
+        sys.modules["transformer_engine_torch"] = tefl_module
+    except Exception as e:
+        import traceback
+        print(f"[TEFL Setup] Warning: Could not register transformer_engine_torch: {e}")
+        traceback.print_exc()
+
+        # Create a minimal placeholder module to avoid import errors
+        # This allows the system to at least import without crashing
+        import types
+        placeholder = types.ModuleType("transformer_engine_torch")
+        placeholder.__doc__ = "Placeholder module - TEFL backend not available"
+        sys.modules["transformer_engine_torch"] = placeholder
diff --git a/transformer_engine/plugin/core/backends/__init__.py b/transformer_engine/plugin/core/backends/__init__.py
new file mode 100644
index 0000000000..88988bab64
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/flagos/__init__.py b/transformer_engine/plugin/core/backends/flagos/__init__.py
new file mode 100644
index 0000000000..86126aa3e0
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .flagos import FlagOSBackend
+
+__all__ = ["FlagOSBackend"]
diff --git a/transformer_engine/plugins/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
similarity index 71%
rename from transformer_engine/plugins/attention/dot_product_attention/backends.py
rename to transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index 8c7ae47864..699767b7be 100644
--- a/transformer_engine/plugins/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -2,7 +2,6 @@
 #
 # See LICENSE for license information.
 
-"""Attention Backends."""
 from contextlib import nullcontext
 import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -30,20 +29,14 @@
 from transformer_engine.pytorch.jit import no_torch_dynamo
 from transformer_engine.pytorch.attention.inference import InferenceParams
 
-# Import attention utils
 import transformer_engine.pytorch.attention.dot_product_attention.utils as dpa_utils
 
-from ...import_utils import have_flag_gems
-
-HAVE_FLAG_GEMS = have_flag_gems()
-
-if HAVE_FLAG_GEMS:
-    import flag_gems
+from transformer_engine.plugin.core.ops import FlashAttentionBase
+from transformer_engine.plugin.core.logger_manager import print_once
 
+import flag_gems
 
 class AttnFuncFL(torch.autograd.Function):
-    """FusedAttention forward and backward implementation"""
-
     @staticmethod
     def forward(
         ctx,
@@ -66,47 +59,44 @@ def forward(
         deterministic,
         layer_number,
     ):
-        # pylint: disable=missing-function-docstring
-        # add NVTX range
         nvtx_label = "transformer_engine.AttnFuncFL.forward"
         nvtx_range_push(f"{nvtx_label}")
 
-        # input types are inferred from the real data while output types are controlled by fp8_output
-        # fp8_output should be set upstream as (DPA.fp8 and DPA.fp8_meta["recipe"].fp8_mha)
         assert isinstance(k, q.__class__) and isinstance(
             v, q.__class__
         ), "q, k, v must be of the same class, e.g. torch.Tensor or Float8Tensor."
 
-        # get nominal data type for out
-        # FP16/BF16 attention: torch.float16 or torch.bfloat16
-        # FP8 attention:       torch.float16 or torch.bfloat16
         out_nominal_dtype = q.dtype
 
         max_logit = None
 
         is_causal = attn_mask_type == 'causal'
-        q_permuted = q.permute(1, 2, 0, 3) #[s, b, n_h, h] -> [b, n_h, s, h]
-        k_permuted = k.permute(1, 2, 0, 3)
-        v_permuted = v.permute(1, 2, 0, 3)
-        (out_permuted, m) = flag_gems.scaled_dot_product_attention_forward(
-            q_permuted,
-            k_permuted,
-            v_permuted,
-            attn_mask=None,
-            dropout_p=dropout_p,
-            is_causal=is_causal,
-            scale=attn_scale,
-            enable_gqa=True,
-        )
-        out = out_permuted.permute(2, 0, 1, 3) # [b, n_h, s, h] -> [s, b, n_h, h]
+
+        with flag_gems.use_gems():
+            # FlagGems requires contiguous tensors, so we must call contiguous() after permute
+            q_permuted = q.permute(1, 2, 0, 3).contiguous()
+            k_permuted = k.permute(1, 2, 0, 3).contiguous()
+            v_permuted = v.permute(1, 2, 0, 3).contiguous()
+
+            (out_permuted, m) = flag_gems.scaled_dot_product_attention_forward(
+                q_permuted,
+                k_permuted,
+                v_permuted,
+                attn_mask=None,
+                dropout_p=dropout_p,
+                is_causal=is_causal,
+                scale=attn_scale,
+                enable_gqa=True,
+            )
+
+            # Must be contiguous for .view() in FlashAttentionFL.forward
+            out = out_permuted.permute(2, 0, 1, 3).contiguous()
         aux_ctx_tensors = [out_permuted, m]
         out_ret = out
         qkvo_tensors = (q_permuted, k_permuted, v_permuted, out_permuted)
 
         nvtx_range_pop(f"{nvtx_label}")
 
-        # assume fwd and bwd always use the same high precision, i.e. torch.float16 or torch.bfloat16
-        # used when some tensors are base tensors and loose the "dtype" attribute
         ctx.nominal_dtype = out_nominal_dtype
 
         from transformer_engine.pytorch.cpu_offload import (
@@ -146,10 +136,6 @@ def forward(
 
     @staticmethod
     def backward(ctx, d_out, *_args):
-        # pylint: disable=missing-function-docstring
-
-        # d_out is expected to be in FP8 if is_output_fp8=True,
-        # but in the case it's not, convert it to FP8 before any operation
         d_out = d_out.contiguous()
         (
             q_permuted,
@@ -171,31 +157,38 @@ def backward(ctx, d_out, *_args):
         rest = [None]
 
         with torch.cuda.nvtx.range("AttnFuncFL.backward"):
-            # get nominal data type of dq, dk, dv
-            # FP16/BF16 attention: torch.float16 or torch.bfloat16
-            # FP8 attention:       torch.float16 or torch.bfloat16
             dqkv_nominal_dtype = ctx.nominal_dtype
 
             dqkv_te_dtype = TE_DType[d_out.dtype]
 
-            q_permuted, k_permuted, v_permuted, m = map(lambda x: x.contiguous() if not x.is_contiguous() else x, (q_permuted, k_permuted, v_permuted, m))
-            d_out_permuted = d_out.permute(1, 2, 0, 3).contiguous() # [s, b, n_h, h] -> [b, n_h, s, h]
-            dq_permuted, dk_permuted, dv_permuted = flag_gems.scaled_dot_product_attention_backward(
-                d_out_permuted,
-                q_permuted,
-                k_permuted,
-                v_permuted,
-                out_permuted,
-                m,
-                attn_mask=None,
-                dropout_p=ctx.dropout_p,
-                is_causal=ctx.is_causal,
-                scale=ctx.attn_scale,
-                enable_gqa=True,
-            )
-            dq = dq_permuted.permute(2, 0, 1, 3)
-            dk = dk_permuted.permute(2, 0, 1, 3)
-            dv = dv_permuted.permute(2, 0, 1, 3)
+            with flag_gems.use_gems():
+                # Ensure all tensors are contiguous for FlagGems backward
+                q_permuted = q_permuted.contiguous() if not q_permuted.is_contiguous() else q_permuted
+                k_permuted = k_permuted.contiguous() if not k_permuted.is_contiguous() else k_permuted
+                v_permuted = v_permuted.contiguous() if not v_permuted.is_contiguous() else v_permuted
+                out_permuted = out_permuted.contiguous() if not out_permuted.is_contiguous() else out_permuted
+                m = m.contiguous() if not m.is_contiguous() else m
+
+                # d_out is (seq, batch, heads, dim) from autograd, permute to (batch, heads, seq, dim)
+                d_out_permuted = d_out.permute(1, 2, 0, 3).contiguous()
+
+                dq_permuted, dk_permuted, dv_permuted = flag_gems.scaled_dot_product_attention_backward(
+                    d_out_permuted,
+                    q_permuted,
+                    k_permuted,
+                    v_permuted,
+                    out_permuted,
+                    m,
+                    attn_mask=None,
+                    dropout_p=ctx.dropout_p,
+                    is_causal=ctx.is_causal,
+                    scale=ctx.attn_scale,
+                    enable_gqa=True,
+                )
+
+                dq = dq_permuted.permute(2, 0, 1, 3)
+                dk = dk_permuted.permute(2, 0, 1, 3)
+                dv = dv_permuted.permute(2, 0, 1, 3)
             rest = None
 
         return (
@@ -220,39 +213,30 @@ def backward(ctx, d_out, *_args):
         )
 
 
-class FlashAttentionFL(torch.nn.Module):
-    """Dot product attention
-    """
-
+class FlashAttentionFL(FlashAttentionBase):
     def __init__(
         self,
         softmax_scale: float,
         attention_dropout: float = 0.0,
-        attention_dropout_ctx: Optional[Callable] = nullcontext,
+        attention_dropout_ctx: Optional[Callable] = None,
         attention_type: str = "self",
         layer_number: Optional[int] = None,
         deterministic: bool = False,
     ) -> None:
-        super().__init__()
+        super().__init__(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
 
-        self.softmax_scale = softmax_scale
-        self.attention_dropout = attention_dropout
-        self.attention_dropout_ctx = attention_dropout_ctx
-        self.attention_type = attention_type
         self.use_FAv2_bwd = os.getenv(
             "NVTE_FUSED_ATTN_USE_FAv2_BWD", "0"
         ) == "1" and get_device_compute_capability() == (9, 0)
-        self.layer_number = 1 if layer_number is None else layer_number
-        self.deterministic = deterministic
-
-        def remove_extra_states_check(self, incompatible_keys):  # pylint: disable=unused-argument
-            """
-            Temporarily remove fused_attention._extra_state as a missing key
-            or an unexpected key when loading Transformer Engine checkpoints.
-            Please store FP8 metadata as DotProductAttention's _extra_state,
-            rather than FusedAttention's _extra_state. This hook will be
-            phased out in Transformer Engine 2.0.
-            """
+
+        def remove_extra_states_check(self, incompatible_keys):
             for key in incompatible_keys.missing_keys:
                 if "fused_attention._extra_state" in key:
                     incompatible_keys.missing_keys.remove(key)
@@ -266,6 +250,10 @@ def remove_extra_states_check(self, incompatible_keys):  # pylint: disable=unuse
 
         self.register_load_state_dict_post_hook(remove_extra_states_check)
 
+    @property
+    def backend_name(self) -> str:
+        return "flagos"
+
     @no_torch_dynamo()
     def forward(
         self,
@@ -292,11 +280,6 @@ def forward(
         flash_attention_backend: Optional[PkgVersion] = PkgVersion("0"),
         fp8_output: bool = False,
     ) -> torch.Tensor:
-        assert HAVE_FLAG_GEMS, "FlagGems is not installed"
-        assert window_size == (-1, 0), "Triton-Based FlashAttention do not support sliding windows now"
-        assert not fp8, "Triton-Based FlashAttention do not support fp8 now"
-        assert attn_mask_type == "causal", "Triton-Based FlashAttention do not support padding mask now"
-
         assert all(
             x.dtype in [torch.float16, torch.bfloat16] or isinstance(x, Float8Tensor)
             for x in [query_layer, key_layer, value_layer]
@@ -317,18 +300,14 @@ def forward(
         context_parallel = cp_size > 1
         assert not context_parallel, "FLAttention do not support context parallel now"
 
-        # get q_format and kv_format for training and inference
         qkv_format, q_format, kv_format = dpa_utils.get_qkv_format(qkv_layout, inference_params)
 
-        # cuDNN can work with 0-length sequences in the batch for both bshd/sbhd and thd formats
-        # however, for bshd/sbhd, q/k/v tensors need to have the same batch size as indicated by
-        # cu_seqlens, whereas thd does not have this requirement
-        # e.g. if q_format = bshd, and q.shape = [3, 1, 16, 64], we should have k.shape[0] =
-        # v.shape[0] = q.shape[0], and cu_seqlens_q.shape = cu_seqlens_kv.shape = [4]
         if q_format in ["bshd", "sbhd"] or kv_format in ["bshd", "sbhd"]:
             batch_size = query_layer.shape[0] if q_format == "bshd" else query_layer.shape[1]
-            cu_seqlens_q = cu_seqlens_q[: batch_size + 1]
-            cu_seqlens_kv = cu_seqlens_kv[: batch_size + 1]
+            if cu_seqlens_q is not None:
+                cu_seqlens_q = cu_seqlens_q[: batch_size + 1]
+            if cu_seqlens_kv is not None:
+                cu_seqlens_kv = cu_seqlens_kv[: batch_size + 1]
 
         page_table = None
         if inference_params is None:
@@ -399,10 +378,9 @@ def forward(
                 qkv_layout,
                 attn_mask_type,
                 window_size,
-                None,  # rng_gen
+                None,
                 self.deterministic,
                 self.layer_number,
             )
 
-        # ...hd -> ...(hd)
         return output.view(*output.shape[:-2], -1)
diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
new file mode 100644
index 0000000000..f206d7d7f6
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+
+from ...ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
+
+from .impl import (
+    rmsnorm_fwd_fl, rmsnorm_bwd_fl,
+    multi_tensor_scale_fl, multi_tensor_adam_fl,
+    multi_tensor_l2_norm_fl,
+    generic_gemm_fl
+)
+
+def _check_flagos_available() -> bool:
+    return True
+
+
+class FlagOSBackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return _check_flagos_available()
+
+    def is_available(self) -> bool:
+        return _check_flagos_available()
+
+    def get_flash_attention_class(self):
+        from .attention.dot_product_attention.backends import FlashAttentionFL
+        return FlashAttentionFL
+
+    def generic_gemm(
+        self,
+        A: torch.Tensor,
+        transA: bool,
+        B: torch.Tensor,
+        transB: bool,
+        D: torch.Tensor,
+        quantizer: Any,
+        output_dtype: torch.dtype,
+        bias: Optional[torch.Tensor],
+        bias_type: Any,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[Any] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> Any:
+        return generic_gemm_fl(
+            A, transA, B, transB, D, quantizer, output_dtype,
+            bias, bias_type, gelu, gelu_in, grad,
+            workspace, workspace_size, accumulate, use_split_accumulator,
+            comm_overlap=comm_overlap, comm_type=comm_type,
+            extra_output=extra_output, bulk_overlap=bulk_overlap,
+            alpha=alpha, beta=beta
+        )
+
+    def rmsnorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+        return rmsnorm_fwd_fl(
+            input=input, weight=weight, eps=eps, ln_out=ln_out,
+            quantizer=quantizer, odtype=otype,
+            sm_margin=sm_margin, zero_centered_gamma=zero_centered_gamma,
+        )
+
+    def rmsnorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+        eps: float = 1e-5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return rmsnorm_bwd_fl(
+            dy=dy, x=x, rsigma=rsigma, gamma=gamma,
+            sm_margin=sm_margin, zero_centered_gamma=zero_centered_gamma, eps=eps,
+        )
+
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
+        return multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale)
+
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        result, _ = multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor)
+        return result
+
+    def multi_tensor_adam(
+        self,
+        chunk_size: int = None,
+        noop_flag: torch.Tensor = None,
+        tensor_lists: List[List[torch.Tensor]] = None,
+        lr: float = None,
+        beta1: float = None,
+        beta2: float = None,
+        eps: float = None,
+        step: int = None,
+        mode: int = None,
+        bias_correction: int = None,
+        weight_decay: float = None,
+    ):
+        if chunk_size is None:
+            return multi_tensor_adam_fl
+        return multi_tensor_adam_fl(
+            chunk_size=chunk_size, noop_flag=noop_flag, tensor_lists=tensor_lists,
+            lr=lr, beta1=beta1, beta2=beta2, eps=eps,
+            step=step, mode=mode, bias_correction=bias_correction, weight_decay=weight_decay,
+        )
+
+    def get_cublasLt_version(self) -> int:
+        return 110000
+
+    def get_cudnn_version(self) -> int:
+        return 90000
+
+    def get_num_cublas_streams(self) -> int:
+        return 0
+
+    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        return NVTE_Fused_Attn_Backend.NVTE_No_Backend
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        return FP8TensorMeta()
+
diff --git a/transformer_engine/plugins/cpp_extensions/__init__.py b/transformer_engine/plugin/core/backends/flagos/impl/__init__.py
similarity index 68%
rename from transformer_engine/plugins/cpp_extensions/__init__.py
rename to transformer_engine/plugin/core/backends/flagos/impl/__init__.py
index 286672141c..f17b38c9e6 100644
--- a/transformer_engine/plugins/cpp_extensions/__init__.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/__init__.py
@@ -2,8 +2,7 @@
 #
 # See LICENSE for license information.
 
-"""Python interface for c++ extensions"""
 from .gemm import *
 from .rmsnorm import *
 from .fused_adam import *
-from .multi_tensor_apply import *
+from .multi_tensor import *
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
new file mode 100644
index 0000000000..1edd361f95
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Optional, List
+import torch
+import flag_gems
+
+
+def multi_tensor_adam_fl(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    lr: float,
+    beta1: float,
+    beta2: float,
+    eps: float,
+    step: int,
+    mode: int,
+    bias_correction: int,
+    weight_decay: float,
+    inv_scale: Optional[float] = 1.0,
+    out_dtype: Optional[torch.dtype] = None,
+) -> None:
+    with flag_gems.use_gems():
+        num_lists = len(tensor_lists)
+        assert num_lists in [4, 5], f"Expected 4 or 5 tensor lists, got {num_lists}"
+
+        num_tensors = len(tensor_lists[0])
+        assert num_tensors > 0, "No tensors provided"
+
+        for i, lst in enumerate(tensor_lists):
+            assert len(lst) == num_tensors, f"List {i} has {len(lst)} tensors, expected {num_tensors}"
+
+        bias_correction1 = 1.0
+        bias_correction2 = 1.0
+        if bias_correction == 1:
+            bias_correction1 = 1 - beta1 ** step
+            bias_correction2 = 1 - beta2 ** step
+
+        is_adamw = (mode == 1)
+
+        for i in range(num_tensors):
+            g = tensor_lists[0][i]
+            p = tensor_lists[1][i]
+            m = tensor_lists[2][i]
+            v = tensor_lists[3][i]
+            p_master = tensor_lists[4][i] if num_lists == 5 else None
+
+            if not g.is_contiguous():
+                g = g.contiguous()
+
+            if inv_scale is not None and inv_scale != 1.0:
+                g = g * inv_scale
+
+            m.mul_(beta1).add_(g, alpha=1 - beta1)
+            v.mul_(beta2).add_(g.mul(g).mul_(1 - beta2))
+
+            m_corr = m.clone()
+            v_corr = v.clone()
+            if bias_correction == 1:
+                m_corr = m_corr / bias_correction1
+                v_corr = v_corr / bias_correction2
+
+            update = m_corr / (v_corr.sqrt() + eps)
+
+            if is_adamw:
+                p.data.mul_(1 - lr * weight_decay)
+            else:
+                update.add_(p, alpha=weight_decay)
+
+            p.data.add_(update, alpha=-lr)
+
+            if p_master is not None:
+                p_master.data.copy_(p.data)
+                out_dtype = p_master.dtype if out_dtype is None else out_dtype
+                p.data = p.data.to(out_dtype)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
new file mode 100644
index 0000000000..a52af3d4c2
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+
+import flag_gems
+
+__all__ = [
+    "generic_gemm_fl",
+]
+
+_DTYPE_TO_TORCH = {
+    0: torch.uint8,
+    2: torch.int32,
+    4: torch.float32,
+    5: torch.float16,
+    6: torch.bfloat16,
+    7: torch.float8_e4m3fn,
+    8: torch.float8_e5m2,
+}
+
+def validate_gemm_scale(scale: Optional[float], required: bool) -> float:
+    if required:
+        return scale if scale is not None else 1.0
+    if scale not in (0.0, None):
+        raise ValueError("scale must be zero")
+    return 0.0
+
+def _convert_dtype(dtype: Union[int, torch.dtype, None]) -> Optional[torch.dtype]:
+    if dtype is None:
+        return None
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if isinstance(dtype, int):
+        return _DTYPE_TO_TORCH.get(dtype, None)
+    if hasattr(dtype, 'value'):
+        return _DTYPE_TO_TORCH.get(dtype.value, None)
+    return None
+
+def generic_gemm_fl(
+    A: torch.Tensor,
+    transA: bool,
+    B: torch.Tensor,
+    transB: bool,
+    D: Optional[torch.Tensor],
+    quantizer: Any,
+    output_dtype: Any,
+    bias: Optional[torch.Tensor],
+    bias_type: Any,
+    gelu: bool,
+    gelu_in: Optional[torch.Tensor],
+    grad: bool,
+    workspace: torch.Tensor,
+    workspace_size: int,
+    accumulate: bool,
+    use_split_accumulator: bool,
+    comm_overlap: Optional[Any] = None,
+    comm_type: Optional[Any] = None,
+    extra_output: Optional[torch.Tensor] = None,
+    bulk_overlap: bool = False,
+    alpha: float = 1.0,
+    beta: Optional[float] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
+    with flag_gems.use_gems():
+        assert not gelu and gelu_in is None, "Triton-Based General Gemm do not support gelu now"
+        assert quantizer is None, "Triton-Based General Gemm do not support quantization now"
+        assert bias is None, "Triton-Based General Gemm do not support bias now"
+
+        alpha = validate_gemm_scale(alpha, True)
+        beta = validate_gemm_scale(beta, accumulate)
+
+        s = -1
+        b = -1
+        orig_A_shape = A.shape
+        orig_B_shape = B.shape
+        shape_a_changed = False
+        shape_b_changed = False
+
+        if A.ndim == 3:
+            A = A.view(-1, A.shape[-1])
+            shape_a_changed = True
+
+        if B.ndim == 3:
+            s, b, _ = B.shape
+            B = B.view(-1, B.shape[-1])
+            shape_b_changed = True
+
+        A_comp = A.T if transA else A
+        B_comp = B.T if transB else B
+
+        out1 = flag_gems.mm(B_comp, A_comp)
+
+        if shape_b_changed:
+            out1 = out1.view(s, b, -1)
+
+        torch_out_dtype = _convert_dtype(output_dtype)
+        if torch_out_dtype is not None and out1.dtype != torch_out_dtype:
+            out1 = out1.to(torch_out_dtype)
+
+        bias_grad = None
+        gelu_input = None
+        extra_output_ret = None
+
+        if D is not None:
+            if accumulate:
+                D.add_(out1)
+            else:
+                D.copy_(out1)
+            return D, bias_grad, gelu_input, extra_output_ret
+        else:
+            return out1, bias_grad, gelu_input, extra_output_ret
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
new file mode 100644
index 0000000000..9d3e6959b6
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import torch
+from torch.distributed._tensor import DTensor
+import flag_gems
+
+
+def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *args):
+    with flag_gems.use_gems():
+        tensors = tensor_lists[0]
+
+        if per_tensor:
+            norms = [torch.norm(t.float(), p=2) for t in tensors]
+            return norms, None
+        else:
+            total_norm_sq = sum(torch.sum(t.float() ** 2) for t in tensors)
+            total_norm = torch.sqrt(total_norm_sq)
+            return total_norm, None
+
+
+def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
+    with flag_gems.use_gems():
+        for src, dst in zip(tensor_lists[0], tensor_lists[1]):
+            dst.copy_(src * scale)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
new file mode 100644
index 0000000000..ddf70f2c70
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import torch
+import flag_gems
+
+
+def rmsnorm_fwd_fl(
+    input,
+    weight,
+    eps,
+    ln_out,
+    quantizer,
+    odtype,
+    sm_margin,
+    zero_centered_gamma,
+):
+    with flag_gems.use_gems():
+        if zero_centered_gamma:
+            weight_adj = 1 + weight
+        else:
+            weight_adj = weight
+
+        y, rstdevs = flag_gems.rms_norm_forward(
+            input,
+            [input.shape[-1]],
+            weight_adj,
+            eps,
+        )
+
+        if rstdevs.shape != input.shape[:-1]:
+            rstdevs = rstdevs.view(input.shape[:-1])
+
+        return y, None, rstdevs
+
+
+def rmsnorm_bwd_fl(
+    dy,
+    x,
+    rsigma,
+    gamma,
+    sm_margin,
+    zero_centered_gamma,
+    eps,
+):
+    with flag_gems.use_gems():
+        # When zero_centered_gamma is True, forward uses (1 + gamma) as weight
+        # So backward needs to use (1 + gamma) for computing dx
+        if zero_centered_gamma:
+            gamma_adj = 1 + gamma
+        else:
+            gamma_adj = gamma
+
+        dx, dw = flag_gems.rms_norm_backward(
+            dy,
+            x,
+            rsigma,
+            [x.shape[-1]],
+            gamma_adj,
+            eps,
+        )
+        return dx, dw
diff --git a/transformer_engine/plugin/core/backends/flagos/register_ops.py b/transformer_engine/plugin/core/backends/flagos/register_ops.py
new file mode 100644
index 0000000000..5e2242f70a
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/register_ops.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+FlagOS backend operator registrations.
+
+This module registers all DEFAULT (FlagOS) implementations.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from ...types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all FlagOS (DEFAULT) operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    from .flagos import FlagOSBackend
+
+    # Create a backend instance to access the methods
+    backend = FlagOSBackend()
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        OpImpl(op_name="rmsnorm_fwd", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor=None, priority=150),
+        OpImpl(op_name="rmsnorm_bwd", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor=None, priority=150),
+        OpImpl(op_name="generic_gemm", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor=None, priority=150),
+        OpImpl(op_name="multi_tensor_scale", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor=None, priority=150),
+        OpImpl(op_name="multi_tensor_adam", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor=None, priority=150),
+        OpImpl(op_name="multi_tensor_l2norm", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor=None, priority=150),
+
+        # FlashAttention class getter
+        OpImpl(op_name="get_flash_attention_class", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor=None, priority=150),
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/reference/__init__.py b/transformer_engine/plugin/core/backends/reference/__init__.py
new file mode 100644
index 0000000000..08844be51b
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .reference import ReferenceBackend
+
+__all__ = ["ReferenceBackend"]
diff --git a/transformer_engine/plugin/core/backends/reference/flash_attention.py b/transformer_engine/plugin/core/backends/reference/flash_attention.py
new file mode 100644
index 0000000000..02aa0754fb
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/flash_attention.py
@@ -0,0 +1,353 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from contextlib import nullcontext
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+
+from transformer_engine.plugin.core.ops import FlashAttentionBase
+
+
+class FlashAttentionTorch(FlashAttentionBase):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
+
+    @property
+    def backend_name(self) -> str:
+        return "torch_sdpa"
+
+    def _convert_layout_to_bhsd(
+        self,
+        tensor: torch.Tensor,
+        layout: str,
+    ) -> torch.Tensor:
+        """Convert tensor from various layouts to [batch, heads, seq, dim] format."""
+        layout = layout.lower()
+
+        if layout in ("sbhd", "sbh3d", "sb3hd"):
+            return tensor.permute(1, 2, 0, 3)
+        elif layout in ("bshd", "bsh3d", "bs3hd"):
+            return tensor.permute(0, 2, 1, 3)
+        elif layout == "bhsd":
+            return tensor
+        else:
+            raise ValueError(f"Unsupported qkv_layout: {layout}")
+
+    def _convert_bhsd_to_layout(
+        self,
+        tensor: torch.Tensor,
+        layout: str,
+    ) -> torch.Tensor:
+        """Convert tensor from [batch, heads, seq, dim] back to original layout."""
+        layout = layout.lower()
+
+        if layout in ("sbhd", "sbh3d", "sb3hd"):
+            return tensor.permute(2, 0, 1, 3)
+        elif layout in ("bshd", "bsh3d", "bs3hd"):
+            return tensor.permute(0, 2, 1, 3)
+        elif layout == "bhsd":
+            return tensor
+        else:
+            raise ValueError(f"Unsupported qkv_layout: {layout}")
+
+    def _create_sliding_window_mask(
+        self,
+        seq_len_q: int,
+        seq_len_kv: int,
+        window_size: Tuple[int, int],
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        """Create a sliding window attention mask."""
+        left_window, right_window = window_size
+
+        if left_window == -1 and right_window == -1:
+            return torch.zeros(seq_len_q, seq_len_kv, dtype=dtype, device=device)
+
+        q_idx = torch.arange(seq_len_q, device=device).unsqueeze(1)
+        kv_idx = torch.arange(seq_len_kv, device=device).unsqueeze(0)
+
+        mask_bool = torch.zeros(seq_len_q, seq_len_kv, dtype=torch.bool, device=device)
+
+        if left_window >= 0:
+            mask_bool = mask_bool | (kv_idx < q_idx - left_window)
+
+        if right_window >= 0:
+            mask_bool = mask_bool | (kv_idx > q_idx + right_window)
+
+        mask = torch.zeros(seq_len_q, seq_len_kv, dtype=dtype, device=device)
+        mask.masked_fill_(mask_bool, float('-inf'))
+
+        return mask
+
+    def _unpack_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Convert packed tensor to padded tensor format."""
+        batch_size = cu_seqlens.shape[0] - 1
+        device = tensor.device
+        original_shape = tensor.shape
+
+        if tensor.dim() == 4:
+            if tensor.shape[1] == 1:
+                tensor = tensor.squeeze(1)
+            else:
+                raise ValueError(
+                    f"Unexpected 4D tensor shape {original_shape}. "
+                    f"Expected [total_tokens, 1, num_heads, head_dim]"
+                )
+
+        if tensor.dim() != 3:
+            raise ValueError(
+                f"Expected tensor to be 3D or 4D after processing, got shape {original_shape}"
+            )
+
+        total_tokens, num_heads, head_dim = tensor.shape
+
+        expected_total = cu_seqlens[-1].item()
+        if total_tokens != expected_total:
+            raise ValueError(
+                f"Tensor has {total_tokens} tokens but cu_seqlens indicates {expected_total} tokens"
+            )
+
+        padded_tensor = torch.zeros(
+            batch_size, num_heads, max_seqlen, head_dim,
+            dtype=tensor.dtype, device=device
+        )
+
+        padding_mask = torch.ones(batch_size, max_seqlen, dtype=torch.bool, device=device)
+
+        for i in range(batch_size):
+            start = cu_seqlens[i].item()
+            end = cu_seqlens[i + 1].item()
+            seq_len = end - start
+
+            seq_data = tensor[start:end].permute(1, 0, 2)
+            padded_tensor[i, :, :seq_len, :] = seq_data
+            padding_mask[i, :seq_len] = False
+
+        return padded_tensor, padding_mask
+
+    def _pack_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        """Convert padded tensor back to packed tensor format."""
+        batch_size = tensor.shape[0]
+        num_heads = tensor.shape[1]
+        head_dim = tensor.shape[3]
+        total_tokens = cu_seqlens[-1].item()
+        device = tensor.device
+
+        packed_tensor = torch.zeros(
+            total_tokens, num_heads, head_dim,
+            dtype=tensor.dtype, device=device
+        )
+
+        for i in range(batch_size):
+            start = cu_seqlens[i].item()
+            end = cu_seqlens[i + 1].item()
+            seq_len = end - start
+
+            seq_data = tensor[i, :, :seq_len, :].permute(1, 0, 2)
+            packed_tensor[start:end, :, :] = seq_data
+
+        return packed_tensor
+
+    def forward(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        """Flash Attention implementation using PyTorch's scaled_dot_product_attention."""
+        if fp8:
+            raise NotImplementedError("FP8 is not supported in PyTorch SDPA backend")
+        if cp_group is not None:
+            raise NotImplementedError("Context parallelism is not supported in PyTorch SDPA backend")
+        if alibi_slopes is not None:
+            raise NotImplementedError("ALiBi slopes are not supported in PyTorch SDPA backend")
+
+        use_packed_format = cu_seqlens_q is not None or cu_seqlens_kv is not None
+        padding_mask_q = None
+        padding_mask_kv = None
+        query_original_shape = query_layer.shape
+
+        if use_packed_format:
+            if cu_seqlens_q is not None:
+                query, padding_mask_q = self._unpack_tensor(query_layer, cu_seqlens_q, max_seqlen_q)
+            else:
+                query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+
+            if cu_seqlens_kv is not None:
+                key, padding_mask_kv = self._unpack_tensor(key_layer, cu_seqlens_kv, max_seqlen_kv)
+                value, _ = self._unpack_tensor(value_layer, cu_seqlens_kv, max_seqlen_kv)
+            else:
+                key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
+                value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
+        else:
+            query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+            key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
+            value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
+
+        batch_size, num_heads_q, seq_len_q, head_dim = query.shape
+        num_heads_kv = key.shape[1]
+        seq_len_kv = key.shape[2]
+
+        if num_heads_q != num_heads_kv:
+            num_groups = num_heads_q // num_heads_kv
+            if num_heads_q % num_heads_kv != 0:
+                raise ValueError(
+                    f"num_heads_q ({num_heads_q}) must be divisible by num_heads_kv ({num_heads_kv})"
+                )
+            key = key.repeat_interleave(num_groups, dim=1)
+            value = value.repeat_interleave(num_groups, dim=1)
+
+        attn_mask = None
+        is_causal = False
+
+        if use_packed_format and padding_mask_kv is not None:
+            attn_mask = torch.zeros(
+                batch_size, seq_len_q, seq_len_kv,
+                dtype=query.dtype, device=query.device
+            )
+            padding_broadcast = padding_mask_kv.unsqueeze(1)
+            attn_mask.masked_fill_(padding_broadcast, float('-inf'))
+
+        if attn_mask_type == "causal":
+            if window_size is None and not use_packed_format:
+                is_causal = True
+            else:
+                causal_mask = torch.zeros(
+                    seq_len_q, seq_len_kv,
+                    dtype=query.dtype, device=query.device
+                )
+                causal_mask.masked_fill_(
+                    torch.triu(torch.ones(seq_len_q, seq_len_kv, device=query.device, dtype=torch.bool), diagonal=1),
+                    float('-inf')
+                )
+
+                if attn_mask is not None:
+                    if attn_mask.dim() == 2:
+                        attn_mask = attn_mask + causal_mask
+                    else:
+                        attn_mask = attn_mask + causal_mask.unsqueeze(0)
+                else:
+                    attn_mask = causal_mask
+
+        if window_size is not None and not is_causal:
+            window_mask = self._create_sliding_window_mask(
+                seq_len_q=seq_len_q,
+                seq_len_kv=seq_len_kv,
+                window_size=window_size,
+                device=query.device,
+                dtype=query.dtype,
+            )
+
+            if attn_mask is not None:
+                attn_mask = attn_mask + window_mask.unsqueeze(0)
+            else:
+                attn_mask = window_mask
+
+        if attention_mask is not None and attn_mask_type != "causal":
+            if isinstance(attention_mask, tuple):
+                explicit_mask = attention_mask[0]
+            else:
+                explicit_mask = attention_mask
+
+            if explicit_mask.dtype == torch.bool:
+                float_mask = torch.zeros_like(explicit_mask, dtype=query.dtype)
+                float_mask.masked_fill_(~explicit_mask, float('-inf'))
+                explicit_mask = float_mask
+
+            if explicit_mask.dim() == 2:
+                explicit_mask = explicit_mask.unsqueeze(0).unsqueeze(0)
+            elif explicit_mask.dim() == 3:
+                explicit_mask = explicit_mask.unsqueeze(1)
+
+            if attn_mask is not None:
+                if attn_mask.dim() == 2:
+                    attn_mask = attn_mask.unsqueeze(0).unsqueeze(0)
+                elif attn_mask.dim() == 3:
+                    attn_mask = attn_mask.unsqueeze(1)
+                attn_mask = attn_mask + explicit_mask
+            else:
+                attn_mask = explicit_mask
+        elif attn_mask is not None:
+            if attn_mask.dim() == 2:
+                attn_mask = attn_mask.unsqueeze(0).unsqueeze(0)
+            elif attn_mask.dim() == 3:
+                attn_mask = attn_mask.unsqueeze(1)
+
+        with self.attention_dropout_ctx():
+            dropout_p = self.attention_dropout if self.training else 0.0
+
+            output = F.scaled_dot_product_attention(
+                query=query,
+                key=key,
+                value=value,
+                attn_mask=attn_mask,
+                dropout_p=dropout_p,
+                is_causal=is_causal,
+                scale=self.softmax_scale,
+            )
+
+        if use_packed_format and padding_mask_q is not None:
+            mask_expanded = padding_mask_q.unsqueeze(1).unsqueeze(3)
+            output = output.masked_fill(mask_expanded, 0.0)
+
+        if use_packed_format and cu_seqlens_q is not None:
+            output = self._pack_tensor(output, cu_seqlens_q)
+
+            if len(query_original_shape) == 4:
+                total_tokens = output.shape[0]
+                hidden_size = output.shape[1] * output.shape[2]
+                output = output.contiguous().view(total_tokens, 1, hidden_size)
+        else:
+            output = self._convert_bhsd_to_layout(output, qkv_layout)
+            # Flatten the last two dimensions (heads, dim) -> (heads * dim)
+            # to match the output format of other backends
+            output = output.contiguous().view(*output.shape[:-2], -1)
+
+        return output
diff --git a/transformer_engine/plugin/core/backends/reference/impl/__init__.py b/transformer_engine/plugin/core/backends/reference/impl/__init__.py
new file mode 100644
index 0000000000..6eb29b6f90
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/__init__.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .gemm import general_gemm_torch
+
+from .rmsnorm import rmsnorm_fwd_torch, rmsnorm_bwd_torch
+from .normalization import layernorm_fwd_torch, layernorm_bwd_torch
+
+from .activation import (
+    gelu_torch, geglu_torch, qgelu_torch, qgeglu_torch,
+    relu_torch, reglu_torch, srelu_torch, sreglu_torch,
+    silu_torch, swiglu_torch, clamped_swiglu_torch,
+    dgelu_torch, dgeglu_torch, dqgelu_torch, dqgeglu_torch,
+    drelu_torch, dreglu_torch, dsrelu_torch, dsreglu_torch,
+    dsilu_torch, dswiglu_torch, clamped_dswiglu_torch,
+    dbias_dgelu_torch, dbias_dsilu_torch, dbias_drelu_torch,
+    dbias_dqgelu_torch, dbias_dsrelu_torch,
+)
+
+from .softmax import (
+    scaled_softmax_forward_torch,
+    scaled_softmax_backward_torch,
+    scaled_masked_softmax_forward_torch,
+    scaled_masked_softmax_backward_torch,
+    scaled_upper_triang_masked_softmax_forward_torch,
+    scaled_upper_triang_masked_softmax_backward_torch,
+    scaled_aligned_causal_masked_softmax_forward_torch,
+    scaled_aligned_causal_masked_softmax_backward_torch,
+)
+
+from .dropout import dropout_fwd_torch, dropout_bwd_torch
+
+from .optimizer import (
+    multi_tensor_scale_torch,
+    multi_tensor_l2norm_torch,
+    multi_tensor_adam_torch,
+    multi_tensor_sgd_torch,
+    multi_tensor_compute_scale_and_scale_inv_torch,
+)
+
+__all__ = [
+    "general_gemm_torch",
+    "rmsnorm_fwd_torch",
+    "rmsnorm_bwd_torch",
+    "layernorm_fwd_torch",
+    "layernorm_bwd_torch",
+    "gelu_torch",
+    "geglu_torch",
+    "qgelu_torch",
+    "qgeglu_torch",
+    "relu_torch",
+    "reglu_torch",
+    "srelu_torch",
+    "sreglu_torch",
+    "silu_torch",
+    "swiglu_torch",
+    "clamped_swiglu_torch",
+    "dgelu_torch",
+    "dgeglu_torch",
+    "dqgelu_torch",
+    "dqgeglu_torch",
+    "drelu_torch",
+    "dreglu_torch",
+    "dsrelu_torch",
+    "dsreglu_torch",
+    "dsilu_torch",
+    "dswiglu_torch",
+    "clamped_dswiglu_torch",
+    "dbias_dgelu_torch",
+    "dbias_dsilu_torch",
+    "dbias_drelu_torch",
+    "dbias_dqgelu_torch",
+    "dbias_dsrelu_torch",
+    "scaled_softmax_forward_torch",
+    "scaled_softmax_backward_torch",
+    "scaled_masked_softmax_forward_torch",
+    "scaled_masked_softmax_backward_torch",
+    "scaled_upper_triang_masked_softmax_forward_torch",
+    "scaled_upper_triang_masked_softmax_backward_torch",
+    "scaled_aligned_causal_masked_softmax_forward_torch",
+    "scaled_aligned_causal_masked_softmax_backward_torch",
+    "dropout_fwd_torch",
+    "dropout_bwd_torch",
+    "multi_tensor_scale_torch",
+    "multi_tensor_l2norm_torch",
+    "multi_tensor_adam_torch",
+    "multi_tensor_sgd_torch",
+    "multi_tensor_compute_scale_and_scale_inv_torch",
+]
diff --git a/transformer_engine/plugin/core/backends/reference/impl/activation.py b/transformer_engine/plugin/core/backends/reference/impl/activation.py
new file mode 100644
index 0000000000..8c9eb58a31
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/activation.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Optional, Tuple
+import torch
+import torch.nn.functional as F
+
+__all__ = [
+    "gelu_torch",
+    "geglu_torch",
+    "qgelu_torch",
+    "qgeglu_torch",
+    "relu_torch",
+    "reglu_torch",
+    "srelu_torch",
+    "sreglu_torch",
+    "silu_torch",
+    "swiglu_torch",
+    "clamped_swiglu_torch",
+    "dgelu_torch",
+    "dgeglu_torch",
+    "dqgelu_torch",
+    "dqgeglu_torch",
+    "drelu_torch",
+    "dreglu_torch",
+    "dsrelu_torch",
+    "dsreglu_torch",
+    "dsilu_torch",
+    "dswiglu_torch",
+    "clamped_dswiglu_torch",
+    "dbias_dgelu_torch",
+    "dbias_dsilu_torch",
+    "dbias_drelu_torch",
+    "dbias_dqgelu_torch",
+    "dbias_dsrelu_torch",
+]
+
+
+def gelu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    return F.gelu(input, approximate='tanh')
+
+
+def geglu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = input.chunk(2, dim=-1)
+    return F.gelu(a, approximate='tanh') * b
+
+
+def qgelu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    return input * torch.sigmoid(1.702 * input)
+
+
+def qgeglu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = input.chunk(2, dim=-1)
+    return a * torch.sigmoid(1.702 * a) * b
+
+
+def relu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    return F.relu(input)
+
+
+def reglu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = input.chunk(2, dim=-1)
+    return F.relu(a) * b
+
+
+def srelu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    return torch.square(F.relu(input))
+
+
+def sreglu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = input.chunk(2, dim=-1)
+    return torch.square(F.relu(a)) * b
+
+
+def silu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    return F.silu(input)
+
+
+def swiglu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = input.chunk(2, dim=-1)
+    return F.silu(a) * b
+
+
+def clamped_swiglu_torch(
+    input: torch.Tensor,
+    quantizer: Any,
+    limit: float = 7.0,
+    alpha: float = 1.702,
+) -> torch.Tensor:
+    """Clamped SwiGLU matching CUDA implementation.
+
+    CUDA implementation:
+    - a (activation): clamp to upper bound only: min(a, limit)
+    - b (gate): clamp to [-limit, limit], then add 1
+    - output = (a_clamped * sigmoid(alpha * a_clamped)) * b_clamped
+    """
+    a, b = input.chunk(2, dim=-1)
+    # CUDA only clamps a to upper bound
+    a_clamped = torch.clamp(a, max=limit)
+    # CUDA clamps b to [-limit, limit] and adds 1
+    b_clamped = torch.clamp(b, -limit, limit) + 1
+    return a_clamped * torch.sigmoid(alpha * a_clamped) * b_clamped
+
+
+def dgelu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    x = fwd_input.detach().requires_grad_(True)
+    with torch.enable_grad():
+        y = F.gelu(x, approximate='tanh')
+        y.backward(grad)
+    return x.grad
+
+
+def dgeglu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = fwd_input.chunk(2, dim=-1)
+    a = a.detach().requires_grad_(True)
+    b = b.detach().requires_grad_(True)
+
+    with torch.enable_grad():
+        y = F.gelu(a, approximate='tanh') * b
+        y.backward(grad)
+
+    return torch.cat([a.grad, b.grad], dim=-1)
+
+
+def dqgelu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    x = fwd_input.detach().requires_grad_(True)
+    with torch.enable_grad():
+        y = x * torch.sigmoid(1.702 * x)
+        y.backward(grad)
+    return x.grad
+
+
+def dqgeglu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = fwd_input.chunk(2, dim=-1)
+    a = a.detach().requires_grad_(True)
+    b = b.detach().requires_grad_(True)
+
+    with torch.enable_grad():
+        y = a * torch.sigmoid(1.702 * a) * b
+        y.backward(grad)
+
+    return torch.cat([a.grad, b.grad], dim=-1)
+
+
+def drelu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    return grad * (fwd_input > 0).to(grad.dtype)
+
+
+def dreglu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = fwd_input.chunk(2, dim=-1)
+
+    grad_a = grad * b * (a > 0).to(grad.dtype)
+    grad_b = grad * F.relu(a)
+
+    return torch.cat([grad_a, grad_b], dim=-1)
+
+
+def dsrelu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    relu_x = F.relu(fwd_input)
+    return 2 * grad * relu_x * (fwd_input > 0).to(grad.dtype)
+
+
+def dsreglu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = fwd_input.chunk(2, dim=-1)
+
+    relu_a = F.relu(a)
+    grad_a = grad * b * 2 * relu_a * (a > 0).to(grad.dtype)
+    grad_b = grad * torch.square(relu_a)
+
+    return torch.cat([grad_a, grad_b], dim=-1)
+
+
+def dsilu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    x = fwd_input.detach().requires_grad_(True)
+    with torch.enable_grad():
+        y = F.silu(x)
+        y.backward(grad)
+    return x.grad
+
+
+def dswiglu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
+    a, b = fwd_input.chunk(2, dim=-1)
+    a = a.detach().requires_grad_(True)
+    b = b.detach().requires_grad_(True)
+
+    with torch.enable_grad():
+        y = F.silu(a) * b
+        y.backward(grad)
+
+    return torch.cat([a.grad, b.grad], dim=-1)
+
+
+def clamped_dswiglu_torch(
+    grad: torch.Tensor,
+    fwd_input: torch.Tensor,
+    quantizer: Any,
+    limit: float = 7.0,
+    alpha: float = 1.702,
+) -> torch.Tensor:
+    """Backward pass for clamped SwiGLU matching CUDA implementation.
+
+    CUDA implementation:
+    - a (activation): clamp to upper bound only, derivative is 0 if a > limit
+    - b (gate): clamp to [-limit, limit] and add 1, derivative is 0 outside range
+    """
+    a, b = fwd_input.chunk(2, dim=-1)
+
+    # CUDA only clamps a to upper bound
+    a_clamped = torch.clamp(a, max=limit)
+    # CUDA clamps b to [-limit, limit] and adds 1
+    b_clamped = torch.clamp(b, -limit, limit) + 1
+
+    a_clamped = a_clamped.detach().requires_grad_(True)
+    b_clamped = b_clamped.detach().requires_grad_(True)
+
+    with torch.enable_grad():
+        y = a_clamped * torch.sigmoid(alpha * a_clamped) * b_clamped
+        y.backward(grad)
+
+    # Derivative of a clamp (upper bound only): 0 if a > limit
+    grad_a = a_clamped.grad * (a <= limit).to(grad.dtype)
+    # Derivative of b clamp ([-limit, limit]): 0 outside range
+    grad_b = b_clamped.grad * ((b >= -limit) & (b <= limit)).to(grad.dtype)
+
+    return torch.cat([grad_a, grad_b], dim=-1)
+
+
+def dbias_dgelu_torch(
+    grad: torch.Tensor,
+    fwd_input: torch.Tensor,
+    quantizer: Any,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    grad_input = dgelu_torch(grad, fwd_input, quantizer)
+
+    grad_bias = grad.sum(dim=tuple(range(grad.ndim - 1)))
+
+    return grad_input, grad_bias
+
+
+def dbias_dsilu_torch(
+    grad: torch.Tensor,
+    fwd_input: torch.Tensor,
+    quantizer: Any,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    grad_input = dsilu_torch(grad, fwd_input, quantizer)
+
+    grad_bias = grad.sum(dim=tuple(range(grad.ndim - 1)))
+
+    return grad_input, grad_bias
+
+
+def dbias_drelu_torch(
+    grad: torch.Tensor,
+    fwd_input: torch.Tensor,
+    quantizer: Any,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    grad_input = drelu_torch(grad, fwd_input, quantizer)
+
+    grad_bias = grad.sum(dim=tuple(range(grad.ndim - 1)))
+
+    return grad_input, grad_bias
+
+
+def dbias_dqgelu_torch(
+    grad: torch.Tensor,
+    fwd_input: torch.Tensor,
+    quantizer: Any,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    grad_input = dqgelu_torch(grad, fwd_input, quantizer)
+
+    grad_bias = grad.sum(dim=tuple(range(grad.ndim - 1)))
+
+    return grad_input, grad_bias
+
+
+def dbias_dsrelu_torch(
+    grad: torch.Tensor,
+    fwd_input: torch.Tensor,
+    quantizer: Any,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    grad_input = dsrelu_torch(grad, fwd_input, quantizer)
+
+    grad_bias = grad.sum(dim=tuple(range(grad.ndim - 1)))
+
+    return grad_input, grad_bias
diff --git a/transformer_engine/plugin/core/backends/reference/impl/dropout.py b/transformer_engine/plugin/core/backends/reference/impl/dropout.py
new file mode 100644
index 0000000000..1acea164d8
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/dropout.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Optional, Tuple
+import torch
+import torch.nn.functional as F
+
+__all__ = [
+    "dropout_fwd_torch",
+    "dropout_bwd_torch",
+]
+
+
+def dropout_fwd_torch(
+    input: torch.Tensor,
+    dropout_probability: float,
+    out: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if dropout_probability == 0.0:
+        output = input.clone() if out is None else input.clone().to(out)
+        mask = torch.ones_like(input, dtype=torch.uint8)
+        return output, mask
+
+    mask = torch.bernoulli(
+        torch.full_like(input, 1.0 - dropout_probability)
+    ).to(torch.uint8)
+
+    scale = 1.0 / (1.0 - dropout_probability)
+    output = input * mask.to(input.dtype) * scale
+
+    if out is not None:
+        out.copy_(output)
+        output = out
+
+    return output, mask
+
+
+def dropout_bwd_torch(
+    grad_output: torch.Tensor,
+    mask: torch.Tensor,
+    dropout_probability: float,
+    grad_input: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if dropout_probability == 0.0:
+        return grad_output.clone() if grad_input is None else grad_output.clone().to(grad_input)
+
+    scale = 1.0 / (1.0 - dropout_probability)
+    grad = grad_output * mask.to(grad_output.dtype) * scale
+
+    if grad_input is not None:
+        grad_input.copy_(grad)
+        grad = grad_input
+
+    return grad
diff --git a/transformer_engine/plugin/core/backends/reference/impl/gemm.py b/transformer_engine/plugin/core/backends/reference/impl/gemm.py
new file mode 100644
index 0000000000..ab4540162b
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/gemm.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Optional, Tuple, Union
+import torch
+
+__all__ = [
+    "general_gemm_torch",
+]
+
+_DTYPE_TO_TORCH = {
+    0: torch.uint8,
+    2: torch.int32,
+    4: torch.float32,
+    5: torch.float16,
+    6: torch.bfloat16,
+    7: torch.float8_e4m3fn,
+    8: torch.float8_e5m2,
+}
+
+
+def _convert_dtype(dtype: Union[int, torch.dtype, None]) -> Optional[torch.dtype]:
+    if dtype is None:
+        return None
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if isinstance(dtype, int):
+        return _DTYPE_TO_TORCH.get(dtype, None)
+    if hasattr(dtype, 'value'):
+        return _DTYPE_TO_TORCH.get(dtype.value, None)
+    return None
+
+
+def general_gemm_torch(
+    A: torch.Tensor,
+    transA: bool,
+    B: torch.Tensor,
+    transB: bool,
+    D: Optional[torch.Tensor],
+    quantizer: Any,
+    output_dtype: Any,
+    bias: Optional[torch.Tensor],
+    bias_type: Any,
+    gelu: bool,
+    gelu_in: Optional[torch.Tensor],
+    grad: bool,
+    workspace: torch.Tensor,
+    workspace_size: int,
+    accumulate: bool,
+    use_split_accumulator: bool,
+    comm_overlap: Optional[Any] = None,
+    comm_type: Optional[Any] = None,
+    extra_output: Optional[torch.Tensor] = None,
+    bulk_overlap: bool = False,
+    alpha: float = 1.0,
+    beta: Optional[float] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
+    import torch.nn.functional as F
+
+    target_device = B.device
+
+    if A.device != target_device:
+        A = A.to(target_device)
+
+    original_B_shape = None
+    if B.ndim == 3:
+        original_B_shape = B.shape
+        B = B.reshape(-1, B.shape[-1])
+
+    if A.ndim == 3:
+        A = A.reshape(-1, A.shape[-1])
+
+    A_comp = A.T if transA else A
+    B_comp = B.T if transB else B
+
+    if A_comp.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+        compute_dtype = torch.bfloat16
+        A_comp = A_comp.to(compute_dtype)
+        B_comp = B_comp.to(compute_dtype)
+
+    out = torch.mm(B_comp, A_comp)
+
+    if alpha != 1.0:
+        out = out * alpha
+
+    if original_B_shape is not None:
+        out = out.view(original_B_shape[0], original_B_shape[1], -1)
+
+    gelu_input_ret = None
+    if gelu and gelu_in is not None:
+        pass
+
+    if bias is not None:
+        if bias.device != target_device:
+            bias = bias.to(target_device)
+        out = out + bias
+
+    if gelu:
+        if gelu_in is not None:
+            gelu_in.copy_(out)
+            gelu_input_ret = gelu_in
+        else:
+            gelu_input_ret = out.clone()
+        out = F.gelu(out, approximate='tanh')
+
+    torch_out_dtype = _convert_dtype(output_dtype)
+    if torch_out_dtype is not None and out.dtype != torch_out_dtype:
+        out = out.to(torch_out_dtype)
+
+    if D is not None:
+        if D.device != target_device:
+            D = D.to(target_device)
+        if accumulate:
+            beta_val = beta if beta is not None else 1.0
+            D.mul_(beta_val).add_(out)
+            out = D
+        else:
+            D.copy_(out)
+            out = D
+
+    bias_grad = None
+    if grad and bias is not None:
+        pass
+
+    extra_output_ret = None
+
+    return out, bias_grad, gelu_input_ret, extra_output_ret
diff --git a/transformer_engine/plugin/core/backends/reference/impl/normalization.py b/transformer_engine/plugin/core/backends/reference/impl/normalization.py
new file mode 100644
index 0000000000..6ab7a7648c
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/normalization.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Optional, Tuple
+import torch
+import torch.nn.functional as F
+
+__all__ = [
+    "layernorm_fwd_torch",
+    "layernorm_bwd_torch",
+]
+
+
+def layernorm_fwd_torch(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    eps: float,
+    ln_out: Optional[torch.Tensor],
+    quantizer: Any,
+    odtype: torch.dtype,
+    sm_margin: int,
+    zero_centered_gamma: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    mean = input.mean(dim=-1, keepdim=True)
+    var = input.var(dim=-1, keepdim=True, unbiased=False)
+    rsigma = torch.rsqrt(var + eps)
+
+    normalized = (input - mean) * rsigma
+
+    if zero_centered_gamma:
+        output = normalized * (1.0 + weight)
+    else:
+        output = normalized * weight
+
+    if bias is not None:
+        output = output + bias
+
+    if output.dtype != odtype:
+        output = output.to(odtype)
+
+    mean = mean.squeeze(-1)
+    rsigma = rsigma.squeeze(-1)
+
+    return output, mean, rsigma
+
+
+def layernorm_bwd_torch(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    mu: torch.Tensor,
+    rsigma: torch.Tensor,
+    gamma: torch.Tensor,
+    sm_margin: int = 0,
+    zero_centered_gamma: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if mu.ndim < x.ndim:
+        mu = mu.unsqueeze(-1)
+    if rsigma.ndim < x.ndim:
+        rsigma = rsigma.unsqueeze(-1)
+
+    x_normalized = (x - mu) * rsigma
+
+    N = x.shape[-1]
+
+    if zero_centered_gamma:
+        gamma_adj = 1.0 + gamma
+    else:
+        gamma_adj = gamma
+
+    dy_gamma = dy * gamma_adj
+
+    mean_dy_gamma = dy_gamma.mean(dim=-1, keepdim=True)
+
+    mean_dy_gamma_x = (dy_gamma * x_normalized).mean(dim=-1, keepdim=True)
+
+    dx = rsigma * (dy_gamma - mean_dy_gamma - x_normalized * mean_dy_gamma_x)
+
+    dgamma = (dy * x_normalized).sum(dim=tuple(range(dy.ndim - 1)))
+
+    dbeta = dy.sum(dim=tuple(range(dy.ndim - 1)))
+
+    return dx, dgamma, dbeta
diff --git a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
new file mode 100644
index 0000000000..100c6c9ef3
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import List, Union
+import torch
+
+__all__ = [
+    "multi_tensor_scale_torch",
+    "multi_tensor_l2norm_torch",
+    "multi_tensor_adam_torch",
+    "multi_tensor_sgd_torch",
+    "multi_tensor_compute_scale_and_scale_inv_torch",
+]
+
+
+def multi_tensor_scale_torch(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    scale: float,
+) -> None:
+    if noop_flag.item() != 0:
+        return
+
+    if len(tensor_lists) != 2:
+        raise ValueError("tensor_lists should contain [input_tensors, output_tensors]")
+
+    input_tensors, output_tensors = tensor_lists
+
+    if len(output_tensors) != len(input_tensors):
+        raise ValueError("Output and input tensor lists must have the same length")
+
+    for in_tensor, out_tensor in zip(input_tensors, output_tensors):
+        out_tensor.copy_(in_tensor * scale)
+
+
+def multi_tensor_l2norm_torch(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    per_tensor: bool = False,
+) -> Union[torch.Tensor, List[torch.Tensor]]:
+    if noop_flag.item() != 0:
+        if per_tensor:
+            return [torch.tensor(0.0, device=t.device) for t in tensor_lists[0]]
+        else:
+            return torch.tensor(0.0, device=tensor_lists[0][0].device)
+
+    tensors = tensor_lists[0]
+
+    if per_tensor:
+        norms = []
+        for tensor in tensors:
+            norm = torch.norm(tensor.float(), p=2)
+            norms.append(norm)
+        return norms
+    else:
+        total_norm_sq = torch.tensor(0.0, device=tensors[0].device)
+        for tensor in tensors:
+            total_norm_sq += torch.sum(tensor.float() ** 2)
+        return torch.sqrt(total_norm_sq)
+
+
+def multi_tensor_adam_torch(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    lr: float,
+    beta1: float,
+    beta2: float,
+    eps: float,
+    step: int,
+    mode: int,
+    bias_correction: int,
+    weight_decay: float,
+) -> None:
+    if noop_flag.item() != 0:
+        return
+
+    if len(tensor_lists) != 4:
+        raise ValueError("tensor_lists should contain [grads, params, exp_avgs, exp_avg_sqs]")
+
+    grads, params, exp_avgs, exp_avg_sqs = tensor_lists
+
+    if not (len(params) == len(grads) == len(exp_avgs) == len(exp_avg_sqs)):
+        raise ValueError("All tensor lists must have the same length")
+
+    if bias_correction:
+        bias_correction1 = 1 - beta1 ** step
+        bias_correction2 = 1 - beta2 ** step
+    else:
+        bias_correction1 = 1.0
+        bias_correction2 = 1.0
+
+    for grad, param, exp_avg, exp_avg_sq in zip(grads, params, exp_avgs, exp_avg_sqs):
+        if grad is None:
+            continue
+
+        if mode == 1 and weight_decay != 0:
+            param.mul_(1 - lr * weight_decay)
+
+        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+
+        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+
+        corrected_exp_avg = exp_avg / bias_correction1
+        corrected_exp_avg_sq = exp_avg_sq / bias_correction2
+
+        denom = corrected_exp_avg_sq.sqrt().add_(eps)
+        param.addcdiv_(corrected_exp_avg, denom, value=-lr)
+
+
+def multi_tensor_sgd_torch(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    lr: float,
+    momentum: float,
+    dampening: float,
+    weight_decay: float,
+    nesterov: bool,
+) -> None:
+    if noop_flag.item() != 0:
+        return
+
+    if len(tensor_lists) != 3:
+        raise ValueError("tensor_lists should contain [params, grads, momentum_buffers]")
+
+    params, grads, momentum_buffers = tensor_lists
+
+    if not (len(params) == len(grads) == len(momentum_buffers)):
+        raise ValueError("All tensor lists must have the same length")
+
+    for param, grad, buf in zip(params, grads, momentum_buffers):
+        if grad is None:
+            continue
+
+        if weight_decay != 0:
+            grad = grad.add(param, alpha=weight_decay)
+
+        if momentum != 0:
+            if buf is None or buf.numel() == 0:
+                buf = grad.clone().detach()
+            else:
+                buf.mul_(momentum).add_(grad, alpha=1 - dampening)
+
+            if nesterov:
+                grad = grad.add(buf, alpha=momentum)
+            else:
+                grad = buf
+
+        param.add_(grad, alpha=-lr)
+
+
+def multi_tensor_compute_scale_and_scale_inv_torch(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    max_fp8: float,
+    force_pow_2_scales: bool = False,
+    amax_epsilon: float = 0.0,
+) -> None:
+    """
+    Compute scale and scale_inv from amax values for FP8 quantization.
+
+    Args:
+        chunk_size: Chunk size (unused in PyTorch implementation)
+        noop_flag: If non-zero, skip computation
+        tensor_lists: [amaxes, scales, scale_invs]
+        max_fp8: Maximum representable value in FP8 format (e.g., 448.0 for E4M3)
+        force_pow_2_scales: If True, force scales to be powers of 2
+        amax_epsilon: Small epsilon to add to amax to avoid division by zero
+    """
+    if noop_flag.item() != 0:
+        return
+
+    if len(tensor_lists) != 3:
+        raise ValueError("tensor_lists should contain [amaxes, scales, scale_invs]")
+
+    amaxes, scales, scale_invs = tensor_lists
+
+    if not (len(amaxes) == len(scales) == len(scale_invs)):
+        raise ValueError("All tensor lists must have the same length")
+
+    for amax, scale, scale_inv in zip(amaxes, scales, scale_invs):
+        # Add epsilon to avoid division by zero
+        amax_val = amax + amax_epsilon
+
+        # Compute scale: max_fp8 / amax
+        # Clamp amax to avoid very small values
+        amax_val = torch.clamp(amax_val, min=1e-12)
+        computed_scale = max_fp8 / amax_val
+
+        if force_pow_2_scales:
+            # Round scale to nearest power of 2
+            log2_scale = torch.log2(computed_scale)
+            log2_scale = torch.round(log2_scale)
+            computed_scale = torch.pow(2.0, log2_scale)
+
+        # Update scale and scale_inv
+        scale.copy_(computed_scale)
+        scale_inv.copy_(1.0 / computed_scale)
diff --git a/transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py b/transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py
new file mode 100644
index 0000000000..7ae420e7f3
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import torch
+
+__all__ = [
+    "rmsnorm_fwd_torch",
+    "rmsnorm_bwd_torch",
+]
+
+
+def rmsnorm_fwd_torch(
+    input,
+    weight,
+    eps,
+    ln_out,
+    quantizer,
+    odtype,
+    sm_margin,
+    zero_centered_gamma,
+):
+    if weight.device != input.device:
+        weight = weight.to(input.device)
+
+    variance = input.pow(2).mean(-1, keepdim=True)
+    inv_rms = torch.rsqrt(variance + eps)
+    y = input * inv_rms
+    if zero_centered_gamma:
+        y = y * (1 + weight)
+    else:
+        y = y * weight
+
+    rstdevs = inv_rms.squeeze(-1)
+
+    return y, None, rstdevs
+
+
+def rmsnorm_bwd_torch(
+    dy,
+    x,
+    rsigma,
+    gamma,
+    sm_margin,
+    zero_centered_gamma,
+    eps,
+):
+    inv_rms = rsigma.unsqueeze(-1)
+
+    x_norm = x * inv_rms
+
+    if zero_centered_gamma:
+        weight = 1 + gamma
+    else:
+        weight = gamma
+
+    dw = (dy * x_norm).sum(dim=tuple(range(dy.ndim - 1)))
+
+    dy_weighted = dy * weight
+
+    mean_term = (dy_weighted * x_norm).mean(-1, keepdim=True)
+    dx = inv_rms * (dy_weighted - x_norm * mean_term)
+    return dx, dw
diff --git a/transformer_engine/plugin/core/backends/reference/impl/softmax.py b/transformer_engine/plugin/core/backends/reference/impl/softmax.py
new file mode 100644
index 0000000000..0b1c6ef4f0
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/impl/softmax.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Optional
+import torch
+import torch.nn.functional as F
+
+__all__ = [
+    "scaled_softmax_forward_torch",
+    "scaled_softmax_backward_torch",
+    "scaled_masked_softmax_forward_torch",
+    "scaled_masked_softmax_backward_torch",
+    "scaled_upper_triang_masked_softmax_forward_torch",
+    "scaled_upper_triang_masked_softmax_backward_torch",
+    "scaled_aligned_causal_masked_softmax_forward_torch",
+    "scaled_aligned_causal_masked_softmax_backward_torch",
+]
+
+
+def scaled_softmax_forward_torch(input: torch.Tensor, scale: float) -> torch.Tensor:
+    return F.softmax(input * scale, dim=-1)
+
+
+def scaled_softmax_backward_torch(
+    output_grad: torch.Tensor,
+    softmax_output: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    # Compute in float32 for numerical stability (matching CUDA behavior)
+    orig_dtype = output_grad.dtype
+    output_grad_f32 = output_grad.float()
+    softmax_output_f32 = softmax_output.float()
+
+    grad_softmax = softmax_output_f32 * (
+        output_grad_f32 - (softmax_output_f32 * output_grad_f32).sum(dim=-1, keepdim=True)
+    )
+
+    return (grad_softmax * scale).to(orig_dtype)
+
+
+def scaled_masked_softmax_forward_torch(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    # Handle uint8 mask (CUDA format: 1=masked, 0=unmasked)
+    # Convert to additive mask (-10000 for masked positions, 0 for unmasked)
+    if mask.dtype == torch.uint8:
+        additive_mask = torch.zeros_like(input, dtype=input.dtype)
+        # Expand mask if needed (mask shape: batch, 1, seq_q, seq_k)
+        if mask.dim() == 4 and mask.size(1) == 1 and input.dim() == 4:
+            mask = mask.expand_as(input)
+        additive_mask = additive_mask.masked_fill(mask.bool(), -10000.0)
+    else:
+        additive_mask = mask
+
+    scaled_input = input * scale + additive_mask
+
+    return F.softmax(scaled_input, dim=-1)
+
+
+def scaled_masked_softmax_backward_torch(
+    output_grad: torch.Tensor,
+    softmax_output: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    # Compute in float32 for numerical stability (matching CUDA behavior)
+    orig_dtype = output_grad.dtype
+    output_grad_f32 = output_grad.float()
+    softmax_output_f32 = softmax_output.float()
+
+    grad_softmax = softmax_output_f32 * (
+        output_grad_f32 - (softmax_output_f32 * output_grad_f32).sum(dim=-1, keepdim=True)
+    )
+
+    return (grad_softmax * scale).to(orig_dtype)
+
+
+def scaled_upper_triang_masked_softmax_forward_torch(
+    input: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    seq_len = input.size(-1)
+
+    causal_mask = torch.triu(
+        torch.full((seq_len, seq_len), float('-inf'), device=input.device, dtype=input.dtype),
+        diagonal=1
+    )
+
+    scaled_input = input * scale + causal_mask
+
+    return F.softmax(scaled_input, dim=-1)
+
+
+def scaled_upper_triang_masked_softmax_backward_torch(
+    output_grad: torch.Tensor,
+    softmax_output: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    # Compute in float32 for numerical stability (matching CUDA behavior)
+    orig_dtype = output_grad.dtype
+    output_grad_f32 = output_grad.float()
+    softmax_output_f32 = softmax_output.float()
+
+    grad_softmax = softmax_output_f32 * (
+        output_grad_f32 - (softmax_output_f32 * output_grad_f32).sum(dim=-1, keepdim=True)
+    )
+
+    return (grad_softmax * scale).to(orig_dtype)
+
+
+def scaled_aligned_causal_masked_softmax_forward_torch(
+    input: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    return scaled_upper_triang_masked_softmax_forward_torch(input, scale)
+
+
+def scaled_aligned_causal_masked_softmax_backward_torch(
+    output_grad: torch.Tensor,
+    softmax_output: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    # Compute in float32 for numerical stability (matching CUDA behavior)
+    orig_dtype = output_grad.dtype
+    output_grad_f32 = output_grad.float()
+    softmax_output_f32 = softmax_output.float()
+
+    grad_softmax = softmax_output_f32 * (
+        output_grad_f32 - (softmax_output_f32 * output_grad_f32).sum(dim=-1, keepdim=True)
+    )
+
+    return (grad_softmax * scale).to(orig_dtype)
diff --git a/transformer_engine/plugin/core/backends/reference/reference.py b/transformer_engine/plugin/core/backends/reference/reference.py
new file mode 100644
index 0000000000..56da602f8e
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/reference.py
@@ -0,0 +1,508 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from ...ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
+
+from .impl import (
+    general_gemm_torch,
+    rmsnorm_fwd_torch, rmsnorm_bwd_torch,
+    layernorm_fwd_torch, layernorm_bwd_torch,
+    gelu_torch, geglu_torch, qgelu_torch, qgeglu_torch,
+    relu_torch, reglu_torch, srelu_torch, sreglu_torch,
+    silu_torch, swiglu_torch, clamped_swiglu_torch,
+    dgelu_torch, dgeglu_torch, dqgelu_torch, dqgeglu_torch,
+    drelu_torch, dreglu_torch, dsrelu_torch, dsreglu_torch,
+    dsilu_torch, dswiglu_torch, clamped_dswiglu_torch,
+    dbias_dgelu_torch, dbias_dsilu_torch, dbias_drelu_torch,
+    dbias_dqgelu_torch, dbias_dsrelu_torch,
+    scaled_softmax_forward_torch, scaled_softmax_backward_torch,
+    scaled_masked_softmax_forward_torch, scaled_masked_softmax_backward_torch,
+    scaled_upper_triang_masked_softmax_forward_torch,
+    scaled_upper_triang_masked_softmax_backward_torch,
+    scaled_aligned_causal_masked_softmax_forward_torch,
+    scaled_aligned_causal_masked_softmax_backward_torch,
+    dropout_fwd_torch, dropout_bwd_torch,
+    multi_tensor_scale_torch, multi_tensor_l2norm_torch,
+    multi_tensor_adam_torch, multi_tensor_sgd_torch,
+)
+
+class ReferenceBackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return True
+
+    def is_available(self) -> bool:
+        return True
+
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionTorch
+        return FlashAttentionTorch
+
+    def generic_gemm(
+        self,
+        A: torch.Tensor,
+        transA: bool,
+        B: torch.Tensor,
+        transB: bool,
+        D: torch.Tensor,
+        quantizer: Any,
+        output_dtype: torch.dtype,
+        bias: Optional[torch.Tensor],
+        bias_type: Any,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[Any] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> Any:
+        return general_gemm_torch(
+            A=A,
+            transA=transA,
+            B=B,
+            transB=transB,
+            D=D,
+            quantizer=quantizer,
+            output_dtype=output_dtype,
+            bias=bias,
+            bias_type=bias_type,
+            gelu=gelu,
+            gelu_in=gelu_in,
+            grad=grad,
+            workspace=workspace,
+            workspace_size=workspace_size,
+            accumulate=accumulate,
+            use_split_accumulator=use_split_accumulator,
+            comm_overlap=comm_overlap,
+            comm_type=comm_type,
+            extra_output=extra_output,
+            bulk_overlap=bulk_overlap,
+            alpha=alpha,
+            beta=beta,
+        )
+
+    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("te_general_grouped_gemm - not implemented in reference backend")
+
+    def quantize(self, tensor: torch.Tensor, quantizer: Any, output: Optional[torch.Tensor] = None, noop: Optional[torch.Tensor] = None) -> Any:
+        raise NotImplementedError("quantize - not implemented in reference backend")
+
+    def dequantize(self, input: torch.Tensor, otype: torch.dtype) -> torch.Tensor:
+        raise NotImplementedError("dequantize - not implemented in reference backend")
+
+    def bgrad_quantize(self, input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        raise NotImplementedError("bgrad_quantize - not implemented in reference backend")
+
+    def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return gelu_torch(input, quantizer)
+
+    def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return geglu_torch(input, quantizer)
+
+    def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return qgelu_torch(input, quantizer)
+
+    def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return qgeglu_torch(input, quantizer)
+
+    def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return relu_torch(input, quantizer)
+
+    def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return reglu_torch(input, quantizer)
+
+    def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return srelu_torch(input, quantizer)
+
+    def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return sreglu_torch(input, quantizer)
+
+    def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return silu_torch(input, quantizer)
+
+    def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        return swiglu_torch(input, quantizer)
+
+    def clamped_swiglu(self, input: torch.Tensor, quantizer: Any, limit: float = 7.0, alpha: float = 1.702) -> Any:
+        return clamped_swiglu_torch(input, quantizer, limit, alpha)
+
+    def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dgelu_torch(grad, fwd_input, quantizer)
+
+    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dgeglu_torch(grad, fwd_input, quantizer)
+
+    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dqgelu_torch(grad, fwd_input, quantizer)
+
+    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dqgeglu_torch(grad, fwd_input, quantizer)
+
+    def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return drelu_torch(grad, fwd_input, quantizer)
+
+    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dreglu_torch(grad, fwd_input, quantizer)
+
+    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dsrelu_torch(grad, fwd_input, quantizer)
+
+    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dsreglu_torch(grad, fwd_input, quantizer)
+
+    def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dsilu_torch(grad, fwd_input, quantizer)
+
+    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        return dswiglu_torch(grad, fwd_input, quantizer)
+
+    def clamped_dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any, limit: float = 7.0, alpha: float = 1.702) -> Any:
+        return clamped_dswiglu_torch(grad, fwd_input, quantizer, limit, alpha)
+
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        return dbias_dgelu_torch(grad, fwd_input, quantizer)
+
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        return dbias_dsilu_torch(grad, fwd_input, quantizer)
+
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        return dbias_drelu_torch(grad, fwd_input, quantizer)
+
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        return dbias_dqgelu_torch(grad, fwd_input, quantizer)
+
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        return dbias_dsrelu_torch(grad, fwd_input, quantizer)
+
+    def layernorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        return layernorm_fwd_torch(
+            input=input,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            ln_out=ln_out,
+            quantizer=quantizer,
+            odtype=otype,
+            sm_margin=sm_margin,
+            zero_centered_gamma=zero_centered_gamma,
+        )
+
+    def layernorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        return layernorm_bwd_torch(
+            dy=dy,
+            x=x,
+            mu=mu,
+            rsigma=rsigma,
+            gamma=gamma,
+            sm_margin=sm_margin,
+            zero_centered_gamma=zero_centered_gamma,
+        )
+
+    def rmsnorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+        return rmsnorm_fwd_torch(
+            input=input,
+            weight=weight,
+            eps=eps,
+            ln_out=ln_out,
+            quantizer=quantizer,
+            odtype=otype,
+            sm_margin=sm_margin,
+            zero_centered_gamma=zero_centered_gamma,
+        )
+
+    def rmsnorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+        eps: float = 1e-5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return rmsnorm_bwd_torch(
+            dy=dy,
+            x=x,
+            rsigma=rsigma,
+            gamma=gamma,
+            sm_margin=sm_margin,
+            zero_centered_gamma=zero_centered_gamma,
+            eps=eps,
+        )
+
+    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("rmsnorm_bwd_add - not implemented in reference backend")
+
+    def multi_tensor_quantize(self, tensor_list: List[torch.Tensor], quantizer_list: List[Any]) -> List[Any]:
+        raise NotImplementedError("multi_tensor_quantize - not implemented in reference backend")
+
+    def split_quantize(self, tensor: torch.Tensor, split_sections: List[int], quantizer_list: List[Any]) -> List[Any]:
+        raise NotImplementedError("split_quantize - not implemented in reference backend")
+
+    def moe_permute_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("moe_permute_fwd - not implemented in reference backend")
+
+    def moe_permute_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("moe_permute_bwd - not implemented in reference backend")
+
+    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("moe_unpermute_fwd - not implemented in reference backend")
+
+    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("moe_unpermute_bwd - not implemented in reference backend")
+
+    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_softmax_forward_torch(input, scale)
+
+    def scaled_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_softmax_backward_torch(output_grad, softmax_output, scale)
+
+    def scaled_masked_softmax_forward(self, input: torch.Tensor, mask: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_masked_softmax_forward_torch(input, mask, scale)
+
+    def scaled_masked_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_masked_softmax_backward_torch(output_grad, softmax_output, scale)
+
+    def scaled_upper_triang_masked_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_upper_triang_masked_softmax_forward_torch(input, scale)
+
+    def scaled_upper_triang_masked_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_upper_triang_masked_softmax_backward_torch(output_grad, softmax_output, scale)
+
+    def scaled_aligned_causal_masked_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_aligned_causal_masked_softmax_forward_torch(input, scale)
+
+    def scaled_aligned_causal_masked_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
+        return scaled_aligned_causal_masked_softmax_backward_torch(output_grad, softmax_output, scale)
+
+    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        return NVTE_Fused_Attn_Backend.NVTE_No_Backend
+
+    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_attn_fwd - not implemented in reference backend")
+
+    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_attn_bwd - not implemented in reference backend")
+
+    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fa_prepare_fwd - not implemented in reference backend")
+
+    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fa_prepare_bwd - not implemented in reference backend")
+
+    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("copy_to_kv_cache - not implemented in reference backend")
+
+    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("convert_thd_to_bshd - not implemented in reference backend")
+
+    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("convert_bshd_to_thd - not implemented in reference backend")
+
+    def fused_rope_forward(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_rope_forward - not implemented in reference backend")
+
+    def fused_rope_backward(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_rope_backward - not implemented in reference backend")
+
+    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_qkv_rope_forward - not implemented in reference backend")
+
+    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_qkv_rope_backward - not implemented in reference backend")
+
+    def fused_topk_with_score_function_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_topk_with_score_function_fwd - not implemented in reference backend")
+
+    def fused_topk_with_score_function_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_topk_with_score_function_bwd - not implemented in reference backend")
+
+    def fused_score_for_moe_aux_loss_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_score_for_moe_aux_loss_fwd - not implemented in reference backend")
+
+    def fused_score_for_moe_aux_loss_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_score_for_moe_aux_loss_bwd - not implemented in reference backend")
+
+    def fused_moe_aux_loss_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_moe_aux_loss_fwd - not implemented in reference backend")
+
+    def fused_moe_aux_loss_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_moe_aux_loss_bwd - not implemented in reference backend")
+
+    def dropout_fwd(self, input: torch.Tensor, dropout_probability: float, out: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+        return dropout_fwd_torch(input, dropout_probability, out)
+
+    def dropout_bwd(self, grad_output: torch.Tensor, mask: torch.Tensor, dropout_probability: float, grad_input: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return dropout_bwd_torch(grad_output, mask, dropout_probability, grad_input)
+
+    def fp8_transpose(self, input: torch.Tensor, dtype: Any, *, out: torch.Tensor) -> None:
+        raise NotImplementedError("fp8_transpose - not implemented in reference backend")
+
+    def swap_first_dims(self, tensor: torch.Tensor, *, out: torch.Tensor) -> None:
+        raise NotImplementedError("swap_first_dims - not implemented in reference backend")
+
+    def compute_amax(self, input: torch.Tensor, amax: torch.Tensor) -> None:
+        raise NotImplementedError("compute_amax - not implemented in reference backend")
+
+    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
+        raise NotImplementedError("fused_amax_and_scale_update_after_reduction - not implemented in reference backend")
+
+    def fp8_block_scaling_compute_partial_amax(self, *args, **kwargs) -> None:
+        raise NotImplementedError("fp8_block_scaling_compute_partial_amax - not implemented in reference backend")
+
+    def fp8_block_scaling_partial_cast(self, *args, **kwargs) -> None:
+        raise NotImplementedError("fp8_block_scaling_partial_cast - not implemented in reference backend")
+
+    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_multi_row_padding - not implemented in reference backend")
+
+    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_multi_row_unpadding - not implemented in reference backend")
+
+    def get_cublasLt_version(self) -> int:
+        return 0
+
+    def get_cudnn_version(self) -> int:
+        return 0
+
+    def get_num_cublas_streams(self) -> int:
+        return 0
+
+    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("thd_read_half_tensor - not implemented in reference backend")
+
+    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("thd_second_half_lse_correction - not implemented in reference backend")
+
+    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("thd_read_second_half_lse - not implemented in reference backend")
+
+    def thd_out_correction(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("thd_out_correction - not implemented in reference backend")
+
+    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("thd_grad_correction - not implemented in reference backend")
+
+    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("thd_get_partitioned_indices - not implemented in reference backend")
+
+    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+        raise NotImplementedError("init_nvshmem_backend - not implemented in reference backend")
+
+    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        raise NotImplementedError("create_nvshmem_tensor - not implemented in reference backend")
+
+    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        raise NotImplementedError("nvshmem_send_on_current_stream - not implemented in reference backend")
+
+    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        raise NotImplementedError("nvshmem_wait_on_current_stream - not implemented in reference backend")
+
+    def nvshmem_finalize(self) -> None:
+        raise NotImplementedError("nvshmem_finalize - not implemented in reference backend")
+
+    def multi_tensor_scale(self, chunk_size: int, noop_flag: torch.Tensor, tensor_lists: List[List[torch.Tensor]], scale: float) -> None:
+        return multi_tensor_scale_torch(chunk_size, noop_flag, tensor_lists, scale)
+
+    def multi_tensor_l2norm(self, chunk_size: int, noop_flag: torch.Tensor, tensor_lists: List[List[torch.Tensor]], per_tensor: bool = False) -> Union[torch.Tensor, List[torch.Tensor]]:
+        return multi_tensor_l2norm_torch(chunk_size, noop_flag, tensor_lists, per_tensor)
+
+    def multi_tensor_unscale_l2norm(self, chunk_size: int, noop_flag: torch.Tensor, tensor_lists: List[List[torch.Tensor]], scale: torch.Tensor, per_tensor: bool = False) -> Union[torch.Tensor, List[torch.Tensor]]:
+        """Compute L2 norm after unscaling.
+
+        Note: scale parameter is actually inv_scale (1/loss_scale).
+        Unscaling means multiplying by inv_scale (= dividing by loss_scale).
+        """
+        if noop_flag.item() != 0:
+            if per_tensor:
+                return [torch.tensor(0.0, device=t.device) for t in tensor_lists[0]]
+            else:
+                return torch.tensor(0.0, device=tensor_lists[0][0].device)
+
+        # Multiply by inv_scale (scale parameter is actually inverse scale)
+        unscaled_tensors = []
+        for tensor in tensor_lists[0]:
+            unscaled_tensors.append(tensor * scale.item())
+
+        return multi_tensor_l2norm_torch(chunk_size, noop_flag, [unscaled_tensors], per_tensor)
+
+    def multi_tensor_adam(self, *args, **kwargs):
+        if not args and not kwargs:
+            return multi_tensor_adam_torch
+        return multi_tensor_adam_torch(*args, **kwargs)
+
+    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+        raise NotImplementedError("multi_tensor_adam_param_remainder - not implemented in reference backend")
+
+    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        raise NotImplementedError("multi_tensor_adam_fp8 - not implemented in reference backend")
+
+    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        raise NotImplementedError("multi_tensor_adam_capturable - not implemented in reference backend")
+
+    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        raise NotImplementedError("multi_tensor_adam_capturable_master - not implemented in reference backend")
+
+    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        return multi_tensor_sgd_torch(*args, **kwargs)
+
+    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        raise NotImplementedError("multi_tensor_compute_scale_and_scale_inv - not implemented in reference backend")
+
+    def bulk_overlap_ag_with_external_gemm(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("bulk_overlap_ag_with_external_gemm - not implemented in reference backend")
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        return FP8TensorMeta()
+
+    def create_comm_overlap_helper(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("create_comm_overlap_helper - not implemented in reference backend")
+
+    def create_comm_overlap(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("create_comm_overlap - not implemented in reference backend")
+
+    def create_comm_overlap_p2p(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("create_comm_overlap_p2p - not implemented in reference backend")
diff --git a/transformer_engine/plugin/core/backends/reference/register_ops.py b/transformer_engine/plugin/core/backends/reference/register_ops.py
new file mode 100644
index 0000000000..43a652843d
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/reference/register_ops.py
@@ -0,0 +1,197 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Reference backend operator registrations.
+
+This module registers all REFERENCE (PyTorch) implementations.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from ...types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all PyTorch (REFERENCE) operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    from .reference import ReferenceBackend
+
+    # Create a backend instance to access the methods
+    backend = ReferenceBackend()
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        # Normalization
+        OpImpl(op_name="rmsnorm_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="rmsnorm_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="rmsnorm_bwd_add", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="layernorm_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="layernorm_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor=None, priority=50),
+
+        # GEMM
+        OpImpl(op_name="generic_gemm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="te_general_grouped_gemm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor=None, priority=50),
+
+        # Quantization
+        OpImpl(op_name="quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.quantize, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dequantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dequantize, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="bgrad_quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="split_quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.split_quantize, is_avail), vendor=None, priority=50),
+
+        # Activations - Forward
+        OpImpl(op_name="gelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.gelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="geglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.geglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="qgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.qgelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="qgeglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.qgeglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="relu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.relu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="reglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.reglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="srelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.srelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="sreglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.sreglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="silu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.silu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="swiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.swiglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="clamped_swiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor=None, priority=50),
+
+        # Activations - Backward
+        OpImpl(op_name="dgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dgelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dgeglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dgeglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dqgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dqgelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dqgeglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="drelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.drelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dreglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dreglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dsrelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dsrelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dsreglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dsreglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dsilu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dsilu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dswiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dswiglu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="clamped_dswiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor=None, priority=50),
+
+        # Activations - Bias + Backward
+        OpImpl(op_name="dbias_dgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dbias_dsilu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dbias_drelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dbias_dqgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dbias_dsrelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor=None, priority=50),
+
+        # Softmax
+        OpImpl(op_name="scaled_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="scaled_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor=None, priority=50),
+
+        # MOE operations
+        OpImpl(op_name="moe_permute_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="moe_permute_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="moe_unpermute_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="moe_unpermute_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor=None, priority=50),
+
+        # Fused attention
+        OpImpl(op_name="get_fused_attn_backend", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_attn_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_attn_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fa_prepare_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fa_prepare_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor=None, priority=50),
+
+        # KV cache
+        OpImpl(op_name="copy_to_kv_cache", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor=None, priority=50),
+
+        # Tensor format conversions
+        OpImpl(op_name="convert_thd_to_bshd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="convert_bshd_to_thd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor=None, priority=50),
+
+        # RoPE (Rotary Position Embedding)
+        OpImpl(op_name="fused_rope_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_rope_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_qkv_rope_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_qkv_rope_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor=None, priority=50),
+
+        # TopK and MOE aux loss
+        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor=None, priority=50),
+
+        # Dropout
+        OpImpl(op_name="dropout_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="dropout_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor=None, priority=50),
+
+        # FP8 operations
+        OpImpl(op_name="fp8_transpose", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="swap_first_dims", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="compute_amax", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.compute_amax, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor=None, priority=50),
+
+        # Padding operations
+        OpImpl(op_name="fused_multi_row_padding", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="fused_multi_row_unpadding", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor=None, priority=50),
+
+        # Library version getters
+        OpImpl(op_name="get_cublasLt_version", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="get_cudnn_version", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="get_num_cublas_streams", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor=None, priority=50),
+
+        # THD (Tensor, Hidden, Dimension) operations
+        OpImpl(op_name="thd_read_half_tensor", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="thd_second_half_lse_correction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="thd_read_second_half_lse", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="thd_out_correction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="thd_grad_correction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="thd_get_partitioned_indices", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor=None, priority=50),
+
+        # NVSHMEM operations
+        OpImpl(op_name="init_nvshmem_backend", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="create_nvshmem_tensor", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="nvshmem_finalize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor=None, priority=50),
+
+        # Multi-tensor optimizer operations
+        OpImpl(op_name="multi_tensor_scale", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_l2norm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_adam", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_sgd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor=None, priority=50),
+
+        # Communication overlap operations
+        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="create_fp8_tensor_meta", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="create_comm_overlap_helper", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="create_comm_overlap", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor=None, priority=50),
+        OpImpl(op_name="create_comm_overlap_p2p", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor=None, priority=50),
+
+        # FlashAttention class getter
+        OpImpl(op_name="get_flash_attention_class", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor=None, priority=50),
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/vendor/__init__.py b/transformer_engine/plugin/core/backends/vendor/__init__.py
new file mode 100644
index 0000000000..ce8eb210bb
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/__init__.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Vendor-specific backend implementations.
+
+This package contains hardware vendor-specific backend implementations
+for TransformerEngine-FL. Each vendor subdirectory should contain its
+own backend implementation.
+"""
+
+from __future__ import annotations
+
+import os
+
+_vendor_loading_errors = []
+
+try:
+    from ..._build_config import SKIP_CUDA_BUILD as _SKIP_CUDA_BUILD_CONFIG
+except ImportError:
+    _SKIP_CUDA_BUILD_CONFIG = bool(int(os.environ.get("TE_FL_SKIP_CUDA", "0")))
+    print(f"Build config not found, using env var: SKIP_CUDA_BUILD={_SKIP_CUDA_BUILD_CONFIG}")
+
+if os.environ.get("TE_FL_SKIP_CUDA"):
+    _SKIP_CUDA_BUILD = bool(int(os.environ.get("TE_FL_SKIP_CUDA", "0")))
+else:
+    _SKIP_CUDA_BUILD = _SKIP_CUDA_BUILD_CONFIG
+
+if not _SKIP_CUDA_BUILD:
+    try:
+        from .cuda import CUDABackend
+    except ImportError as e:
+        _vendor_loading_errors.append(("cuda", "ImportError", str(e)))
+        print(f"Failed to import CUDA vendor backend: {e}")
+    except Exception as e:
+        _vendor_loading_errors.append(("cuda", type(e).__name__, str(e)))
+        print(f"Error loading CUDA vendor backend: {type(e).__name__}: {e}")
+        import traceback
+        traceback.print_exc()
+else:
+    print("CUDA vendor backend skipped (CUDA build was disabled at build time)")
+    _vendor_loading_errors.append(("cuda", "Skipped", "CUDA build was disabled at build time"))
+
+
+def get_vendor_loading_errors():
+    """Get errors that occurred during vendor backend loading."""
+    return _vendor_loading_errors.copy()
+
+
+__all__ = ["get_vendor_loading_errors"]
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/__init__.py b/transformer_engine/plugin/core/backends/vendor/cuda/__init__.py
new file mode 100644
index 0000000000..04b5335bea
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .cuda import CUDABackend
+
+__all__ = ["CUDABackend"]
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
new file mode 100644
index 0000000000..33cc4d5b68
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -0,0 +1,1104 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from ....ops import TEFLBackendBase, FP8TensorMeta
+
+def _load_cuda_libs():
+    import ctypes
+    import os
+    import subprocess
+    from pathlib import Path
+    import importlib.util
+    import sysconfig
+    import platform
+    import glob as glob_module
+
+    def get_ext():
+        system = platform.system()
+        return ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
+
+    ext = get_ext()
+
+    def try_load_lib(name, search_patterns):
+        for env_var in [f"{name.upper()}_HOME", f"{name.upper()}_PATH"]:
+            path = os.environ.get(env_var)
+            if path:
+                libs = glob_module.glob(f"{path}/**/lib{name}{ext}*", recursive=True)
+                if libs:
+                    libs.sort(reverse=True, key=os.path.basename)
+                    try:
+                        return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+                    except:
+                        pass
+
+        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or "/usr/local/cuda"
+        for pattern in search_patterns:
+            libs = glob_module.glob(f"{cuda_home}/**/{pattern}", recursive=True)
+            if libs:
+                libs.sort(reverse=True, key=os.path.basename)
+                try:
+                    return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+                except:
+                    pass
+
+        try:
+            result = subprocess.check_output(f"ldconfig -p | grep 'lib{name}{ext}'", shell=True)
+            for line in result.decode().split('\n'):
+                if f"lib{name}" in line and "=>" in line:
+                    so_path = line.split(">")[1].strip()
+                    if so_path:
+                        return ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL)
+        except:
+            pass
+
+        try:
+            return ctypes.CDLL(f"lib{name}{ext}", mode=ctypes.RTLD_GLOBAL)
+        except:
+            return None
+
+    try:
+        try_load_lib("cudnn", [f"libcudnn{ext}*"])
+        try_load_lib("nvrtc", [f"libnvrtc{ext}*"])
+        try_load_lib("curand", [f"libcurand{ext}*"])
+
+        te_path = Path(importlib.util.find_spec("transformer_engine").origin).parent.parent
+        for search_dir in [te_path, te_path / "transformer_engine"]:
+            if search_dir.exists():
+                matches = list(search_dir.glob(f"libtransformer_engine{ext}*"))
+                if matches:
+                    ctypes.CDLL(str(matches[0]), mode=ctypes.RTLD_GLOBAL)
+                    return True
+        return False
+    except Exception as e:
+        print(f"[CUDA] Failed to load CUDA libs: {e}")
+        return False
+
+_cuda_libs_loaded = False
+
+def _ensure_cuda_libs():
+    global _cuda_libs_loaded
+    if not _cuda_libs_loaded:
+        _cuda_libs_loaded = _load_cuda_libs()
+    return _cuda_libs_loaded
+
+def _check_cuda_available() -> bool:
+    if not torch.cuda.is_available():
+        return False
+
+    import os
+    try:
+        from ...._build_config import SKIP_CUDA_BUILD
+        if SKIP_CUDA_BUILD:
+            print("[CUDA] Disabled: CUDA was skipped at build time")
+            return False
+    except ImportError:
+        if bool(int(os.environ.get("TE_FL_SKIP_CUDA", "0"))):
+            print("[CUDA] Disabled: TE_FL_SKIP_CUDA=1")
+            return False
+
+    try:
+        if not _ensure_cuda_libs():
+            return False
+        import transformer_engine_torch_nv
+        return True
+    except (ImportError, OSError) as e:
+        print(f"[CUDA] Import failed: {e}")
+        return False
+
+def _get_tex():
+    _ensure_cuda_libs()
+    import transformer_engine_torch_nv
+    return transformer_engine_torch_nv
+
+def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
+    if torch_dtype is None:
+        return None
+
+    NativeDType = tex_module.DType
+    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_torch_nv':
+        return torch_dtype
+
+    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
+        from transformer_engine.plugin.core.ops import DType as PyDType
+        if isinstance(torch_dtype, PyDType):
+            dtype_name = torch_dtype.name
+            if hasattr(NativeDType, dtype_name):
+                return getattr(NativeDType, dtype_name)
+
+    dtype_map = {
+        torch.float32: NativeDType.kFloat32,
+        torch.float16: NativeDType.kFloat16,
+        torch.bfloat16: NativeDType.kBFloat16,
+        torch.int32: NativeDType.kInt32,
+        torch.uint8: NativeDType.kByte,
+    }
+
+    if hasattr(torch, 'float8_e4m3fn'):
+        dtype_map[torch.float8_e4m3fn] = NativeDType.kFloat8E4M3
+    if hasattr(torch, 'float8_e5m2'):
+        dtype_map[torch.float8_e5m2] = NativeDType.kFloat8E5M2
+
+    return dtype_map.get(torch_dtype, torch_dtype)
+
+def _convert_dtype_params(func):
+    import functools
+    import inspect
+    import os
+
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        dtype_params = ['otype', 'output_dtype', 'bias_type']
+
+        from transformer_engine.plugin.core.ops import DType as PyDType
+
+        def needs_conversion(val):
+            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
+
+        for param_name in dtype_params:
+            if param_name in kwargs:
+                value = kwargs[param_name]
+                if needs_conversion(value):
+                    converted = self._to_te_dtype(value)
+                    kwargs[param_name] = converted
+
+        sig = inspect.signature(func)
+        param_names = list(sig.parameters.keys())[1:]
+
+        args_list = list(args)
+        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
+            if param_name in dtype_params and needs_conversion(arg_value):
+                converted = self._to_te_dtype(arg_value)
+                args_list[i] = converted
+
+        return func(self, *args_list, **kwargs)
+
+    return wrapper
+
+class CUDABackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return _check_cuda_available()
+
+    def __init__(self):
+        self._tex = None
+
+    def _get_tex(self):
+        if self._tex is None:
+            self._tex = _get_tex()
+        return self._tex
+
+    def _to_te_dtype(self, torch_dtype):
+        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
+
+    def is_available(self) -> bool:
+        return _check_cuda_available()
+
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionCUDA
+        return FlashAttentionCUDA
+
+    def quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        output: Optional[torch.Tensor] = None,
+        noop: Optional[torch.Tensor] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.quantize(tensor, quantizer, output, noop)
+
+    @_convert_dtype_params
+    def dequantize(
+        self,
+        input: torch.Tensor,
+        otype: torch.dtype,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dequantize(input, otype)
+
+    def bgrad_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.bgrad_quantize(input, quantizer)
+
+    @_convert_dtype_params
+    def generic_gemm(
+        self,
+        A: torch.Tensor,
+        transA: bool,
+        B: torch.Tensor,
+        transB: bool,
+        D: torch.Tensor,
+        quantizer: Any,
+        output_dtype: torch.dtype,
+        bias: Optional[torch.Tensor],
+        bias_type: Any,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[Any] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> Any:
+        tex = self._get_tex()
+
+        if bias_type is None:
+            bias_type = self._to_te_dtype(torch.bfloat16)
+
+        return tex.generic_gemm(
+            A, transA, B, transB, D, quantizer, output_dtype,
+            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
+            accumulate, use_split_accumulator, comm_overlap, comm_type,
+            extra_output, bulk_overlap, alpha, beta
+        )
+
+    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm(*args, **kwargs)
+
+    def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.gelu(input, quantizer)
+
+    def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.geglu(input, quantizer)
+    def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgelu(input, quantizer)
+
+    def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgeglu(input, quantizer)
+    def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.relu(input, quantizer)
+
+    def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.reglu(input, quantizer)
+    def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.srelu(input, quantizer)
+
+    def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.sreglu(input, quantizer)
+
+    def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.silu(input, quantizer)
+
+    def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.swiglu(input, quantizer)
+    def clamped_swiglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
+    def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgelu(grad, fwd_input, quantizer)
+    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgeglu(grad, fwd_input, quantizer)
+
+    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgelu(grad, fwd_input, quantizer)
+    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgeglu(grad, fwd_input, quantizer)
+
+    def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.drelu(grad, fwd_input, quantizer)
+    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dreglu(grad, fwd_input, quantizer)
+
+    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsrelu(grad, fwd_input, quantizer)
+    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsreglu(grad, fwd_input, quantizer)
+
+    def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsilu(grad, fwd_input, quantizer)
+    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dswiglu(grad, fwd_input, quantizer)
+
+    def clamped_dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_drelu(grad, fwd_input, quantizer)
+
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dqgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsrelu(grad, fwd_input, quantizer)
+
+    @_convert_dtype_params
+    def layernorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, mu, rsigma = tex.layernorm_fwd(
+            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+        return y, mu, rsigma
+
+    def layernorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dgamma, dbeta
+
+    @_convert_dtype_params
+    def rmsnorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, y_quant, rsigma = tex.rmsnorm_fwd(
+            input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+            if y_quant is not None:
+                y_quant = y_quant.view(*orig_shape)
+        return y, y_quant, rsigma
+
+    def rmsnorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+        eps: float = 1e-5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dw
+
+    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.rmsnorm_bwd_add(*args, **kwargs)
+
+    def multi_tensor_quantize(
+        self,
+        tensor_list: List[torch.Tensor],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
+    def split_quantize(
+        self,
+        tensor: torch.Tensor,
+        split_sections: List[int],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.split_quantize(tensor, split_sections, quantizer_list)
+
+    def moe_permute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_permute_fwd(*args, **kwargs)
+
+    def moe_permute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_permute_bwd(*args, **kwargs)
+
+    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_unpermute_fwd(*args, **kwargs)
+
+    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_unpermute_bwd(*args, **kwargs)
+
+    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+
+    def scaled_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale)
+
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
+
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
+
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        tex = self._get_tex()
+
+        args_list = list(args)
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+
+            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
+                return py_enum
+
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+
+            if hasattr(py_enum, 'value'):
+                enum_value = int(py_enum.value)
+                for member_name in dir(native_enum_class):
+                    if not member_name.startswith('_'):
+                        try:
+                            member = getattr(native_enum_class, member_name)
+                            if hasattr(member, 'value') and int(member.value) == enum_value:
+                                return member
+                        except:
+                            pass
+
+            if hasattr(py_enum, 'value'):
+                return int(py_enum.value)
+
+            return py_enum
+
+        if len(args) > 1:
+            args_list[1] = self._to_te_dtype(args[1])
+        if len(args) > 2:
+            args_list[2] = self._to_te_dtype(args[2])
+        if len(args) > 3:
+            args_list[3] = convert_enum(args[3], tex.NVTE_QKV_Layout)
+        if len(args) > 4:
+            args_list[4] = convert_enum(args[4], tex.NVTE_Bias_Type)
+        if len(args) > 5:
+            args_list[5] = convert_enum(args[5], tex.NVTE_Mask_Type)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_Softmax_Type)
+
+        return tex.get_fused_attn_backend(*args_list, **kwargs)
+
+    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
+                return py_enum
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+            return py_enum
+
+        args_list = list(args)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_QKV_Layout)
+        if len(args) > 7:
+            args_list[7] = convert_enum(args[7], tex.NVTE_Bias_Type)
+        if len(args) > 8:
+            args_list[8] = convert_enum(args[8], tex.NVTE_Mask_Type)
+        if len(args) > 9:
+            args_list[9] = convert_enum(args[9], tex.NVTE_Softmax_Type)
+
+        return tex.fused_attn_fwd(*args_list, **kwargs)
+
+    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
+                return py_enum
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+            return py_enum
+
+        args_list = list(args)
+        if len(args) > 5:
+            args_list[5] = convert_enum(args[5], tex.NVTE_QKV_Layout)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_Bias_Type)
+        if len(args) > 7:
+            args_list[7] = convert_enum(args[7], tex.NVTE_Mask_Type)
+        if len(args) > 8:
+            args_list[8] = convert_enum(args[8], tex.NVTE_Softmax_Type)
+        if len(args) > 19:
+            args_list[19] = self._to_te_dtype(args[19])
+
+        if 'dqkv_dtype' in kwargs:
+            kwargs['dqkv_dtype'] = self._to_te_dtype(kwargs['dqkv_dtype'])
+
+        return tex.fused_attn_bwd(*args_list, **kwargs)
+
+    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_fwd(*args, **kwargs)
+
+    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_bwd(*args, **kwargs)
+
+    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.copy_to_kv_cache(*args, **kwargs)
+
+    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_thd_to_bshd(*args, **kwargs)
+
+    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_bshd_to_thd(*args, **kwargs)
+
+    def fused_rope_forward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_rope_forward(*args, **kwargs)
+
+    def fused_rope_backward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_rope_backward(*args, **kwargs)
+
+    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_forward(*args, **kwargs)
+
+    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_backward(*args, **kwargs)
+
+    def fused_topk_with_score_function_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        num_groups: int,
+        group_topk: int,
+        scaling_factor: float,
+        score_function: Any,
+        expert_bias: Optional[torch.Tensor],
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_fwd(
+            logits, topk, use_pre_softmax, num_groups, group_topk,
+            scaling_factor, score_function, expert_bias
+        )
+
+    def fused_topk_with_score_function_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        routing_map: torch.Tensor,
+        intermediate_output: torch.Tensor,
+        grad_probs: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        scaling_factor: float,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_bwd(
+            num_tokens, num_experts, routing_map, intermediate_output,
+            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+        )
+
+    def fused_score_for_moe_aux_loss_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
+
+    def fused_score_for_moe_aux_loss_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        intermediate_output: torch.Tensor,
+        grad_scores: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_bwd(
+            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+        )
+
+    def fused_moe_aux_loss_fwd(
+        self,
+        probs: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        total_num_tokens: int,
+        num_experts: int,
+        num_rows: int,
+        num_cols: int,
+        topk: int,
+        coeff: float,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_fwd(
+            probs, tokens_per_expert, total_num_tokens, num_experts,
+            num_rows, num_cols, topk, coeff
+        )
+
+    def fused_moe_aux_loss_bwd(
+        self,
+        Const_buf: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        num_rows: int,
+        num_cols: int,
+        grad_aux_loss: torch.Tensor,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
+
+    def dropout_fwd(
+        self,
+        input: torch.Tensor,
+        dropout_probability: float,
+        out: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.dropout_fwd(input, dropout_probability, out)
+
+    def dropout_bwd(
+        self,
+        grad_output: torch.Tensor,
+        mask: torch.Tensor,
+        dropout_probability: float,
+        grad_input: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
+
+    def fp8_transpose(
+        self,
+        input: torch.Tensor,
+        dtype: Any,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_transpose(input, dtype, out=out)
+
+    def swap_first_dims(
+        self,
+        tensor: torch.Tensor,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.swap_first_dims(tensor, out=out)
+
+    def compute_amax(
+        self,
+        input: torch.Tensor,
+        amax: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.compute_amax(input, amax)
+
+    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
+
+    def fp8_block_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
+
+    def fp8_block_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: Any,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
+
+    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_padding(*args, **kwargs)
+
+    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(*args, **kwargs)
+
+    def get_cublasLt_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cublasLt_version()
+
+    def get_cudnn_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cudnn_version()
+
+    def get_num_cublas_streams(self) -> int:
+        tex = self._get_tex()
+        return tex.get_num_cublas_streams()
+
+    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_half_tensor(*args, **kwargs)
+
+    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_second_half_lse_correction(*args, **kwargs)
+
+    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_second_half_lse(*args, **kwargs)
+
+    def thd_out_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_out_correction(*args, **kwargs)
+
+    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_grad_correction(*args, **kwargs)
+
+    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_get_partitioned_indices(*args, **kwargs)
+
+    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.init_nvshmem_backend(*args, **kwargs)
+
+    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.create_nvshmem_tensor(*args, **kwargs)
+
+    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_send_on_current_stream(*args, **kwargs)
+
+    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_wait_on_current_stream(*args, **kwargs)
+
+    def nvshmem_finalize(self) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_finalize()
+
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
+    def multi_tensor_unscale_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
+
+    def multi_tensor_adam(
+        self,
+        chunk_size: int = None,
+        noop_flag: torch.Tensor = None,
+        tensor_lists: List[List[torch.Tensor]] = None,
+        lr: float = None,
+        beta1: float = None,
+        beta2: float = None,
+        eps: float = None,
+        step: int = None,
+        mode: int = None,
+        bias_correction: int = None,
+        weight_decay: float = None,
+    ):
+        tex = self._get_tex()
+        if chunk_size is None:
+            return tex.multi_tensor_adam
+        tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
+            eps, step, mode, bias_correction, weight_decay
+        )
+
+    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
+
+    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_fp8(*args, **kwargs)
+
+    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable(*args, **kwargs)
+
+    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
+
+    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_sgd(*args, **kwargs)
+
+    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+
+    def bulk_overlap_ag_with_external_gemm(
+        self,
+        allgather_communicator: Any,
+        send_stream: Any,
+        recv_stream: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        tex = self._get_tex()
+        return tex.FP8TensorMeta()
+
+    def create_comm_overlap_helper(
+        self,
+        world_group: Optional[Any] = None,
+        intra_node_group: Optional[Any] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        if world_group is None:
+            return tex.CommOverlapHelper()
+        return tex.CommOverlapHelper(world_group, intra_node_group)
+
+    def create_comm_overlap(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        num_splits: int = 3,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 2,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 16,
+        set_sm_margin: bool = True,
+        atomic_gemm: bool = False,
+        rs_overlap_first_gemm: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlap(
+            buffer_shape, buffer_dtype, helper, tp_size,
+            num_splits, num_max_streams, comm_cga_size,
+            gemm_priority, comm_priority, num_comm_sm,
+            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+        )
+
+    def create_comm_overlap_p2p(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        comm_type: Any,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 1,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 1,
+        set_sm_margin: bool = False,
+        atomic_gemm: bool = False,
+        use_ce: bool = True,
+        aggregate: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlapP2P(
+            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
+            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
+            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+        )
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
new file mode 100644
index 0000000000..9a972a07d2
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from contextlib import nullcontext
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from transformer_engine.plugin.core.ops import FlashAttentionBase
+
+
+class FlashAttentionCUDA(FlashAttentionBase):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
+
+        # Store initialization parameters for lazy loading
+        self._init_params = {
+            'softmax_scale': softmax_scale,
+            'attention_dropout': attention_dropout,
+            'attention_dropout_ctx': attention_dropout_ctx or nullcontext,
+            'attention_type': attention_type,
+            'layer_number': layer_number,
+            'deterministic': deterministic,
+        }
+        self._native_flash_attn = None
+
+    def _ensure_native_flash_attn(self):
+        """Lazy initialization of native FlashAttention."""
+        if self._native_flash_attn is not None:
+            return
+
+        try:
+            # Import here to avoid circular dependency issues
+            # transformer_engine_torch must be registered before this import
+            from transformer_engine.pytorch.attention.dot_product_attention.backends import (
+                FlashAttention as FlashAttentionNative,
+            )
+
+            if FlashAttentionNative is None:
+                raise RuntimeError("FlashAttention class is None - flash-attn may not be installed correctly")
+
+            self._native_flash_attn = FlashAttentionNative(**self._init_params)
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"Failed to import native FlashAttention: {e}. "
+                "Please ensure flash-attn is installed and transformer_engine_torch is available."
+            )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to initialize native FlashAttention: {e}. "
+                f"Init params: {self._init_params}"
+            )
+
+    @property
+    def backend_name(self) -> str:
+        return "cuda"
+
+    def forward(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        # Ensure native flash attention is initialized
+        self._ensure_native_flash_attn()
+
+        return self._native_flash_attn(
+            query_layer=query_layer,
+            key_layer=key_layer,
+            value_layer=value_layer,
+            attention_mask=attention_mask,
+            qkv_layout=qkv_layout,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_kv=cu_seqlens_kv,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_kv=max_seqlen_kv,
+            attn_mask_type=attn_mask_type,
+            window_size=window_size,
+            alibi_slopes=alibi_slopes,
+            cp_group=cp_group,
+            cp_global_ranks=cp_global_ranks,
+            cp_stream=cp_stream,
+            cp_comm_type=cp_comm_type,
+            fp8=fp8,
+            fp8_meta=fp8_meta,
+            quantizers=quantizers,
+            inference_params=inference_params,
+            flash_attention_backend=flash_attention_backend,
+            fp8_output=fp8_output,
+        )
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
new file mode 100644
index 0000000000..eea8999ae9
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+CUDA vendor backend operator registrations.
+
+This module registers all VENDOR (CUDA) implementations from transformer_engine_torch.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from ....types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all CUDA (VENDOR) operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    # Import CUDA backend to get all the wrapped tex functions
+    from .cuda import CUDABackend
+
+    # Create a backend instance to access the methods
+    backend = CUDABackend()
+
+    # Check if CUDA is available before registering
+    if not backend.is_available():
+        return
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        # Normalization
+        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="layernorm_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="layernorm_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="CUDA", priority=100),
+
+        # GEMM
+        OpImpl(op_name="generic_gemm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="CUDA", priority=100),
+
+        # Quantization
+        OpImpl(op_name="quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dequantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="bgrad_quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="split_quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="CUDA", priority=100),
+
+        # Activations - Forward
+        OpImpl(op_name="gelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="geglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="qgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="qgeglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="relu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="reglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="srelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="sreglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="silu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="swiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="clamped_swiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="CUDA", priority=100),
+
+        # Activations - Backward
+        OpImpl(op_name="dgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dgeglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dqgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dqgeglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="drelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dreglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dsrelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dsreglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dsilu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dswiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="CUDA", priority=100),
+
+        # Activations - Bias + Backward
+        OpImpl(op_name="dbias_dgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dbias_dsilu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dbias_drelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="CUDA", priority=100),
+
+        # Softmax
+        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="CUDA", priority=100),
+
+        # MOE operations
+        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="CUDA", priority=100),
+
+        # Fused attention
+        OpImpl(op_name="get_fused_attn_backend", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_attn_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_attn_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="CUDA", priority=100),
+
+        # KV cache
+        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="CUDA", priority=100),
+
+        # Tensor format conversions
+        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="CUDA", priority=100),
+
+        # RoPE (Rotary Position Embedding)
+        OpImpl(op_name="fused_rope_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_rope_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="CUDA", priority=100),
+
+        # TopK and MOE aux loss
+        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="CUDA", priority=100),
+
+        # Dropout
+        OpImpl(op_name="dropout_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="dropout_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="CUDA", priority=100),
+
+        # FP8 operations
+        OpImpl(op_name="fp8_transpose", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="swap_first_dims", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="compute_amax", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="CUDA", priority=100),
+
+        # Padding operations
+        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="CUDA", priority=100),
+
+        # Library version getters
+        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="get_cudnn_version", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="CUDA", priority=100),
+
+        # THD (Tensor, Hidden, Dimension) operations
+        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="thd_out_correction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="thd_grad_correction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="CUDA", priority=100),
+
+        # NVSHMEM operations
+        OpImpl(op_name="init_nvshmem_backend", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="create_nvshmem_tensor", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="nvshmem_finalize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor="CUDA", priority=100),
+
+        # Multi-tensor operations
+        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="CUDA", priority=100),
+
+        # Communication overlap operations
+        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="create_comm_overlap", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="CUDA", priority=100),
+        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="CUDA", priority=100),
+
+        # FlashAttention class getter
+        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="CUDA", priority=100),
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/builtin_ops.py b/transformer_engine/plugin/core/builtin_ops.py
new file mode 100644
index 0000000000..408e6ed8c1
--- /dev/null
+++ b/transformer_engine/plugin/core/builtin_ops.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Built-in operator implementations registration.
+
+This module registers DEFAULT (FlagOS) and REFERENCE (PyTorch) implementations
+for all supported operators by calling register_builtins from each backend.
+"""
+
+from __future__ import annotations
+
+from .registry import OpRegistry
+
+
+def register_builtins(registry: OpRegistry) -> None:
+    """
+    Register all built-in operator implementations.
+
+    This function registers:
+    - DEFAULT implementations (FlagOS/flag_gems)
+    - REFERENCE implementations (PyTorch)
+    - VENDOR implementations (CUDA, if available)
+
+    Args:
+        registry: Registry to register into
+    """
+    # Register FlagOS (DEFAULT) implementations
+    try:
+        from .backends.flagos.register_ops import register_builtins as register_flagos
+        register_flagos(registry)
+    except Exception as e:
+        print(f"[WARNING] Failed to register FlagOS operators: {e}")
+    
+    # Register PyTorch (REFERENCE) implementations
+    try:
+        from .backends.reference.register_ops import register_builtins as register_reference
+        register_reference(registry)
+    except Exception as e:
+        print(f"[WARNING] Failed to register Reference operators: {e}")
+    
+    # Register CUDA (VENDOR) implementations
+    try:
+        from .backends.vendor.cuda.register_ops import register_builtins as register_cuda
+        register_cuda(registry)
+    except Exception as e:
+        # CUDA may not be available, this is expected
+        pass
diff --git a/transformer_engine/plugin/core/discovery.py b/transformer_engine/plugin/core/discovery.py
new file mode 100644
index 0000000000..cc6280eda7
--- /dev/null
+++ b/transformer_engine/plugin/core/discovery.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from __future__ import annotations
+
+import importlib
+import os
+import sys
+from typing import Any, Callable, List, Optional, Tuple
+
+from .logger_manager import get_logger
+
+PLUGIN_GROUP = "te_fl.plugin"
+
+PLUGIN_MODULES_ENV = "TE_FL_PLUGIN_MODULES"
+
+logger = get_logger()
+
+_discovered_plugin: List[Tuple[str, str, bool]] = []
+
+def _log_debug(msg: str) -> None:
+    logger.debug(msg)
+
+def _log_info(msg: str) -> None:
+    logger.info(msg)
+
+def _log_warning(msg: str) -> None:
+    logger.warning(msg)
+
+def _log_error(msg: str) -> None:
+    logger.error(msg)
+
+def _get_entry_points():
+    try:
+        from importlib.metadata import entry_points
+    except ImportError:
+        try:
+            from importlib_metadata import entry_points
+        except ImportError:
+            _log_debug("importlib.metadata not available, skipping entry points discovery")
+            return []
+
+    try:
+        eps = entry_points()
+
+        if hasattr(eps, "select"):
+            return list(eps.select(group=PLUGIN_GROUP))
+
+        if isinstance(eps, dict):
+            return eps.get(PLUGIN_GROUP, [])
+
+        if hasattr(eps, "get"):
+            return eps.get(PLUGIN_GROUP, [])
+
+        return []
+
+    except Exception as e:
+        _log_warning(f"Error accessing entry points: {e}")
+        return []
+
+def _call_register_function(
+    obj: Any,
+    registry_module: Any,
+    source_name: str,
+) -> bool:
+    if callable(obj) and not isinstance(obj, type):
+        try:
+            obj(registry_module)
+            _log_info(f"Registered plugin from {source_name} (direct callable)")
+            return True
+        except Exception as e:
+            _log_error(f"Error calling plugin {source_name}: {e}")
+            return False
+
+    register_fn = getattr(obj, "te_fl_register", None) or getattr(obj, "register", None)
+
+    if callable(register_fn):
+        try:
+            register_fn(registry_module)
+            _log_info(f"Registered plugin from {source_name}")
+            return True
+        except Exception as e:
+            _log_error(f"Error calling register function in {source_name}: {e}")
+            return False
+
+    _log_debug(f"No register function found in {source_name}")
+    return False
+
+def discover_from_entry_points(registry_module: Any) -> int:
+    loaded = 0
+    entry_points_list = _get_entry_points()
+
+    if not entry_points_list:
+        _log_debug("No entry points found for group: " + PLUGIN_GROUP)
+        return 0
+
+    _log_debug(f"Found {len(entry_points_list)} entry points")
+
+    for ep in entry_points_list:
+        ep_name = getattr(ep, "name", str(ep))
+        try:
+            _log_debug(f"Loading entry point: {ep_name}")
+            obj = ep.load()
+
+            if _call_register_function(obj, registry_module, f"entry_point:{ep_name}"):
+                _discovered_plugin.append((ep_name, "entry_point", True))
+                loaded += 1
+            else:
+                _discovered_plugin.append((ep_name, "entry_point", False))
+
+        except Exception as e:
+            _log_error(f"Failed to load entry point {ep_name}: {e}")
+            _discovered_plugin.append((ep_name, "entry_point", False))
+
+    return loaded
+
+def discover_from_env_modules(registry_module: Any) -> int:
+    modules_str = os.environ.get(PLUGIN_MODULES_ENV, "").strip()
+
+    if not modules_str:
+        return 0
+
+    loaded = 0
+    module_names = [m.strip() for m in modules_str.split(",") if m.strip()]
+
+    _log_debug(f"Loading plugin from env var: {module_names}")
+
+    for mod_name in module_names:
+        try:
+            _log_debug(f"Importing module: {mod_name}")
+            mod = importlib.import_module(mod_name)
+
+            if _call_register_function(mod, registry_module, f"env_module:{mod_name}"):
+                _discovered_plugin.append((mod_name, "env_module", True))
+                loaded += 1
+            else:
+                _discovered_plugin.append((mod_name, "env_module", False))
+
+        except ImportError as e:
+            _log_error(f"Failed to import plugin module {mod_name}: {e}")
+            _discovered_plugin.append((mod_name, "env_module", False))
+        except Exception as e:
+            _log_error(f"Error loading plugin module {mod_name}: {e}")
+            _discovered_plugin.append((mod_name, "env_module", False))
+
+    return loaded
+
+def discover_plugin(registry_module: Any) -> int:
+    """
+    Main plugin discovery function.
+
+    Discovers and registers plugin from:
+    1. Entry points (group: 'te_fl.plugin')
+    2. Environment variable modules (TE_FL_PLUGIN_MODULES)
+
+    Args:
+        registry_module: OpRegistry instance to register plugin to
+
+    Returns:
+        Number of successfully loaded plugin
+    """
+    if registry_module is None:
+        _log_warning("Registry module is None, skipping plugin discovery")
+        return 0
+
+    _log_debug("Starting plugin discovery...")
+
+    total = 0
+
+    total += discover_from_entry_points(registry_module)
+
+    total += discover_from_env_modules(registry_module)
+
+    _log_debug(f"Plugin discovery complete. Loaded {total} plugin.")
+
+    return total
+
+# Alias for compatibility with different naming conventions
+discover_op_plugin = discover_plugin
+
+def get_discovered_plugin() -> List[Tuple[str, str, bool]]:
+    """Get list of discovered plugin (name, source, success)"""
+    return _discovered_plugin.copy()
+
+def clear_discovered_plugin() -> None:
+    """Clear the discovered plugin list (for testing)"""
+    _discovered_plugin.clear()
+
+
diff --git a/transformer_engine/plugin/core/logger_manager.py b/transformer_engine/plugin/core/logger_manager.py
new file mode 100644
index 0000000000..9d13aa2f63
--- /dev/null
+++ b/transformer_engine/plugin/core/logger_manager.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import logging
+import sys
+import os
+import threading
+
+class Logger:
+    def __init__(self, name, level=logging.INFO):
+        self.logger = logging.getLogger(name)
+        self.logger.setLevel(level)
+        self.logger.propagate = False
+        for handler in self.logger.handlers[:]:
+            self.logger.removeHandler(handler)
+
+        formatter = logging.Formatter(
+            "[%(asctime)s %(name)s %(filename)s:%(lineno)d %(levelname)s] %(message)s"
+        )
+
+        stream_handler = logging.StreamHandler(sys.stdout)
+        stream_handler.setFormatter(formatter)
+
+        self.logger.addHandler(stream_handler)
+        self._printed_once = set()
+
+    def info(self, message):
+        self.logger.info(message, stacklevel=2)
+
+    def warning(self, message):
+        self.logger.warning(message, stacklevel=2)
+
+    def error(self, message):
+        self.logger.error(message, stacklevel=2)
+
+    def critical(self, message):
+        self.logger.critical(message, stacklevel=2)
+
+    def debug(self, message):
+        self.logger.debug(message, stacklevel=2)
+
+    def info_once(self, message):
+        if message not in self._printed_once:
+            self._printed_once.add(message)
+            self.logger.info(message, stacklevel=2)
+
+    def warning_once(self, message):
+        if message not in self._printed_once:
+            self._printed_once.add(message)
+            self.logger.warning(message, stacklevel=2)
+
+    def debug_once(self, message):
+        if message not in self._printed_once:
+            self._printed_once.add(message)
+            self.logger.debug(message, stacklevel=2)
+
+class LoggerManager:
+    _instance = None
+    _lock = threading.Lock()
+
+    def __init__(self):
+        if hasattr(self, '_global_logger'):
+            return
+
+        self._global_logger = None
+        self._global_printed_once = set()
+        self._printed_once_lock = threading.Lock()
+
+    @classmethod
+    def get_instance(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = cls.__new__(cls)
+                    cls._instance.__init__()
+        return cls._instance
+
+    def get_logger(self):
+        if self._global_logger is None:
+            with self._lock:
+                if self._global_logger is None:
+                    level = os.getenv("TEFL_LOG_LEVEL", "INFO").upper()
+                    self._global_logger = Logger("TE-FL", level)
+        return self._global_logger
+
+    def print_once(self, message):
+        with self._printed_once_lock:
+            if message not in self._global_printed_once:
+                self._global_printed_once.add(message)
+                print(message)
+
+    def debug_print_once(self, func_name: str, backend_name: str = "Backend", *args, **kwargs):
+        key = f"{backend_name}.{func_name}"
+
+        with self._printed_once_lock:
+            if key not in self._global_printed_once:
+                self._global_printed_once.add(key)
+                print(f"[{backend_name}] Calling {func_name}")
+                if args:
+                    print(f"  args: {[type(a).__name__ for a in args[:5]]}...")
+                if kwargs:
+                    print(f"  kwargs: {list(kwargs.keys())[:5]}...")
+                print(f"[{backend_name}] {func_name} completed successfully")
+
+    def reset(self):
+        with self._lock:
+            with self._printed_once_lock:
+                self._global_logger = None
+                self._global_printed_once.clear()
+
+def get_logger():
+    return LoggerManager.get_instance().get_logger()
+
+def print_once(message):
+    LoggerManager.get_instance().print_once(message)
+
+def debug_print_once(func_name: str, backend_name: str = "Backend", *args, **kwargs):
+    LoggerManager.get_instance().debug_print_once(func_name, backend_name, *args, **kwargs)
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/manager.py b/transformer_engine/plugin/core/manager.py
new file mode 100644
index 0000000000..51a532f7ec
--- /dev/null
+++ b/transformer_engine/plugin/core/manager.py
@@ -0,0 +1,478 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from __future__ import annotations
+
+import os
+import threading
+from dataclasses import dataclass
+from typing import Callable, Dict, Optional, Tuple
+
+from .discovery import discover_plugin
+from .registry import OpRegistry
+from .policy import SelectionPolicy, get_policy
+from .types import OpImpl, BackendImplKind, match_token
+from .logger_manager import get_logger
+
+logger = get_logger()
+
+
+@dataclass
+class _OpManagerState:
+    """Internal state for OpManager"""
+    init_pid: int = -1
+    initialized: bool = False
+    policy_epoch: int = 0
+
+
+class OpManager:
+    """
+    Main manager for operator dispatching and selection.
+
+    Responsibilities:
+    - Lazy initialization and plugin discovery
+    - Multi-process safety (PID detection + at_fork)
+    - Policy-based operator selection
+    - Dispatch caching with invalidation
+    """
+
+    def __init__(self, registry: Optional[OpRegistry] = None) -> None:
+        self._lock = threading.RLock()
+        self._registry = registry or OpRegistry()
+        self._state = _OpManagerState()
+        self._dispatch_cache: Dict[Tuple[str, str, int], Callable] = {}
+        self._called_ops: Dict[str, str] = {}  # Map op_name -> last_used_impl_id (for logging)
+
+        # Register at_fork handler for multi-process safety
+        try:
+            os.register_at_fork(after_in_child=self._reset_after_fork)
+        except AttributeError:
+            # os.register_at_fork not available (Windows)
+            pass
+
+    @property
+    def registry(self) -> OpRegistry:
+        """Get the underlying operator registry"""
+        return self._registry
+
+    def _reset_after_fork(self) -> None:
+        """Reset state after process fork"""
+        with self._lock:
+            self._state.initialized = False
+            self._state.init_pid = -1
+            self._state.policy_epoch += 1
+            self._dispatch_cache.clear()
+            self._called_ops.clear()
+            logger.debug("OpManager reset after fork")
+
+    def bump_policy_epoch(self) -> None:
+        """
+        Increment policy epoch to invalidate dispatch cache.
+
+        Call this when policy changes at runtime.
+        """
+        with self._lock:
+            self._state.policy_epoch += 1
+            self._dispatch_cache.clear()
+            logger.debug(f"Policy epoch bumped to {self._state.policy_epoch}")
+
+    def ensure_initialized(self) -> None:
+        """
+        Ensure the manager is initialized in the current process.
+
+        Performs:
+        1. PID check (multi-process safety)
+        2. Register built-in operator implementations
+        3. Discover and register plugin
+        """
+        with self._lock:
+            pid = os.getpid()
+
+            # Check if already initialized in this process
+            if self._state.initialized and self._state.init_pid == pid:
+                return
+
+            logger.debug(f"Initializing OpManager in PID {pid}")
+
+            # Mark as initialized
+            self._state.initialized = True
+            self._state.init_pid = pid
+
+            # Register built-in operators
+            from . import builtin_ops
+            builtin_ops.register_builtins(self._registry)
+
+            # Discover and register plugin
+            discover_plugin(self._registry)
+
+            # Invalidate cache
+            self._state.policy_epoch += 1
+            self._dispatch_cache.clear()
+
+            # Print initialization summary
+            snap = self._registry.snapshot()
+            total_ops = len(snap.impls_by_op)
+            total_impls = sum(len(impls) for impls in snap.impls_by_op.values())
+
+            logger.info(f"OpManager initialized: {total_ops} ops with {total_impls} implementations")
+
+            # Group implementations by kind for summary
+            vendor_count = sum(1 for impls in snap.impls_by_op.values()
+                             for impl in impls if impl.kind == BackendImplKind.VENDOR)
+            reference_count = sum(1 for impls in snap.impls_by_op.values()
+                                for impl in impls if impl.kind == BackendImplKind.REFERENCE)
+            default_count = sum(1 for impls in snap.impls_by_op.values()
+                              for impl in impls if impl.kind == BackendImplKind.DEFAULT)
+
+            logger.debug(f"  Vendor: {vendor_count}, Default: {default_count}, Reference: {reference_count}")
+
+            # List all registered impl_ids
+            if logger.logger.isEnabledFor(logger.logger.level):
+                impl_ids = sorted(set(impl.impl_id for impls in snap.impls_by_op.values() for impl in impls))
+                logger.info(f"Registered impl_ids: {impl_ids}")
+
+    def _matches_vendor_filters(self, impl: OpImpl, policy: SelectionPolicy) -> bool:
+        """Check if implementation matches policy vendor filters"""
+        if impl.kind != BackendImplKind.VENDOR:
+            return True
+
+        if impl.vendor is None:
+            return False
+
+        # Check deny list
+        if impl.vendor in policy.deny_vendors:
+            return False
+
+        # Check allow list (if specified)
+        if policy.allow_vendors is not None and impl.vendor not in policy.allow_vendors:
+            return False
+
+        return True
+
+    def _default_order(self, policy: SelectionPolicy) -> list[str]:
+        """Get default selection order based on policy"""
+        return policy.get_default_order()
+
+    def resolve(self, op_name: str) -> Callable:
+        """
+        Resolve and return the best implementation for an operator.
+
+        Selection process:
+        1. Check dispatch cache
+        2. Get all registered implementations
+        3. Filter by policy (vendor allow/deny)
+        4. Filter by availability (is_available())
+        5. Select best match using per-op order or default order
+        6. Cache the result
+
+        Args:
+            op_name: Name of the operator to resolve
+
+        Returns:
+            Callable implementation function
+
+        Raises:
+            RuntimeError: If no implementation found
+        """
+        self.ensure_initialized()
+
+        policy = get_policy()
+        policy_fp = policy.fingerprint()
+        epoch = self._state.policy_epoch
+
+        # Check cache
+        cache_key = (op_name, policy_fp, epoch)
+        cached = self._dispatch_cache.get(cache_key)
+        if cached is not None:
+            return cached
+
+        # Get all implementations for this operator
+        snap = self._registry.snapshot()
+        candidates = list(snap.impls_by_op.get(op_name, []))
+
+        # Filter by vendor policy
+        candidates = [c for c in candidates if self._matches_vendor_filters(c, policy)]
+
+        # Filter by availability
+        available: list[OpImpl] = []
+        for c in candidates:
+            try:
+                if c.is_available():
+                    available.append(c)
+                else:
+                    logger.debug(f"Implementation {c.impl_id} not available for op={op_name}")
+            except Exception as e:
+                logger.warning(f"Error checking availability of {c.impl_id}: {e}")
+                continue
+
+        candidates = available
+
+        if not candidates:
+            raise RuntimeError(
+                f"No available implementation for op='{op_name}'. "
+                f"Registered: {[impl.impl_id for impl in snap.impls_by_op.get(op_name, [])]}"
+            )
+
+        # Get selection order (per-op or default)
+        order = policy.per_op_order_dict.get(op_name) or self._default_order(policy)
+
+        # Select best implementation
+        chosen: Optional[OpImpl] = None
+        for token in order:
+            matches = [c for c in candidates if match_token(c, token)]
+            if not matches:
+                continue
+
+            # Sort by priority (higher first), then by impl_id for stability
+            matches.sort(key=lambda x: (x.priority, x.impl_id), reverse=True)
+            chosen = matches[0]
+            break
+
+        if chosen is None:
+            if policy.strict:
+                raise RuntimeError(
+                    f"No implementation available for op='{op_name}' under strict policy. "
+                    f"Candidates: {[c.impl_id for c in candidates]}"
+                )
+            raise RuntimeError(
+                f"No implementation selected for op='{op_name}'. "
+                f"Candidates: {[c.impl_id for c in candidates]}, Order: {order}"
+            )
+
+        # Cache the result
+        self._dispatch_cache[cache_key] = chosen.fn
+        return chosen.fn
+
+    def resolve_candidates(self, op_name: str) -> list[OpImpl]:
+        """
+        Resolve and return all available implementations for an operator,
+        sorted by priority (highest first).
+
+        This is similar to resolve() but returns all viable candidates
+        instead of just the best one. Useful for fallback mechanisms.
+
+        Args:
+            op_name: Name of the operator to resolve
+
+        Returns:
+            List of OpImpl sorted by priority (highest first)
+
+        Raises:
+            RuntimeError: If no implementation found
+        """
+        self.ensure_initialized()
+
+        policy = get_policy()
+
+        # Get all implementations for this operator
+        snap = self._registry.snapshot()
+        candidates = list(snap.impls_by_op.get(op_name, []))
+
+        # Filter by vendor policy
+        candidates = [c for c in candidates if self._matches_vendor_filters(c, policy)]
+
+        # Filter by availability
+        available: list[OpImpl] = []
+        for c in candidates:
+            try:
+                if c.is_available():
+                    available.append(c)
+                else:
+                    logger.debug(f"Implementation {c.impl_id} not available for op={op_name}")
+            except Exception as e:
+                logger.warning(f"Error checking availability of {c.impl_id}: {e}")
+                continue
+
+        candidates = available
+
+        if not candidates:
+            raise RuntimeError(
+                f"No available implementation for op='{op_name}'. "
+                f"Registered: {[impl.impl_id for impl in snap.impls_by_op.get(op_name, [])]}"
+            )
+
+        # Get selection order (per-op or default)
+        order = policy.per_op_order_dict.get(op_name) or self._default_order(policy)
+
+        # Sort candidates by order tokens, then by priority
+        sorted_candidates: list[OpImpl] = []
+        for token in order:
+            matches = [c for c in candidates if match_token(c, token)]
+            if matches:
+                # Sort by priority (higher first), then by impl_id for stability
+                matches.sort(key=lambda x: (x.priority, x.impl_id), reverse=True)
+                sorted_candidates.extend(matches)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_candidates = []
+        for c in sorted_candidates:
+            if c.impl_id not in seen:
+                seen.add(c.impl_id)
+                unique_candidates.append(c)
+
+        if not unique_candidates:
+            raise RuntimeError(
+                f"No implementation selected for op='{op_name}'. "
+                f"Candidates: {[c.impl_id for c in candidates]}, Order: {order}"
+            )
+
+        return unique_candidates
+
+    def call(self, op_name: str, *args, **kwargs):
+        """
+        Resolve and call an operator implementation with optional fallback support.
+
+        When TE_FL_STRICT=1, this method will try alternative implementations
+        if the primary one fails. Otherwise, it behaves like the original implementation.
+
+        Logs on first call or when the implementation changes (e.g., backend switch).
+
+        Args:
+            op_name: Name of the operator
+            *args, **kwargs: Arguments passed to the implementation
+
+        Returns:
+            Result from the implementation
+
+        Raises:
+            RuntimeError: If all implementations fail (when fallback enabled) or
+                         if the primary implementation fails (when fallback disabled)
+        """
+        enable_fallback = os.getenv("TE_FL_STRICT", "1") != "0"
+
+        if not enable_fallback:
+            # Original behavior: use cached resolve() and fast-fail
+            fn = self.resolve(op_name)
+
+            # Get current impl_id to check if it changed
+            impl_id = self.get_selected_impl_id(op_name)
+            last_impl_id = self._called_ops.get(op_name)
+
+            # Log if first call or implementation changed
+            if last_impl_id != impl_id:
+                with self._lock:
+                    # Double-check after acquiring lock
+                    if self._called_ops.get(op_name) != impl_id:
+                        snap = self._registry.snapshot()
+                        for impl in snap.impls_by_op.get(op_name, []):
+                            if impl.impl_id == impl_id:
+                                if last_impl_id is None:
+                                    logger.info(
+                                        f"Op '{op_name}' using '{impl_id}' "
+                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                                    )
+                                else:
+                                    logger.info(
+                                        f"Op '{op_name}' switched from '{last_impl_id}' to '{impl_id}' "
+                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                                    )
+                                break
+                        self._called_ops[op_name] = impl_id
+
+            return fn(*args, **kwargs)
+
+        # Fallback mode: try candidates in priority order
+        candidates = self.resolve_candidates(op_name)
+        last_error = None
+
+        for idx, impl in enumerate(candidates):
+            try:
+                # Log primary implementation or fallback attempts
+                if idx == 0:
+                    # Primary implementation
+                    last_impl_id = self._called_ops.get(op_name)
+                    if last_impl_id != impl.impl_id:
+                        with self._lock:
+                            if self._called_ops.get(op_name) != impl.impl_id:
+                                if last_impl_id is None:
+                                    logger.info(
+                                        f"Op '{op_name}' using '{impl.impl_id}' "
+                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                                    )
+                                else:
+                                    logger.info(
+                                        f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
+                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                                    )
+                                self._called_ops[op_name] = impl.impl_id
+                else:
+                    # Always log fallback attempts (these are important runtime events)
+                    logger.info(
+                        f"Op '{op_name}' fallback to '{impl.impl_id}' "
+                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                    )
+
+                result = impl.fn(*args, **kwargs)
+
+                # Update tracked impl_id on success (for fallback case)
+                if idx > 0:
+                    with self._lock:
+                        self._called_ops[op_name] = impl.impl_id
+
+                return result
+
+            except Exception as e:
+                last_error = e
+                if idx < len(candidates) - 1:
+                    # Not the last candidate, log warning and try next
+                    logger.warning(
+                        f"Implementation '{impl.impl_id}' failed for op '{op_name}': {e}"
+                    )
+                else:
+                    # Last candidate failed, log error
+                    logger.error(
+                        f"Last implementation '{impl.impl_id}' failed for op '{op_name}': {e}"
+                    )
+
+        # All implementations failed
+        raise RuntimeError(
+            f"All {len(candidates)} implementation(s) failed for op='{op_name}'. "
+            f"Last error: {last_error}"
+        ) from last_error
+
+    def get_selected_impl_id(self, op_name: str) -> str:
+        """
+        Get the impl_id of the currently selected implementation.
+
+        Args:
+            op_name: Name of the operator
+
+        Returns:
+            Implementation ID string
+        """
+        fn = self.resolve(op_name)
+
+        # Try to find the impl by function identity
+        snap = self._registry.snapshot()
+        for impl in snap.impls_by_op.get(op_name, []):
+            if impl.fn is fn:
+                return impl.impl_id
+
+        return "unknown"
+
+
+# Global default instance
+_default_manager: Optional[OpManager] = None
+_manager_lock = threading.RLock()
+
+
+def get_default_manager() -> OpManager:
+    """Get or create the global default OpManager instance"""
+    global _default_manager
+
+    if _default_manager is None:
+        with _manager_lock:
+            if _default_manager is None:
+                _default_manager = OpManager()
+
+    return _default_manager
+
+
+def reset_default_manager() -> None:
+    """Reset the global default OpManager (useful for testing)"""
+    global _default_manager
+
+    with _manager_lock:
+        _default_manager = None
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
new file mode 100644
index 0000000000..24d89fb65c
--- /dev/null
+++ b/transformer_engine/plugin/core/ops.py
@@ -0,0 +1,1338 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Type
+from enum import IntEnum
+from contextlib import nullcontext
+
+import torch
+
+class DType(IntEnum):
+    kByte = 0
+    kInt32 = 2
+    kFloat32 = 4
+    kFloat16 = 5
+    kBFloat16 = 6
+    kFloat8E4M3 = 7
+    kFloat8E5M2 = 8
+    kFloat4E2M1 = 10
+
+class Float8BlockScaleTensorFormat(IntEnum):
+    COMPACT = 0
+    GEMM_READY = 1
+
+class NVTE_Activation_Type(IntEnum):
+    NVTE_GELU = 0
+    NVTE_GEGLU = 1
+    NVTE_SILU = 2
+    NVTE_SWIGLU = 3
+    NVTE_RELU = 4
+    NVTE_REGLU = 5
+    NVTE_QGELU = 6
+    NVTE_QGEGLU = 7
+    NVTE_SRELU = 8
+    NVTE_SREGLU = 9
+
+class NVTE_Softmax_Type(IntEnum):
+    NVTE_VANILLA_SOFTMAX = 0
+    NVTE_OFF_BY_ONE_SOFTMAX = 1
+    NVTE_LEARNABLE_SOFTMAX = 2
+
+class CommGemmOverlapRole(IntEnum):
+    INPUT = 0
+    OUTPUT = 1
+
+class FP8FwdTensors(IntEnum):
+    GEMM1_INPUT = 0
+    GEMM1_WEIGHT = 1
+    GEMM1_OUTPUT = 2
+    GEMM2_INPUT = 3
+    GEMM2_WEIGHT = 4
+    GEMM2_OUTPUT = 5
+    GEMM3_INPUT = 6
+    GEMM3_WEIGHT = 7
+    GEMM3_OUTPUT = 8
+
+class FP8BwdTensors(IntEnum):
+    GRAD_OUTPUT1 = 0
+    GRAD_INPUT1 = 1
+    GRAD_OUTPUT2 = 2
+    GRAD_INPUT2 = 3
+    GRAD_OUTPUT3 = 4
+    GRAD_INPUT3 = 5
+
+class NVTE_Bias_Type(IntEnum):
+    NVTE_NO_BIAS = 0
+    NVTE_PRE_SCALE_BIAS = 1
+    NVTE_POST_SCALE_BIAS = 2
+    NVTE_ALIBI = 3
+
+class NVTE_Mask_Type(IntEnum):
+    NVTE_NO_MASK = 0
+    NVTE_PADDING_MASK = 1
+    NVTE_CAUSAL_MASK = 2
+    NVTE_PADDING_CAUSAL_MASK = 3
+    NVTE_CAUSAL_BOTTOM_RIGHT_MASK = 4
+    NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK = 5
+    NVTE_ARBITRARY_MASK = 6
+
+class NVTE_Fused_Attn_Backend(IntEnum):
+    NVTE_No_Backend = 0
+    NVTE_F16_max512_seqlen = 1
+    NVTE_F16_arbitrary_seqlen = 2
+    NVTE_FP8 = 3
+    NVTE_FA3 = 4
+
+class NVTE_QKV_Format(IntEnum):
+    NVTE_BSHD = 0
+    NVTE_SBHD = 1
+    NVTE_THD = 2
+    NVTE_SBHD_2BSHD = 3
+    NVTE_BSHD_2SBHD = 4
+    NVTE_THD_2BSHD = 5
+    NVTE_THD_2SBHD = 6
+
+class NVTE_QKV_Layout(IntEnum):
+    NVTE_SB3HD = 0
+    NVTE_SBH3D = 1
+    NVTE_SBHD_SB2HD = 2
+    NVTE_SBHD_SBH2D = 3
+    NVTE_SBHD_SBHD_SBHD = 4
+    NVTE_BS3HD = 5
+    NVTE_BSH3D = 6
+    NVTE_BSHD_BS2HD = 7
+    NVTE_BSHD_BSH2D = 8
+    NVTE_BSHD_BSHD_BSHD = 9
+    NVTE_T3HD = 10
+    NVTE_TH3D = 11
+    NVTE_THD_T2HD = 12
+    NVTE_THD_TH2D = 13
+    NVTE_THD_THD_THD = 14
+    NVTE_SBHD_BSHD_BSHD = 15
+    NVTE_BSHD_SBHD_SBHD = 16
+    NVTE_THD_BSHD_BSHD = 17
+    NVTE_THD_SBHD_SBHD = 18
+    NVTE_Paged_KV_BSHD_BSHD_BSHD = 19
+    NVTE_Paged_KV_BSHD_SBHD_SBHD = 20
+    NVTE_Paged_KV_SBHD_BSHD_BSHD = 21
+    NVTE_Paged_KV_SBHD_SBHD_SBHD = 22
+    NVTE_Paged_KV_THD_BSHD_BSHD = 23
+    NVTE_Paged_KV_THD_SBHD_SBHD = 24
+
+class CommOverlapType(IntEnum):
+    RS = 0
+    AG = 1
+
+class CommOverlapAlgo(IntEnum):
+    BULK_OVERLAP_AG = 0
+    BULK_OVERLAP_RS = 1
+    SPLIT_PIPELINED_AG_P2P = 2
+    SPLIT_PIPELINED_RS = 3
+    SPLIT_PIPELINED_RS_P2P = 4
+    ATOMIC_GEMM_RS = 5
+    ATOMIC_GEMM_AG_P2P = 6
+    ATOMIC_GEMM_RS_P2P = 7
+    EXTERNAL_BULK_OVERLAP_AG = 8
+
+class FP8TensorMeta:
+    def __init__(self):
+        self.scale: Optional[torch.Tensor] = None
+        self.scale_inv: Optional[torch.Tensor] = None
+        self.amax_history: Optional[torch.Tensor] = None
+
+class CommGemmOverlapAlgoConfig:
+    def __init__(self, *args, **kwargs):
+        pass
+
+class FusedAdamCUDAKernel:
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError(
+            "FusedAdamCUDAKernel requires CUDA extensions. "
+            "Not supported in FL mode."
+        )
+
+class FusedSGDCUDAKernel:
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError(
+            "FusedSGDCUDAKernel requires CUDA extensions. "
+            "Not supported in FL mode."
+        )
+
+class CommOverlapHelper:
+    def __init__(self, world_group=None, intra_node_group=None):
+        self.world_group = world_group
+        self.intra_node_group = intra_node_group
+
+class CommOverlap:
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError(
+            "CommOverlap should be created via backend.create_comm_overlap(). "
+            "Direct instantiation is not supported in FL mode."
+        )
+
+class CommOverlapP2P:
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError(
+            "CommOverlapP2P should be created via backend.create_comm_overlap_p2p(). "
+            "Direct instantiation is not supported in FL mode."
+        )
+
+class TEFLBackendBase(ABC):
+    @abstractmethod
+    def is_available(self) -> bool:
+        raise NotImplementedError
+
+    def get_flash_attention_class(self) -> Type["FlashAttentionBase"]:
+        raise NotImplementedError
+
+    def quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        output: Optional[torch.Tensor] = None,
+        noop: Optional[torch.Tensor] = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dequantize(
+        self,
+        input: torch.Tensor,
+        otype: torch.dtype,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def bgrad_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        raise NotImplementedError
+
+    def generic_gemm(
+        self,
+        A: torch.Tensor,
+        transA: bool,
+        B: torch.Tensor,
+        transB: bool,
+        D: torch.Tensor,
+        quantizer: Any,
+        output_dtype: torch.dtype,
+        bias: Optional[torch.Tensor],
+        bias_type: Any,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[Any] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def te_general_grouped_gemm(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def gelu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def geglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def qgelu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def qgeglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def relu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def reglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def srelu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def sreglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def silu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def swiglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def clamped_swiglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dgelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dgeglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dqgelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dqgeglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def drelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dreglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dsrelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dsreglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dsilu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def clamped_dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dbias_dgelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        raise NotImplementedError
+
+    def dbias_dsilu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        raise NotImplementedError
+
+    def dbias_drelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        raise NotImplementedError
+
+    def dbias_dqgelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        raise NotImplementedError
+
+    def dbias_dsrelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        raise NotImplementedError
+
+    def layernorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    def layernorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    def rmsnorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+        raise NotImplementedError
+
+    def rmsnorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+        eps: float = 1e-5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    def rmsnorm_bwd_add(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def multi_tensor_quantize(
+        self,
+        tensor_list: List[torch.Tensor],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        raise NotImplementedError
+
+    def split_quantize(
+        self,
+        tensor: torch.Tensor,
+        split_sections: List[int],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        raise NotImplementedError
+
+    def moe_permute_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError
+
+    def moe_permute_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError
+
+    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError
+
+    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError
+
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def scaled_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def get_fused_attn_backend(
+        self,
+        *args,
+        **kwargs,
+    ) -> int:
+        raise NotImplementedError
+
+    def fused_attn_fwd(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_attn_bwd(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fa_prepare_fwd(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fa_prepare_bwd(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def copy_to_kv_cache(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def convert_thd_to_bshd(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def convert_bshd_to_thd(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_rope_forward(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_rope_backward(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_qkv_rope_forward(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_qkv_rope_backward(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_topk_with_score_function_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        num_groups: int,
+        group_topk: int,
+        scaling_factor: float,
+        score_function: Any,
+        expert_bias: Optional[torch.Tensor],
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_topk_with_score_function_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        routing_map: torch.Tensor,
+        intermediate_output: torch.Tensor,
+        grad_probs: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        scaling_factor: float,
+        score_function: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_score_for_moe_aux_loss_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_score_for_moe_aux_loss_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        intermediate_output: torch.Tensor,
+        grad_scores: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_moe_aux_loss_fwd(
+        self,
+        probs: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        total_num_tokens: int,
+        num_experts: int,
+        num_rows: int,
+        num_cols: int,
+        topk: int,
+        coeff: float,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_moe_aux_loss_bwd(
+        self,
+        Const_buf: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        num_rows: int,
+        num_cols: int,
+        grad_aux_loss: torch.Tensor,
+    ) -> Any:
+        raise NotImplementedError
+
+    def dropout_fwd(
+        self,
+        input: torch.Tensor,
+        dropout_probability: float,
+        out: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    def dropout_bwd(
+        self,
+        grad_output: torch.Tensor,
+        mask: torch.Tensor,
+        dropout_probability: float,
+        grad_input: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def fp8_transpose(
+        self,
+        input: torch.Tensor,
+        dtype: Any,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError
+
+    def swap_first_dims(
+        self,
+        tensor: torch.Tensor,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError
+
+    def compute_amax(
+        self,
+        input: torch.Tensor,
+        amax: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError
+
+    def fused_amax_and_scale_update_after_reduction(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def fp8_block_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        raise NotImplementedError
+
+    def fp8_block_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: Any,
+    ) -> None:
+        raise NotImplementedError
+
+    def fused_multi_row_padding(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def fused_multi_row_unpadding(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def get_cublasLt_version(self) -> int:
+        raise NotImplementedError
+
+    def get_cudnn_version(self) -> int:
+        raise NotImplementedError
+
+    def get_num_cublas_streams(self) -> int:
+        raise NotImplementedError
+
+    def thd_read_half_tensor(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def thd_second_half_lse_correction(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def thd_read_second_half_lse(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def thd_out_correction(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def thd_grad_correction(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def thd_get_partitioned_indices(
+        self,
+        *args,
+        **kwargs,
+    ) -> Any:
+        raise NotImplementedError
+
+    def init_nvshmem_backend(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def create_nvshmem_tensor(
+        self,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def nvshmem_send_on_current_stream(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvshmem_wait_on_current_stream(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvshmem_finalize(self) -> None:
+        raise NotImplementedError
+
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
+        raise NotImplementedError
+
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        raise NotImplementedError
+
+    def multi_tensor_unscale_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        raise NotImplementedError
+
+    def multi_tensor_adam(
+        self,
+        chunk_size: int = None,
+        noop_flag: torch.Tensor = None,
+        tensor_lists: List[List[torch.Tensor]] = None,
+        lr: float = None,
+        beta1: float = None,
+        beta2: float = None,
+        eps: float = None,
+        step: int = None,
+        mode: int = None,
+        bias_correction: int = None,
+        weight_decay: float = None,
+    ):
+        raise NotImplementedError
+
+    def multi_tensor_adam_param_remainder(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def multi_tensor_adam_fp8(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def multi_tensor_adam_capturable(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def multi_tensor_adam_capturable_master(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def multi_tensor_sgd(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def multi_tensor_compute_scale_and_scale_inv(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def bulk_overlap_ag_with_external_gemm(
+        self,
+        allgather_communicator: Any,
+        send_stream: Any,
+        recv_stream: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        raise NotImplementedError
+
+    def create_comm_overlap_helper(
+        self,
+        world_group: Optional[Any] = None,
+        intra_node_group: Optional[Any] = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def create_comm_overlap(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        num_splits: int = 3,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 2,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 16,
+        set_sm_margin: bool = True,
+        atomic_gemm: bool = False,
+        rs_overlap_first_gemm: bool = False,
+    ) -> Any:
+        raise NotImplementedError
+
+    def create_comm_overlap_p2p(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        comm_type: Any,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 1,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 1,
+        set_sm_margin: bool = False,
+        atomic_gemm: bool = False,
+        use_ce: bool = True,
+        aggregate: bool = False,
+    ) -> Any:
+        raise NotImplementedError
+
+class FlashAttentionBase(torch.nn.Module, ABC):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.softmax_scale = softmax_scale
+        self.attention_dropout = attention_dropout
+        self.attention_dropout_ctx = attention_dropout_ctx or nullcontext
+        self.attention_type = attention_type
+        self.layer_number = 1 if layer_number is None else layer_number
+        self.deterministic = deterministic
+
+    def forward(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Subclasses must implement forward()")
+
+    @property
+    def backend_name(self) -> str:
+        return self.__class__.__name__
+
+
+class TEFLModule:
+    def __init__(self, manager=None):
+        """
+        Initialize TEFLModule.
+
+        Args:
+            manager: OpManager instance for operator dispatch.
+                       If None, will use the global default OpManager.
+        """
+        # Import here to avoid circular dependency
+        from .manager import get_default_manager
+        from .logger_manager import get_logger
+
+        self._manager = manager if manager is not None else get_default_manager()
+        self._logger = get_logger()
+
+        self.DType = DType
+        self.Float8BlockScaleTensorFormat = Float8BlockScaleTensorFormat
+        self.FP8FwdTensors = FP8FwdTensors
+        self.FP8BwdTensors = FP8BwdTensors
+        self.FP8TensorMeta = FP8TensorMeta
+        self.NVTE_Activation_Type = NVTE_Activation_Type
+        self.NVTE_Bias_Type = NVTE_Bias_Type
+        self.NVTE_Mask_Type = NVTE_Mask_Type
+        self.NVTE_Softmax_Type = NVTE_Softmax_Type
+        self.NVTE_Fused_Attn_Backend = NVTE_Fused_Attn_Backend
+        self.NVTE_QKV_Format = NVTE_QKV_Format
+        self.NVTE_QKV_Layout = NVTE_QKV_Layout
+        self.CommOverlapType = CommOverlapType
+        self.CommOverlapAlgo = CommOverlapAlgo
+        self.CommGemmOverlapRole = CommGemmOverlapRole
+
+        self.CommOverlapHelper = CommOverlapHelper
+        self.CommOverlap = CommOverlap
+        self.CommOverlapP2P = CommOverlapP2P
+        self.CommGemmOverlapAlgoConfig = CommGemmOverlapAlgoConfig
+
+        self.FusedAdamCUDAKernel = FusedAdamCUDAKernel
+        self.FusedSGDCUDAKernel = FusedSGDCUDAKernel
+
+    def __getattr__(self, name: str) -> Any:
+        """
+        Dynamically resolve operators through OpManager.
+        """
+        if name.startswith('_'):
+            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+
+        # Verify the operator exists before returning the bound call method
+        try:
+            self._manager.ensure_initialized()
+            available_ops = self._manager.registry.list_operators()
+            if name not in available_ops:
+                raise AttributeError(
+                    f"Operator '{name}' not found. "
+                    f"Available operators: {available_ops}"
+                )
+        except RuntimeError as e:
+            # Re-raise as AttributeError for better error messages
+            raise AttributeError(
+                f"Error accessing operator '{name}': {e}"
+            ) from e
+
+        # Return a bound call method for this operator
+        import functools
+        return functools.partial(self._manager.call, name)
+
+    def __dir__(self):
+        module_attrs = [
+            'DType', 'Float8BlockScaleTensorFormat', 'FP8FwdTensors', 'FP8BwdTensors',
+            'FP8TensorMeta', 'NVTE_Activation_Type', 'NVTE_Bias_Type', 'NVTE_Mask_Type',
+            'NVTE_Softmax_Type', 'NVTE_Fused_Attn_Backend', 'NVTE_QKV_Format', 'NVTE_QKV_Layout',
+            'CommOverlapType', 'CommOverlapAlgo', 'CommGemmOverlapRole',
+            'CommOverlapHelper', 'CommOverlap', 'CommOverlapP2P', 'CommGemmOverlapAlgoConfig',
+            'FusedAdamCUDAKernel', 'FusedSGDCUDAKernel'
+        ]
+
+        # Add operator names from OpManager's registry
+        op_attrs = self._manager.registry.list_operators()
+
+        return list(set(module_attrs + op_attrs))
+
+    def __getitem__(self, key: str):
+        return self.__getattr__(key)
+
+    @property
+    def __all__(self):
+        return self.__dir__()
+
+    def flash_attention(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> "FlashAttentionBase":
+        """
+        Get FlashAttention implementation through OpManager.
+        """
+        # Get the flash attention class getter through OpManager.call
+        # This provides the same fallback support and logging as other operators
+        flash_attn_class = self._manager.call("get_flash_attention_class")
+
+        # Instantiate and return the FlashAttention
+        return flash_attn_class(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
+
+    def __repr__(self) -> str:
+        op_count = len(self._manager.registry.list_operators())
+        return f"TEFLModule(operators={op_count}, manager={self._manager.__class__.__name__})"
+
+# Global singleton instance
+_global_tefl_module: Optional[TEFLModule] = None
+_tefl_module_lock = None
+
+def get_tefl_module() -> TEFLModule:
+    """
+    Get or create the global TEFLModule instance.
+
+    This function returns a singleton TEFLModule that uses the default OpManager.
+    The instance is created lazily on first access.
+
+    Returns:
+        The global TEFLModule instance
+
+    Example:
+        >>> import core as te_fl
+        >>> # Or explicitly:
+        >>> from core.base import get_tefl_module
+        >>> te_fl = get_tefl_module()
+        >>> result = te_fl.rmsnorm_fwd(input, weight, eps=1e-5)
+    """
+    global _global_tefl_module, _tefl_module_lock
+
+    if _global_tefl_module is None:
+        # Import here to avoid issues at module load time
+        import threading
+
+        if _tefl_module_lock is None:
+            _tefl_module_lock = threading.RLock()
+
+        with _tefl_module_lock:
+            if _global_tefl_module is None:
+                _global_tefl_module = TEFLModule()
+
+    return _global_tefl_module
+
+def reset_tefl_module() -> None:
+    """
+    Reset the global TEFLModule instance.
+
+    This is primarily useful for testing. After calling this function,
+    the next call to get_tefl_module() will create a fresh instance.
+
+    Warning:
+        This function is not thread-safe and should only be used in
+        single-threaded test environments.
+    """
+    global _global_tefl_module, _tefl_module_lock
+
+    if _tefl_module_lock is None:
+        import threading
+        _tefl_module_lock = threading.RLock()
+
+    with _tefl_module_lock:
+        _global_tefl_module = None
+
+# Backward compatibility functions
+def get_registry():
+    """
+    Get the global OpRegistry instance (via OpManager).
+
+    DEPRECATED: Use get_default_manager().registry instead.
+
+    This function is kept for backward compatibility with code that
+    expects the old API.
+
+    Returns:
+        The OpRegistry instance from the default OpManager
+
+    Example:
+        >>> from core.base import get_registry
+        >>> registry = get_registry()
+        >>> ops = registry.list_operators()
+    """
+    from .manager import get_default_manager
+    return get_default_manager().registry
+
+def get_manager():
+    """
+    Get the global OpManager instance.
+
+    This is the recommended way to access the OpManager.
+
+    Returns:
+        The default OpManager instance
+
+    Example:
+        >>> from core.base import get_manager
+        >>> manager = get_manager()
+        >>> impl_fn = manager.resolve("rmsnorm_fwd")
+    """
+    from .manager import get_default_manager
+    return get_default_manager()
+
+def reset_registry() -> None:
+    """
+    Reset the global OpManager and OpRegistry.
+
+    DEPRECATED: Use reset_default_manager() instead.
+
+    This function is kept for backward compatibility.
+    """
+    from .manager import reset_default_manager
+    reset_default_manager()
+    # Also reset the TEFLModule singleton since it depends on OpManager
+    reset_tefl_module()
diff --git a/transformer_engine/plugin/core/policy.py b/transformer_engine/plugin/core/policy.py
new file mode 100644
index 0000000000..9e4a196c3b
--- /dev/null
+++ b/transformer_engine/plugin/core/policy.py
@@ -0,0 +1,396 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from __future__ import annotations
+
+import contextvars
+import os
+import threading
+from dataclasses import dataclass, field
+from typing import Dict, FrozenSet, List, Optional, Set, Tuple
+
+from .types import BackendImplKind
+
+
+# Valid preference values for TE_FL_PREFER
+PREFER_DEFAULT = "flagos"
+PREFER_VENDOR = "vendor"
+PREFER_REFERENCE = "reference"
+
+VALID_PREFER_VALUES = frozenset({PREFER_DEFAULT, PREFER_VENDOR, PREFER_REFERENCE})
+
+
+@dataclass(frozen=True)
+class SelectionPolicy:
+    """
+    Policy for selecting operator implementations.
+
+    Attributes:
+        prefer: Which implementation kind to prefer. One of:
+            - "flagos": Prefer DEFAULT (FlagOS) implementations
+            - "vendor": Prefer VENDOR (CUDA) implementations
+            - "reference": Prefer REFERENCE (PyTorch) implementations
+        strict: If True, raise error when primary implementation fails
+        per_op_order: Per-operator custom selection order
+        deny_vendors: Set of vendor names to deny
+        allow_vendors: Set of vendor names to allow (whitelist)
+    """
+    prefer: str = PREFER_DEFAULT
+    strict: bool = False
+    per_op_order: Tuple[Tuple[str, Tuple[str, ...]], ...] = field(default_factory=tuple)
+
+    deny_vendors: FrozenSet[str] = field(default_factory=frozenset)
+    allow_vendors: Optional[FrozenSet[str]] = None
+
+    def __post_init__(self):
+        if self.prefer not in VALID_PREFER_VALUES:
+            raise ValueError(
+                f"Invalid prefer value: '{self.prefer}'. "
+                f"Must be one of: {', '.join(sorted(VALID_PREFER_VALUES))}"
+            )
+
+    @classmethod
+    def from_dict(
+        cls,
+        prefer: str = PREFER_DEFAULT,
+        strict: bool = False,
+        per_op_order: Optional[Dict[str, List[str]]] = None,
+        deny_vendors: Optional[Set[str]] = None,
+        allow_vendors: Optional[Set[str]] = None,
+    ) -> "SelectionPolicy":
+        per_op_tuple = tuple()
+        if per_op_order:
+            per_op_tuple = tuple(
+                (k, tuple(v)) for k, v in sorted(per_op_order.items())
+            )
+
+        return cls(
+            prefer=prefer.lower(),
+            strict=strict,
+            per_op_order=per_op_tuple,
+            deny_vendors=frozenset(deny_vendors) if deny_vendors else frozenset(),
+            allow_vendors=frozenset(allow_vendors) if allow_vendors else None,
+        )
+
+    @property
+    def per_op_order_dict(self) -> Dict[str, List[str]]:
+        """Get per_op_order as a mutable dict for easier access"""
+        return {k: list(v) for k, v in self.per_op_order}
+
+    def get_per_op_order(self, op_name: str) -> Optional[List[str]]:
+        """Get order for a specific operator"""
+        for name, order in self.per_op_order:
+            if name == op_name:
+                return list(order)
+        return None
+
+    def get_default_order(self) -> List[str]:
+        """Get the default selection order based on preference setting."""
+        if self.prefer == PREFER_REFERENCE:
+            return ["reference", "flagos", "vendor"]
+        elif self.prefer == PREFER_VENDOR:
+            return ["vendor", "flagos", "reference"]
+        else:  # PREFER_DEFAULT
+            return ["flagos", "vendor", "reference"]
+
+    def is_vendor_allowed(self, vendor_name: str) -> bool:
+        if vendor_name in self.deny_vendors:
+            return False
+        if self.allow_vendors is not None and vendor_name not in self.allow_vendors:
+            return False
+        return True
+
+    def fingerprint(self) -> str:
+        parts = [
+            f"prefer={self.prefer}",
+            f"st={int(self.strict)}",
+        ]
+
+        if self.allow_vendors:
+            parts.append(f"allow={','.join(sorted(self.allow_vendors))}")
+
+        if self.deny_vendors:
+            parts.append(f"deny={','.join(sorted(self.deny_vendors))}")
+
+        if self.per_op_order:
+            per_op_str = ";".join(
+                f"{k}={'|'.join(v)}" for k, v in self.per_op_order
+            )
+            parts.append(f"per={per_op_str}")
+
+        return ";".join(parts)
+
+    def __hash__(self) -> int:
+        return hash((
+            self.prefer,
+            self.strict,
+            self.per_op_order,
+            self.deny_vendors,
+            self.allow_vendors,
+        ))
+
+
+class PolicyManager:
+    _instance = None
+    _lock = threading.Lock()
+
+    def __init__(self):
+        if hasattr(self, '_policy_epoch'):
+            return
+
+        self._policy_epoch = 0
+        self._policy_epoch_lock = threading.Lock()
+        self._global_policy = None
+        self._global_policy_lock = threading.Lock()
+
+        self._policy_var = contextvars.ContextVar(
+            "te_fl_selection_policy",
+            default=None,
+        )
+
+    @classmethod
+    def get_instance(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = cls.__new__(cls)
+                    cls._instance.__init__()
+        return cls._instance
+
+    def get_policy_epoch(self) -> int:
+        return self._policy_epoch
+
+    def bump_policy_epoch(self) -> int:
+        with self._policy_epoch_lock:
+            self._policy_epoch += 1
+            return self._policy_epoch
+
+    def get_policy(self) -> SelectionPolicy:
+        ctx_policy = self._policy_var.get()
+        if ctx_policy is not None:
+            return ctx_policy
+
+        if self._global_policy is None:
+            with self._global_policy_lock:
+                if self._global_policy is None:
+                    self._global_policy = self._policy_from_env()
+        return self._global_policy
+
+    def set_global_policy(self, policy: SelectionPolicy) -> SelectionPolicy:
+        with self._global_policy_lock:
+            old_policy = self._global_policy
+            self._global_policy = policy
+            self.bump_policy_epoch()
+            return old_policy if old_policy else self._policy_from_env()
+
+    def reset_global_policy(self) -> None:
+        with self._global_policy_lock:
+            self._global_policy = None
+            self.bump_policy_epoch()
+
+    def create_policy_context(self, policy: SelectionPolicy):
+        return _PolicyContext(self, policy)
+
+    def _get_policy_var(self):
+        return self._policy_var
+
+    @staticmethod
+    def _parse_csv_set(value: str) -> Set[str]:
+        if not value:
+            return set()
+        return {x.strip() for x in value.split(",") if x.strip()}
+
+    @staticmethod
+    def _parse_per_op(value: str) -> Dict[str, List[str]]:
+        if not value:
+            return {}
+
+        result: Dict[str, List[str]] = {}
+        parts = [p.strip() for p in value.split(";") if p.strip()]
+
+        for part in parts:
+            if "=" not in part:
+                continue
+            op_name, order_str = part.split("=", 1)
+            op_name = op_name.strip()
+            order = [x.strip() for x in order_str.split("|") if x.strip()]
+            if op_name and order:
+                result[op_name] = order
+
+        return result
+
+    def _policy_from_env(self) -> SelectionPolicy:
+        # Priority: TE_FL_PREFER (highest) > TE_FL_PREFER_VENDOR (legacy)
+        #
+        # TE_FL_PREFER: Explicit preference by name (flagos, vendor, reference)
+        # TE_FL_PREFER_VENDOR: Legacy boolean flag (1=vendor, 0=flagos)
+
+        prefer_str = None
+
+        # 1. Check TE_FL_PREFER first (highest priority)
+        te_fl_prefer = os.environ.get("TE_FL_PREFER", "").strip().lower()
+        if te_fl_prefer:
+            if te_fl_prefer in VALID_PREFER_VALUES:
+                prefer_str = te_fl_prefer
+            else:
+                print(f"[WARNING] Invalid TE_FL_PREFER value: '{te_fl_prefer}'. "
+                      f"Valid values: {', '.join(sorted(VALID_PREFER_VALUES))}")
+
+        # 2. Fall back to TE_FL_PREFER_VENDOR (legacy)
+        if prefer_str is None:
+            prefer_vendor = os.environ.get("TE_FL_PREFER_VENDOR", "").strip()
+            if prefer_vendor == "1":
+                prefer_str = PREFER_VENDOR
+            elif prefer_vendor == "0":
+                prefer_str = PREFER_DEFAULT
+            else:
+                # Default behavior: prefer default (FlagOS)
+                prefer_str = PREFER_DEFAULT
+
+        strict = os.environ.get("TE_FL_STRICT", "0").strip() == "1"
+
+        deny_str = os.environ.get("TE_FL_DENY_VENDORS", "").strip()
+        deny_vendors = self._parse_csv_set(deny_str) if deny_str else None
+
+        allow_str = os.environ.get("TE_FL_ALLOW_VENDORS", "").strip()
+        allow_vendors = self._parse_csv_set(allow_str) if allow_str else None
+
+        per_op_str = os.environ.get("TE_FL_PER_OP", "").strip()
+        per_op_order = self._parse_per_op(per_op_str) if per_op_str else None
+
+        return SelectionPolicy.from_dict(
+            prefer=prefer_str,
+            strict=strict,
+            per_op_order=per_op_order,
+            deny_vendors=deny_vendors,
+            allow_vendors=allow_vendors,
+        )
+
+
+class _PolicyContext:
+
+    def __init__(self, manager: PolicyManager, policy: SelectionPolicy):
+        self._manager = manager
+        self._policy = policy
+        self._token: Optional[contextvars.Token] = None
+
+    def __enter__(self) -> "_PolicyContext":
+        policy_var = self._manager._get_policy_var()
+        self._token = policy_var.set(self._policy)
+        self._manager.bump_policy_epoch()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self._token is not None:
+            policy_var = self._manager._get_policy_var()
+            policy_var.reset(self._token)
+            self._manager.bump_policy_epoch()
+
+
+# Convenience functions for easier access
+def get_policy_epoch() -> int:
+    """Get the current policy epoch"""
+    return PolicyManager.get_instance().get_policy_epoch()
+
+
+def bump_policy_epoch() -> int:
+    """Bump the policy epoch and return the new value"""
+    return PolicyManager.get_instance().bump_policy_epoch()
+
+
+def get_policy() -> SelectionPolicy:
+    """Get the current effective policy (context or global)"""
+    return PolicyManager.get_instance().get_policy()
+
+
+def set_global_policy(policy: SelectionPolicy) -> SelectionPolicy:
+    """Set the global policy and return the old policy"""
+    return PolicyManager.get_instance().set_global_policy(policy)
+
+
+def reset_global_policy() -> None:
+    """Reset the global policy to environment defaults"""
+    PolicyManager.get_instance().reset_global_policy()
+
+
+def policy_from_env() -> SelectionPolicy:
+    """Create a SelectionPolicy from environment variables"""
+    return PolicyManager.get_instance()._policy_from_env()
+
+
+def policy_context(policy: SelectionPolicy) -> _PolicyContext:
+    """
+    Create a context manager to temporarily override the policy.
+
+    Example:
+        >>> with policy_context(my_policy):
+        ...     # Use my_policy in this context
+        ...     result = manager.resolve("op_name")
+    """
+    return _PolicyContext(PolicyManager.get_instance(), policy)
+
+
+# Convenience context managers
+def with_strict_mode() -> _PolicyContext:
+    """Context manager to enable strict mode"""
+    current = get_policy()
+    strict_policy = SelectionPolicy.from_dict(
+        prefer=current.prefer,
+        strict=True,
+        per_op_order={k: list(v) for k, v in current.per_op_order},
+        deny_vendors=set(current.deny_vendors),
+        allow_vendors=set(current.allow_vendors) if current.allow_vendors else None,
+    )
+    return policy_context(strict_policy)
+
+
+def with_preference(prefer: str) -> _PolicyContext:
+    """
+    Context manager to set implementation preference.
+
+    Args:
+        prefer: One of "flagos", "vendor", or "reference"
+
+    Example:
+        >>> with with_preference("vendor"):
+        ...     # Prefer vendor implementations in this context
+        ...     result = manager.resolve("op_name")
+    """
+    current = get_policy()
+    policy = SelectionPolicy.from_dict(
+        prefer=prefer,
+        strict=current.strict,
+        per_op_order={k: list(v) for k, v in current.per_op_order},
+        deny_vendors=set(current.deny_vendors),
+        allow_vendors=set(current.allow_vendors) if current.allow_vendors else None,
+    )
+    return policy_context(policy)
+
+
+def with_allowed_vendors(*vendors: str) -> _PolicyContext:
+    """Context manager to set allowed vendors whitelist"""
+    current = get_policy()
+    policy = SelectionPolicy.from_dict(
+        prefer=current.prefer,
+        strict=current.strict,
+        per_op_order={k: list(v) for k, v in current.per_op_order},
+        deny_vendors=set(current.deny_vendors),
+        allow_vendors=set(vendors),
+    )
+    return policy_context(policy)
+
+
+def with_denied_vendors(*vendors: str) -> _PolicyContext:
+    """Context manager to add denied vendors to blacklist"""
+    current = get_policy()
+    denied = set(current.deny_vendors)
+    denied.update(vendors)
+    policy = SelectionPolicy.from_dict(
+        prefer=current.prefer,
+        strict=current.strict,
+        per_op_order={k: list(v) for k, v in current.per_op_order},
+        deny_vendors=denied,
+        allow_vendors=set(current.allow_vendors) if current.allow_vendors else None,
+    )
+    return policy_context(policy)
diff --git a/transformer_engine/plugin/core/registry.py b/transformer_engine/plugin/core/registry.py
new file mode 100644
index 0000000000..bd08241b3b
--- /dev/null
+++ b/transformer_engine/plugin/core/registry.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from __future__ import annotations
+
+import threading
+from dataclasses import dataclass
+from typing import Dict, List, Sequence
+
+from .types import OpImpl
+
+
+@dataclass
+class OpRegistrySnapshot:
+    """Immutable snapshot of operator registry state"""
+    impls_by_op: Dict[str, List[OpImpl]]
+
+
+class OpRegistry:
+    """
+    Thread-safe registry for operator implementations.
+
+    This registry stores operator implementations indexed by op_name and impl_id.
+    Each operator can have multiple implementations from different backends/vendors.
+    """
+
+    def __init__(self) -> None:
+        self._lock = threading.RLock()
+        # Structure: {op_name: {impl_id: OpImpl}}
+        self._impls_by_op: Dict[str, Dict[str, OpImpl]] = {}
+
+    def register_impl(self, impl: OpImpl) -> None:
+        """
+        Register a single operator implementation.
+
+        Args:
+            impl: OpImpl instance to register
+
+        Raises:
+            ValueError: If impl_id is already registered for this op_name
+        """
+        with self._lock:
+            by_id = self._impls_by_op.setdefault(impl.op_name, {})
+            if impl.impl_id in by_id:
+                raise ValueError(
+                    f"Duplicate impl_id '{impl.impl_id}' for op='{impl.op_name}'. "
+                    f"Existing: {by_id[impl.impl_id]}, New: {impl}"
+                )
+            by_id[impl.impl_id] = impl
+
+    def register_many(self, impls: Sequence[OpImpl]) -> None:
+        """
+        Register multiple operator implementations.
+
+        Args:
+            impls: Sequence of OpImpl instances to register
+        """
+        for impl in impls:
+            self.register_impl(impl)
+
+    def snapshot(self) -> OpRegistrySnapshot:
+        """
+        Create an immutable snapshot of current registry state.
+
+        Returns:
+            OpRegistrySnapshot with all registered implementations
+        """
+        with self._lock:
+            impls_by_op = {
+                op: list(by_id.values())
+                for op, by_id in self._impls_by_op.items()
+            }
+        return OpRegistrySnapshot(impls_by_op=impls_by_op)
+
+    def get_implementations(self, op_name: str) -> List[OpImpl]:
+        """
+        Get all implementations for a specific operator.
+
+        Args:
+            op_name: Name of the operator
+
+        Returns:
+            List of OpImpl for the operator (empty if not found)
+        """
+        with self._lock:
+            by_id = self._impls_by_op.get(op_name, {})
+            return list(by_id.values())
+
+    def get_implementation(self, op_name: str, impl_id: str) -> OpImpl | None:
+        """
+        Get a specific implementation by op_name and impl_id.
+
+        Args:
+            op_name: Name of the operator
+            impl_id: Implementation ID
+
+        Returns:
+            OpImpl if found, None otherwise
+        """
+        with self._lock:
+            by_id = self._impls_by_op.get(op_name, {})
+            return by_id.get(impl_id)
+
+    def list_operators(self) -> List[str]:
+        """
+        List all registered operator names.
+
+        Returns:
+            List of operator names
+        """
+        with self._lock:
+            return list(self._impls_by_op.keys())
+
+    def clear(self) -> None:
+        """Clear all registered implementations"""
+        with self._lock:
+            self._impls_by_op.clear()
diff --git a/transformer_engine/plugin/core/types.py b/transformer_engine/plugin/core/types.py
new file mode 100644
index 0000000000..e5508320f2
--- /dev/null
+++ b/transformer_engine/plugin/core/types.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Callable, Optional, Set
+
+
+class BackendImplKind(str, Enum):
+    DEFAULT = "flagos"
+    REFERENCE = "reference"
+    VENDOR = "vendor"
+
+    def __str__(self) -> str:
+        return self.value
+
+
+@dataclass(frozen=True)
+class OpImpl:
+    op_name: str
+    impl_id: str
+    kind: BackendImplKind
+    fn: Callable[..., Any]
+    vendor: Optional[str] = None
+    priority: int = 0
+    supported_dtypes: Optional[Set[str]] = None
+    min_arch: Optional[str] = None
+
+    def __post_init__(self):
+        if self.kind == BackendImplKind.VENDOR and not self.vendor:
+            raise ValueError(f"OpImpl with kind=VENDOR must specify vendor name: {self.impl_id}")
+
+    def is_available(self) -> bool:
+        avail_fn = getattr(self.fn, "_is_available", None)
+        if callable(avail_fn):
+            try:
+                return bool(avail_fn())
+            except Exception:
+                return False
+        return True
+
+
+TOKEN_PATTERNS = {
+    "flagos": lambda impl: impl.kind == BackendImplKind.DEFAULT,
+    "reference": lambda impl: impl.kind == BackendImplKind.REFERENCE,
+    "vendor": lambda impl: impl.kind == BackendImplKind.VENDOR,
+}
+
+
+def match_token(impl: OpImpl, token: str) -> bool:
+    if token in TOKEN_PATTERNS:
+        return TOKEN_PATTERNS[token](impl)
+
+    if token.startswith("vendor:"):
+        vendor_name = token.split(":", 1)[1]
+        return impl.kind == BackendImplKind.VENDOR and impl.vendor == vendor_name
+
+    if token.startswith("impl:"):
+        impl_id = token.split(":", 1)[1]
+        return impl.impl_id == impl_id
+
+    return False
diff --git a/transformer_engine/plugin/examples/README.md b/transformer_engine/plugin/examples/README.md
new file mode 100644
index 0000000000..318de59487
--- /dev/null
+++ b/transformer_engine/plugin/examples/README.md
@@ -0,0 +1,181 @@
+# TE-FL Custom Backend Examples
+
+This directory contains examples demonstrating two ways to add custom backends.
+
+## Two Approaches
+
+| Approach | Use Case | Example File |
+|----------|----------|--------------|
+| **In-tree** | Open source contribution, direct integration | `example_intree.py` |
+| **Out-of-tree** | Closed-source / third-party plugin, standalone package | `example_outtree.py` |
+
+## Quick Start
+
+```bash
+cd transformer_engine/plugin/examples
+
+# In-tree approach
+python example_intree.py
+
+# Out-of-tree approach
+python example_outtree.py
+```
+
+## In-tree Approach (3 Steps)
+
+```python
+from transformer_engine.plugin.core import (
+    OpRegistry, OpManager, OpImpl, BackendImplKind
+)
+
+# 1. Define your operator implementation
+def my_rmsnorm(input, weight, eps=1e-5, **kwargs):
+    variance = input.pow(2).mean(-1, keepdim=True)
+    return input * torch.rsqrt(variance + eps) * weight, torch.rsqrt(variance + eps)
+
+# 2. Register to Registry
+registry = OpRegistry()
+registry.register_impl(OpImpl(
+    op_name="rmsnorm_fwd",
+    impl_id="vendor.mybackend",
+    kind=BackendImplKind.VENDOR,
+    vendor="mybackend",
+    fn=my_rmsnorm,
+    priority=200,
+))
+
+# 3. Call via Manager
+manager = OpManager(registry)
+output, rsigma = manager.call("rmsnorm_fwd", input, weight)
+```
+
+## Out-of-tree Approach (Plugin Package)
+
+### Plugin Package Structure
+
+```
+my_vendor_plugin/
+├── __init__.py      # Contains register(registry) function
+└── setup.py         # or pyproject.toml
+```
+
+### \_\_init\_\_.py
+
+```python
+from transformer_engine.plugin.core import OpImpl, BackendImplKind
+
+def my_rmsnorm(input, weight, eps=1e-5, **kwargs):
+    # Your implementation
+    ...
+
+def register(registry):
+    """Called automatically by TE-FL"""
+    registry.register_impl(OpImpl(
+        op_name="rmsnorm_fwd",
+        impl_id="vendor.myvendor",
+        kind=BackendImplKind.VENDOR,
+        vendor="myvendor",
+        fn=my_rmsnorm,
+        priority=200,
+    ))
+```
+
+### Loading Methods
+
+```bash
+# Method 1: Environment variable
+export TE_FL_PLUGIN_MODULES=my_vendor_plugin
+python your_script.py
+
+# Method 2: pip install (requires entry_points configuration)
+pip install my-vendor-plugin
+python your_script.py
+```
+
+## Environment Variables
+
+### Backend Selection
+
+| Variable | Description | Values | Default |
+|----------|-------------|--------|---------|
+| `TE_FL_PREFER` | Preferred backend type (highest priority) | `flagos` / `vendor` / `reference` | `flagos` |
+| `TE_FL_PREFER_VENDOR` | Prefer vendor backend (legacy, lower priority than `TE_FL_PREFER`) | `1` = prefer vendor, `0` = prefer flagos | `0` |
+| `TE_FL_STRICT` | Strict mode - raise error if preferred implementation fails instead of fallback | `1` = strict, `0` = allow fallback | `0` |
+
+### Vendor Filtering
+
+| Variable | Description | Example |
+|----------|-------------|---------|
+| `TE_FL_ALLOW_VENDORS` | Whitelist of allowed vendors (comma-separated) | `nvidia,amd` |
+| `TE_FL_DENY_VENDORS` | Blacklist of denied vendors (comma-separated) | `vendor_a,vendor_b` |
+
+### Per-Operator Configuration
+
+| Variable | Description | Example |
+|----------|-------------|---------|
+| `TE_FL_PER_OP` | Per-operator backend ordering | `rmsnorm_fwd=vendor:acme\|flagos;rope_fwd=flagos\|reference` |
+
+Format: `op_name=backend1|backend2;op_name2=backend3|backend4`
+
+### Plugin Discovery
+
+| Variable | Description | Example |
+|----------|-------------|---------|
+| `TE_FL_PLUGIN_MODULES` | Plugin modules to load (comma-separated) | `my_plugin,another_plugin` |
+
+### Build Configuration
+
+| Variable | Description | Values | Default |
+|----------|-------------|--------|---------|
+| `TE_FL_SKIP_CUDA` | Skip CUDA backend (both build-time and runtime) | `1` = skip, `0` = enable | `0` |
+| `CUDA_HOME` | CUDA installation path | `/usr/local/cuda` | Auto-detected |
+| `CUDA_PATH` | Alternative CUDA path variable | `/usr/local/cuda` | Auto-detected |
+
+### Logging
+
+| Variable | Description | Values | Default |
+|----------|-------------|--------|---------|
+| `TEFL_LOG_LEVEL` | Log level for TE-FL | `DEBUG` / `INFO` / `WARNING` / `ERROR` | `INFO` |
+
+## Examples
+
+### Prefer vendor backend
+```bash
+export TE_FL_PREFER=vendor
+python your_script.py
+```
+
+### Only allow specific vendors
+```bash
+export TE_FL_ALLOW_VENDORS=nvidia,acme
+python your_script.py
+```
+
+### Custom per-operator ordering
+```bash
+# Use acme vendor for rmsnorm, flagos for others
+export TE_FL_PER_OP="rmsnorm_fwd=vendor:acme|flagos"
+python your_script.py
+```
+
+### Skip CUDA and use FlagOS only
+```bash
+export TE_FL_SKIP_CUDA=1
+export TE_FL_PREFER=flagos
+python your_script.py
+```
+
+### Enable debug logging
+```bash
+export TEFL_LOG_LEVEL=DEBUG
+python your_script.py
+```
+
+## Expected Output
+
+When running, you should see logs like:
+
+```
+[TE-FL manager.py:133 INFO] Registered impl_ids: ['default.flagos', 'reference.torch', 'vendor.mybackend']
+[TE-FL manager.py:390 INFO] Op 'rmsnorm_fwd' using 'vendor.mybackend' (kind=vendor, vendor=mybackend)
+```
diff --git a/transformer_engine/plugin/examples/example_intree.py b/transformer_engine/plugin/examples/example_intree.py
new file mode 100644
index 0000000000..5c2052bb00
--- /dev/null
+++ b/transformer_engine/plugin/examples/example_intree.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Example: In-tree Backend Registration
+
+Use case: Add implementation directly to the codebase (open source contribution)
+
+Run:
+    python example_intree.py
+"""
+
+import torch
+from transformer_engine.plugin.core import (
+    OpRegistry,
+    OpManager,
+    OpImpl,
+    BackendImplKind,
+    SelectionPolicy,
+    set_global_policy,
+)
+
+
+# ============================================================
+# Step 1: Define your operator implementation
+# ============================================================
+def my_rmsnorm_fwd(input, weight, eps=1e-5, **kwargs):
+    """Custom RMSNorm implementation"""
+    print("  >>> [MyBackend] my_rmsnorm_fwd called!")
+    variance = input.pow(2).mean(-1, keepdim=True)
+    output = input * torch.rsqrt(variance + eps) * weight
+    rsigma = torch.rsqrt(variance + eps)
+    return output, rsigma
+
+
+# Optional: Define availability check function
+my_rmsnorm_fwd._is_available = lambda: True
+
+
+# ============================================================
+# Step 2: Register to Registry
+# ============================================================
+registry = OpRegistry()
+
+registry.register_impl(OpImpl(
+    op_name="rmsnorm_fwd",           # Operator name
+    impl_id="vendor.mybackend",      # Implementation ID (unique identifier)
+    kind=BackendImplKind.VENDOR,     # Type: VENDOR / DEFAULT / REFERENCE
+    vendor="mybackend",              # Vendor name
+    fn=my_rmsnorm_fwd,               # Implementation function
+    priority=200,                    # Priority (higher = preferred)
+))
+
+
+# ============================================================
+# Step 3: Create Manager and call operator
+# ============================================================
+manager = OpManager(registry)
+
+# Set policy: prefer vendor backend
+set_global_policy(SelectionPolicy(prefer="vendor"))
+
+# Prepare test data
+input_tensor = torch.randn(2, 4, 8)
+weight = torch.ones(8)
+
+# Call operator - will automatically select highest priority implementation
+print("\nCalling rmsnorm_fwd:")
+output, rsigma = manager.call("rmsnorm_fwd", input_tensor, weight, eps=1e-5)
+
+print(f"\nInput shape: {input_tensor.shape}")
+print(f"Output shape: {output.shape}")
+print("\nSuccess! Your custom backend was used.")
diff --git a/transformer_engine/plugin/examples/example_outtree.py b/transformer_engine/plugin/examples/example_outtree.py
new file mode 100644
index 0000000000..92eea892a6
--- /dev/null
+++ b/transformer_engine/plugin/examples/example_outtree.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Example: Out-of-tree Backend Registration
+
+Use case: Standalone plugin package (closed-source / third-party)
+
+Run:
+    # Method 1: Load plugin module via environment variable
+    TE_FL_PLUGIN_MODULES=my_vendor_plugin python example_outtree.py
+
+    # Method 2: Install plugin package with entry_points via pip
+    pip install my-vendor-plugin
+    python example_outtree.py
+"""
+
+import sys
+import types
+import torch
+
+
+# ============================================================
+# Step 1: Create plugin module (simulates a pip-installed package)
+# ============================================================
+def create_plugin_module():
+    """
+    Simulate a standalone plugin module.
+
+    In practice, this code would be in a separate pip package, e.g.:
+    - my_vendor_plugin/__init__.py
+    """
+
+    # Create module
+    plugin_module = types.ModuleType("my_vendor_plugin")
+
+    # Define operator implementation
+    def my_rmsnorm_fwd(input, weight, eps=1e-5, **kwargs):
+        """Custom RMSNorm implementation"""
+        print("  >>> [MyVendorPlugin] my_rmsnorm_fwd called!")
+        variance = input.pow(2).mean(-1, keepdim=True)
+        output = input * torch.rsqrt(variance + eps) * weight
+        rsigma = torch.rsqrt(variance + eps)
+        return output, rsigma
+
+    my_rmsnorm_fwd._is_available = lambda: True
+
+    # Define register function (must have 'register' or 'te_fl_register' function)
+    def register(registry):
+        """
+        Plugin registration function - called automatically by TE-FL.
+
+        Args:
+            registry: OpRegistry instance
+        """
+        from transformer_engine.plugin.core import (
+            OpImpl,
+            BackendImplKind,
+        )
+
+        print("[MyVendorPlugin] Registering operator implementations...")
+
+        registry.register_impl(OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="vendor.myvendor",
+            kind=BackendImplKind.VENDOR,
+            vendor="myvendor",
+            fn=my_rmsnorm_fwd,
+            priority=200,
+        ))
+
+        print("[MyVendorPlugin] Registration complete!")
+
+    # Add register function to module
+    plugin_module.register = register
+
+    return plugin_module
+
+
+# ============================================================
+# Step 2: Register plugin module to sys.modules (simulates pip install)
+# ============================================================
+plugin = create_plugin_module()
+sys.modules["my_vendor_plugin"] = plugin
+
+
+# ============================================================
+# Step 3: Set environment variables for TE-FL auto-discovery
+# ============================================================
+import os
+os.environ["TE_FL_PLUGIN_MODULES"] = "my_vendor_plugin"
+os.environ["TE_FL_PREFER"] = "vendor"  # Prefer vendor backend
+
+
+# ============================================================
+# Step 4: Import TE-FL (will auto-discover and load plugin)
+# ============================================================
+from transformer_engine.plugin.core import (
+    get_manager,
+    reset_default_manager,
+)
+
+# Reset manager to trigger plugin discovery
+reset_default_manager()
+manager = get_manager()
+
+
+# ============================================================
+# Step 5: Call operator
+# ============================================================
+input_tensor = torch.randn(2, 4, 8)
+weight = torch.ones(8)
+
+print("\nCalling rmsnorm_fwd:")
+output, rsigma = manager.call("rmsnorm_fwd", input_tensor, weight, eps=1e-5)
+
+print(f"\nInput shape: {input_tensor.shape}")
+print(f"Output shape: {output.shape}")
+print("\nSuccess! Your out-of-tree plugin was loaded and used.")
diff --git a/transformer_engine/plugin/test_utils.py b/transformer_engine/plugin/test_utils.py
new file mode 100644
index 0000000000..8ce836e41e
--- /dev/null
+++ b/transformer_engine/plugin/test_utils.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import torch
+import numpy as np
+from typing import List, Dict, Callable, Any, Optional
+
+
+def get_available_backends() -> List[str]:
+    """
+    Get list of available backends by extracting unique impl_ids from OpRegistry.
+
+    Returns impl_id prefixes (e.g., "default.flagos" -> "flagos")
+    """
+    try:
+        from transformer_engine.plugin.core import get_registry
+
+        registry = get_registry()
+        all_impls = []
+        for op_name in registry.list_operators():
+            all_impls.extend(registry.get_implementations(op_name))
+
+        # Extract unique impl_id prefixes (e.g., "default.flagos" -> "flagos")
+        impl_ids = set()
+        for impl in all_impls:
+            # impl_id format: "kind.name" (e.g., "default.flagos", "vendor.cuda")
+            parts = impl.impl_id.split('.', 1)
+            if len(parts) == 2:
+                impl_ids.add(parts[1])  # Get the "name" part
+            else:
+                impl_ids.add(impl.impl_id)
+
+        return sorted(impl_ids)
+    except Exception as e:
+        print(f"Warning: Could not load backends: {e}")
+        import traceback
+        traceback.print_exc()
+        return []
+
+
+def get_backend(name: str):
+    """
+    Get a backend-like object that dispatches to a specific implementation.
+
+    Args:
+        name: Backend name (e.g., "cuda", "flagos", "torch")
+
+    Returns:
+        A wrapper object that calls the specific backend implementation
+    """
+    from transformer_engine.plugin.core import get_registry
+    from transformer_engine.plugin.core.logger_manager import get_logger
+    import functools
+
+    logger = get_logger()
+
+    class BackendWrapper:
+        """Wrapper that calls specific backend implementations"""
+
+        def __init__(self, backend_name: str):
+            self.backend_name = backend_name
+            self.registry = get_registry()
+            self._called_ops = set()  # Track which ops have been called (for logging)
+
+        def _find_impl(self, op_name: str):
+            """Find implementation matching the backend name"""
+            impls = self.registry.get_implementations(op_name)
+
+            # Try to find implementation matching backend_name
+            # Match against impl_id suffix (e.g., "vendor.cuda" matches "cuda")
+            for impl in impls:
+                if impl.impl_id.endswith(f".{self.backend_name}") or impl.impl_id == self.backend_name:
+                    if impl.is_available():
+                        return impl
+                    else:
+                        raise RuntimeError(
+                            f"Implementation '{impl.impl_id}' for op '{op_name}' is not available"
+                        )
+
+            raise NotImplementedError(
+                f"No implementation found for op '{op_name}' with backend '{self.backend_name}'"
+            )
+
+        def __getattr__(self, op_name: str):
+            """Dynamically resolve operator to specific backend implementation"""
+            impl = self._find_impl(op_name)
+
+            # Log on first call to this op for this backend
+            if op_name not in self._called_ops:
+                self._called_ops.add(op_name)
+                logger.info(
+                    f"[Test] Op '{op_name}' using '{impl.impl_id}' "
+                    f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                )
+
+            return impl.fn
+
+    return BackendWrapper(name)
+
+
+def allclose(a: torch.Tensor, b: torch.Tensor, rtol: float = 1e-5, atol: float = 1e-8) -> bool:
+    return torch.allclose(a, b, rtol=rtol, atol=atol)
+
+
+def compute_relative_error(output: torch.Tensor, reference: torch.Tensor) -> float:
+    diff = (output - reference).abs()
+    relative_error = (diff / (reference.abs() + 1e-10)).mean().item()
+    return relative_error
+
+
+def compute_max_error(output: torch.Tensor, reference: torch.Tensor) -> float:
+    return (output - reference).abs().max().item()
+
+
+class TestCase:
+    def __init__(self, name: str, description: str = ""):
+        self.name = name
+        self.description = description
+        self.passed = 0
+        self.failed = 0
+        self.skipped = 0
+        self.errors: List[str] = []
+
+    def setup(self):
+        pass
+
+    def teardown(self):
+        pass
+
+    def assert_close(
+        self,
+        output: torch.Tensor,
+        reference: torch.Tensor,
+        rtol: float = 1e-5,
+        atol: float = 1e-8,
+        msg: str = "",
+    ):
+        if not allclose(output, reference, rtol, atol):
+            max_err = compute_max_error(output, reference)
+            rel_err = compute_relative_error(output, reference)
+            error_msg = f"{msg}\n  Max error: {max_err:.6e}, Relative error: {rel_err:.6e}"
+            self.errors.append(error_msg)
+            self.failed += 1
+            raise AssertionError(error_msg)
+        self.passed += 1
+
+    def report(self):
+        total = self.passed + self.failed + self.skipped
+        print(f"\n{'='*60}")
+        print(f"Test: {self.name}")
+        if self.description:
+            print(f"Description: {self.description}")
+        print(f"{'='*60}")
+        print(f"Total: {total}, Passed: {self.passed}, Failed: {self.failed}, Skipped: {self.skipped}")
+        if self.errors:
+            print(f"\nErrors:")
+            for i, error in enumerate(self.errors, 1):
+                print(f"  {i}. {error}")
+        print(f"{'='*60}")
+        return self.failed == 0
+
+
+def generate_random_tensor(
+    shape: tuple,
+    dtype: torch.dtype = torch.float32,
+    device: str = "cpu",
+    requires_grad: bool = False,
+) -> torch.Tensor:
+    if dtype in (torch.bfloat16, torch.float16):
+        tensor = torch.randn(shape, dtype=torch.float32, device=device)
+        tensor = tensor.to(dtype=dtype)
+        if requires_grad:
+            tensor.requires_grad_(True)
+    else:
+        tensor = torch.randn(shape, dtype=dtype, device=device, requires_grad=requires_grad)
+    return tensor
+
+
+def generate_test_shapes() -> List[tuple]:
+    return [
+        (2, 4),
+        (8, 16),
+        (32, 64),
+        (2, 4, 8),
+        (4, 8, 16),
+        (2, 4, 8, 16),
+    ]
+
+
+def run_test_on_backends(
+    test_func: Callable,
+    backends: Optional[List[str]] = None,
+    reference_backend: str = "reference",
+) -> Dict[str, bool]:
+    if backends is None:
+        backends = get_available_backends()
+
+    results = {}
+    for backend_name in backends:
+        try:
+            test_func(backend_name)
+            results[backend_name] = True
+            print(f"  ✓ {backend_name}")
+        except Exception as e:
+            results[backend_name] = False
+            print(f"  ✗ {backend_name}: {e}")
+
+    return results
+
+
+def skip_if_backend_unavailable(backend_name: str) -> bool:
+    available = get_available_backends()
+    return backend_name not in available
diff --git a/transformer_engine/plugin/tests/__init__.py b/transformer_engine/plugin/tests/__init__.py
new file mode 100644
index 0000000000..caaec47482
--- /dev/null
+++ b/transformer_engine/plugin/tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+__all__ = []
diff --git a/transformer_engine/plugin/tests/run_all_tests.py b/transformer_engine/plugin/tests/run_all_tests.py
new file mode 100644
index 0000000000..07b8f5032e
--- /dev/null
+++ b/transformer_engine/plugin/tests/run_all_tests.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+import sys
+import torch
+
+from test_activations import ActivationTests
+from test_normalization import NormalizationTests
+from test_operations import OperationsTests
+from test_softmax import SoftmaxTests
+from test_optimizer import OptimizerTests
+from test_flash_attention import FlashAttentionTests
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    print("\n" + "="*70)
+    print(" "*15 + "TEX Interface Backend Tests")
+    print("="*70)
+    print(f"Using device: {device}\n")
+
+    test_suites = [
+        ActivationTests(device=device),
+        NormalizationTests(device=device),
+        OperationsTests(device=device),
+        SoftmaxTests(device=device),
+        OptimizerTests(device=device),
+        FlashAttentionTests(device=device),
+    ]
+
+    results = []
+    for suite in test_suites:
+        success = suite.run_all_tests()
+        results.append((suite.name, success))
+
+    print("\n" + "="*70)
+    print(" "*25 + "Test Summary")
+    print("="*70)
+
+    total_passed = sum(1 for _, success in results if success)
+    total_tests = len(results)
+
+    for name, success in results:
+        status = "✓ PASSED" if success else "✗ FAILED"
+        print(f"  {name:40s} {status}")
+
+    print("="*70)
+    print(f"Total: {total_passed}/{total_tests} test suites passed")
+    print("="*70)
+
+    return 0 if all(success for _, success in results) else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugin/tests/test_activations.py b/transformer_engine/plugin/tests/test_activations.py
new file mode 100644
index 0000000000..6bf573b7cc
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_activations.py
@@ -0,0 +1,557 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+import torch.nn.functional as F
+import sys
+
+from transformer_engine.plugin.test_utils import (
+    get_available_backends,
+    get_backend,
+    TestCase,
+    generate_random_tensor,
+    generate_test_shapes,
+)
+
+
+class ActivationTests(TestCase):
+    def __init__(self, device="cpu"):
+        super().__init__(
+            "Activation Functions",
+            "Test correctness of all activation functions across backends"
+        )
+        self.backends = get_available_backends()
+        self.reference_backend = "reference"
+        self.device = device
+
+    # ==================== Reference implementations ====================
+    def _get_reference_gelu(self, x):
+        return F.gelu(x, approximate='tanh')
+
+    def _get_reference_geglu(self, x):
+        a, b = x.chunk(2, dim=-1)
+        return F.gelu(a, approximate='tanh') * b
+
+    def _get_reference_qgelu(self, x):
+        return x * torch.sigmoid(1.702 * x)
+
+    def _get_reference_qgeglu(self, x):
+        a, b = x.chunk(2, dim=-1)
+        return a * torch.sigmoid(1.702 * a) * b
+
+    def _get_reference_relu(self, x):
+        return F.relu(x)
+
+    def _get_reference_reglu(self, x):
+        a, b = x.chunk(2, dim=-1)
+        return F.relu(a) * b
+
+    def _get_reference_srelu(self, x):
+        return torch.square(F.relu(x))
+
+    def _get_reference_sreglu(self, x):
+        a, b = x.chunk(2, dim=-1)
+        return torch.square(F.relu(a)) * b
+
+    def _get_reference_silu(self, x):
+        return F.silu(x)
+
+    def _get_reference_swiglu(self, x):
+        a, b = x.chunk(2, dim=-1)
+        return F.silu(a) * b
+
+    def _get_reference_clamped_swiglu(self, x, limit=7.0, alpha=1.702):
+        """Reference implementation matching CUDA clamped_swiglu.
+
+        CUDA implementation:
+        - a (activation): clamp to upper bound only: min(a, limit)
+        - b (gate): clamp to [-limit, limit], then add 1
+        - output = (a_clamped * sigmoid(alpha * a_clamped)) * b_clamped
+        """
+        a, b = x.chunk(2, dim=-1)
+        # CUDA only clamps a to upper bound
+        a_clamped = torch.clamp(a, max=limit)
+        # CUDA clamps b to [-limit, limit] and adds 1
+        b_clamped = torch.clamp(b, -limit, limit) + 1
+        return a_clamped * torch.sigmoid(alpha * a_clamped) * b_clamped
+
+    # ==================== Forward tests ====================
+    def test_gelu_forward(self, shape=(4, 8)):
+        print(f"\n  Testing GELU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_gelu(x)
+        self._test_activation_forward("gelu", x, reference)
+
+    def test_geglu_forward(self, shape=(4, 16)):
+        print(f"\n  Testing GEGLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_geglu(x)
+        self._test_activation_forward("geglu", x, reference)
+
+    def test_qgelu_forward(self, shape=(4, 8)):
+        print(f"\n  Testing QGELU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_qgelu(x)
+        self._test_activation_forward("qgelu", x, reference)
+
+    def test_qgeglu_forward(self, shape=(4, 16)):
+        print(f"\n  Testing QGEGLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_qgeglu(x)
+        self._test_activation_forward("qgeglu", x, reference)
+
+    def test_relu_forward(self, shape=(4, 8)):
+        print(f"\n  Testing ReLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_relu(x)
+        self._test_activation_forward("relu", x, reference, rtol=1e-6, atol=1e-8)
+
+    def test_reglu_forward(self, shape=(4, 16)):
+        print(f"\n  Testing ReGLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_reglu(x)
+        self._test_activation_forward("reglu", x, reference, rtol=1e-6, atol=1e-8)
+
+    def test_srelu_forward(self, shape=(4, 8)):
+        print(f"\n  Testing SReLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_srelu(x)
+        self._test_activation_forward("srelu", x, reference)
+
+    def test_sreglu_forward(self, shape=(4, 16)):
+        print(f"\n  Testing SReGLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_sreglu(x)
+        self._test_activation_forward("sreglu", x, reference)
+
+    def test_silu_forward(self, shape=(4, 8)):
+        print(f"\n  Testing SiLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_silu(x)
+        self._test_activation_forward("silu", x, reference)
+
+    def test_swiglu_forward(self, shape=(4, 16)):
+        print(f"\n  Testing SwiGLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_swiglu(x)
+        self._test_activation_forward("swiglu", x, reference)
+
+    def test_clamped_swiglu_forward(self, shape=(4, 16)):
+        print(f"\n  Testing Clamped SwiGLU forward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        reference = self._get_reference_clamped_swiglu(x)
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output = backend.clamped_swiglu(x, None, 7.0, 1.702)
+                self.assert_close(
+                    output, reference, rtol=1e-4, atol=1e-6,
+                    msg=f"clamped_swiglu forward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def _test_activation_forward(self, op_name, x, reference, rtol=1e-4, atol=1e-6):
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                op_fn = getattr(backend, op_name)
+                output = op_fn(x, None)
+                self.assert_close(
+                    output, reference, rtol=rtol, atol=atol,
+                    msg=f"{op_name} forward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    # ==================== Backward tests ====================
+    def test_gelu_backward(self, shape=(4, 8)):
+        print(f"\n  Testing GELU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        y = self._get_reference_gelu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dgelu", x, grad_output, reference_grad)
+
+    def test_geglu_backward(self, shape=(4, 16)):
+        print(f"\n  Testing GEGLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+                                              dtype=torch.float32, device=self.device)
+        y = self._get_reference_geglu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dgeglu", x, grad_output, reference_grad)
+
+    def test_qgelu_backward(self, shape=(4, 8)):
+        print(f"\n  Testing QGELU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        y = self._get_reference_qgelu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dqgelu", x, grad_output, reference_grad)
+
+    def test_qgeglu_backward(self, shape=(4, 16)):
+        print(f"\n  Testing QGEGLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+                                              dtype=torch.float32, device=self.device)
+        y = self._get_reference_qgeglu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dqgeglu", x, grad_output, reference_grad)
+
+    def test_relu_backward(self, shape=(4, 8)):
+        print(f"\n  Testing ReLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        y = self._get_reference_relu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("drelu", x, grad_output, reference_grad)
+
+    def test_reglu_backward(self, shape=(4, 16)):
+        print(f"\n  Testing ReGLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+                                              dtype=torch.float32, device=self.device)
+        y = self._get_reference_reglu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dreglu", x, grad_output, reference_grad)
+
+    def test_srelu_backward(self, shape=(4, 8)):
+        print(f"\n  Testing SReLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        y = self._get_reference_srelu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dsrelu", x, grad_output, reference_grad)
+
+    def test_sreglu_backward(self, shape=(4, 16)):
+        print(f"\n  Testing SReGLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+                                              dtype=torch.float32, device=self.device)
+        y = self._get_reference_sreglu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dsreglu", x, grad_output, reference_grad)
+
+    def test_silu_backward(self, shape=(4, 8)):
+        print(f"\n  Testing SiLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        y = self._get_reference_silu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dsilu", x, grad_output, reference_grad)
+
+    def test_swiglu_backward(self, shape=(4, 16)):
+        print(f"\n  Testing SwiGLU backward with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+                                              dtype=torch.float32, device=self.device)
+        y = self._get_reference_swiglu(x)
+        y.backward(grad_output)
+        reference_grad = x.grad.clone()
+        x.grad = None
+        self._test_activation_backward("dswiglu", x, grad_output, reference_grad)
+
+    def _test_activation_backward(self, op_name, x, grad_output, reference_grad, rtol=1e-4, atol=1e-6):
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                op_fn = getattr(backend, op_name)
+                grad_input = op_fn(grad_output, x.detach(), None)
+                self.assert_close(
+                    grad_input, reference_grad, rtol=rtol, atol=atol,
+                    msg=f"{op_name} backward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    # ==================== Bias + backward tests ====================
+    def test_dbias_dgelu(self, shape=(4, 8)):
+        print(f"\n  Testing dbias_dgelu with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+
+        # Reference: compute dgelu and sum for bias grad
+        y = self._get_reference_gelu(x)
+        y.backward(grad_output)
+        ref_grad_input = x.grad.clone()
+        ref_grad_bias = grad_output.sum(dim=tuple(range(grad_output.ndim - 1)))
+        x.grad = None
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                grad_input, grad_bias = backend.dbias_dgelu(grad_output, x.detach(), None)
+                self.assert_close(
+                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dgelu grad_input mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dgelu grad_bias mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except RuntimeError as e:
+                # CUDA requires a valid quantizer for dbias_d* fused ops
+                if "NoneQuantizer does not support" in str(e):
+                    self.skipped += 1
+                    print(f"    ⊘ {backend_name} (requires FP8 quantizer for fused op)")
+                else:
+                    self.failed += 1
+                    print(f"    ✗ {backend_name}: {e}")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_dbias_dsilu(self, shape=(4, 8)):
+        print(f"\n  Testing dbias_dsilu with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+
+        y = self._get_reference_silu(x)
+        y.backward(grad_output)
+        ref_grad_input = x.grad.clone()
+        ref_grad_bias = grad_output.sum(dim=tuple(range(grad_output.ndim - 1)))
+        x.grad = None
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                grad_input, grad_bias = backend.dbias_dsilu(grad_output, x.detach(), None)
+                self.assert_close(
+                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dsilu grad_input mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dsilu grad_bias mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except RuntimeError as e:
+                # CUDA requires a valid quantizer for dbias_d* fused ops
+                if "NoneQuantizer does not support" in str(e):
+                    self.skipped += 1
+                    print(f"    ⊘ {backend_name} (requires FP8 quantizer for fused op)")
+                else:
+                    self.failed += 1
+                    print(f"    ✗ {backend_name}: {e}")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_dbias_drelu(self, shape=(4, 8)):
+        print(f"\n  Testing dbias_drelu with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+
+        y = self._get_reference_relu(x)
+        y.backward(grad_output)
+        ref_grad_input = x.grad.clone()
+        ref_grad_bias = grad_output.sum(dim=tuple(range(grad_output.ndim - 1)))
+        x.grad = None
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                grad_input, grad_bias = backend.dbias_drelu(grad_output, x.detach(), None)
+                self.assert_close(
+                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_drelu grad_input mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_drelu grad_bias mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except RuntimeError as e:
+                # CUDA requires a valid quantizer for dbias_d* fused ops
+                if "NoneQuantizer does not support" in str(e):
+                    self.skipped += 1
+                    print(f"    ⊘ {backend_name} (requires FP8 quantizer for fused op)")
+                else:
+                    self.failed += 1
+                    print(f"    ✗ {backend_name}: {e}")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_dbias_dqgelu(self, shape=(4, 8)):
+        print(f"\n  Testing dbias_dqgelu with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+
+        y = self._get_reference_qgelu(x)
+        y.backward(grad_output)
+        ref_grad_input = x.grad.clone()
+        ref_grad_bias = grad_output.sum(dim=tuple(range(grad_output.ndim - 1)))
+        x.grad = None
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                grad_input, grad_bias = backend.dbias_dqgelu(grad_output, x.detach(), None)
+                self.assert_close(
+                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dqgelu grad_input mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dqgelu grad_bias mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except RuntimeError as e:
+                # CUDA requires a valid quantizer for dbias_d* fused ops
+                if "NoneQuantizer does not support" in str(e):
+                    self.skipped += 1
+                    print(f"    ⊘ {backend_name} (requires FP8 quantizer for fused op)")
+                else:
+                    self.failed += 1
+                    print(f"    ✗ {backend_name}: {e}")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_dbias_dsrelu(self, shape=(4, 8)):
+        print(f"\n  Testing dbias_dsrelu with shape {shape}")
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+
+        y = self._get_reference_srelu(x)
+        y.backward(grad_output)
+        ref_grad_input = x.grad.clone()
+        ref_grad_bias = grad_output.sum(dim=tuple(range(grad_output.ndim - 1)))
+        x.grad = None
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                grad_input, grad_bias = backend.dbias_dsrelu(grad_output, x.detach(), None)
+                self.assert_close(
+                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dsrelu grad_input mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
+                    msg=f"dbias_dsrelu grad_bias mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except RuntimeError as e:
+                # CUDA requires a valid quantizer for dbias_d* fused ops
+                if "NoneQuantizer does not support" in str(e):
+                    self.skipped += 1
+                    print(f"    ⊘ {backend_name} (requires FP8 quantizer for fused op)")
+                else:
+                    self.failed += 1
+                    print(f"    ✗ {backend_name}: {e}")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def run_all_tests(self):
+        print("\n" + "="*60)
+        print("Testing Activation Functions")
+        print("="*60)
+        print(f"Available backends: {', '.join(self.backends)}")
+
+        shapes = [(4, 8), (8, 16), (2, 4, 8)]
+        glu_shapes = [(4, 16), (8, 32), (2, 4, 16)]
+
+        # Forward tests - non-gated activations
+        for shape in shapes:
+            self.test_gelu_forward(shape)
+            self.test_qgelu_forward(shape)
+            self.test_relu_forward(shape)
+            self.test_srelu_forward(shape)
+            self.test_silu_forward(shape)
+
+        # Forward tests - gated activations
+        for shape in glu_shapes:
+            self.test_geglu_forward(shape)
+            self.test_qgeglu_forward(shape)
+            self.test_reglu_forward(shape)
+            self.test_sreglu_forward(shape)
+            self.test_swiglu_forward(shape)
+            self.test_clamped_swiglu_forward(shape)
+
+        # Backward tests - non-gated activations
+        for shape in shapes:
+            self.test_gelu_backward(shape)
+            self.test_qgelu_backward(shape)
+            self.test_relu_backward(shape)
+            self.test_srelu_backward(shape)
+            self.test_silu_backward(shape)
+
+        # Backward tests - gated activations
+        for shape in glu_shapes:
+            self.test_geglu_backward(shape)
+            self.test_qgeglu_backward(shape)
+            self.test_reglu_backward(shape)
+            self.test_sreglu_backward(shape)
+            self.test_swiglu_backward(shape)
+
+        # Note: dbias_d* tests are skipped because CUDA requires FP8 quantizer
+        # for these fused ops. These will be tested separately with FP8 quantizer.
+
+        return self.report()
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    test_suite = ActivationTests(device=device)
+    success = test_suite.run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugin/tests/test_flash_attention.py b/transformer_engine/plugin/tests/test_flash_attention.py
new file mode 100644
index 0000000000..4dcb83d36b
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_flash_attention.py
@@ -0,0 +1,328 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import math
+import torch
+import torch.nn.functional as F
+
+from transformer_engine.plugin.test_utils import (
+    get_available_backends,
+    get_backend,
+    TestCase,
+    generate_random_tensor,
+)
+
+
+class FlashAttentionTests(TestCase):
+    def __init__(self, device="cpu"):
+        super().__init__(
+            "Flash Attention",
+            "Test correctness of Flash Attention implementation across backends"
+        )
+        self.backends = get_available_backends()
+        self.device = device
+
+    def _reference_attention(
+        self,
+        query,
+        key,
+        value,
+        attn_mask=None,
+        dropout_p=0.0,
+        is_causal=False,
+        scale=None,
+    ):
+        """Reference implementation of scaled dot-product attention
+        Input format: sbhd [seq, batch, heads, dim]
+        """
+        # Convert sbhd to bhsd for computation
+        q = query.permute(1, 2, 0, 3)  # [batch, heads, seq, dim]
+        k = key.permute(1, 2, 0, 3)
+        v = value.permute(1, 2, 0, 3)
+
+        L, S = q.size(-2), k.size(-2)
+        if scale is None:
+            scale_factor = 1 / math.sqrt(q.size(-1))
+        else:
+            scale_factor = scale
+
+        attn_weight = q @ k.transpose(-2, -1) * scale_factor
+
+        if is_causal:
+            causal_mask = torch.triu(
+                torch.full((L, S), float('-inf'), dtype=q.dtype, device=q.device),
+                diagonal=1
+            )
+            attn_weight = attn_weight + causal_mask
+
+        if attn_mask is not None:
+            attn_weight = attn_weight + attn_mask
+
+        attn_weight = F.softmax(attn_weight, dim=-1)
+
+        if dropout_p > 0.0:
+            attn_weight = F.dropout(attn_weight, p=dropout_p, training=True)
+
+        out = attn_weight @ v
+        # Convert bhsd back to sbhd
+        return out.permute(2, 0, 1, 3)  # [seq, batch, heads, dim]
+
+    def test_flash_attention_forward_basic(self, seq_len=16, batch_size=2, num_heads=4, head_dim=32):
+        """Test basic flash attention forward pass with sbhd layout and bf16"""
+        print(f"\n  Testing Flash Attention forward sbhd bf16 (seq={seq_len}, batch={batch_size}, heads={num_heads}, dim={head_dim})")
+
+        # Shape: (seq_len, batch, num_heads, head_dim) - sbhd layout
+        query = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device
+        )
+        key = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device
+        )
+        value = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device
+        )
+
+        scale = 1.0 / math.sqrt(head_dim)
+
+        # Reference attention (compute in float32 for accuracy)
+        reference = self._reference_attention(
+            query.float(), key.float(), value.float(),
+            scale=scale, is_causal=False
+        ).to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                FlashAttentionClass = backend.get_flash_attention_class()
+                flash_attn = FlashAttentionClass(
+                    softmax_scale=scale,
+                    attention_dropout=0.0,
+                    attention_type="self",
+                    deterministic=True,
+                )
+
+                # Run forward pass with sbhd layout
+                output = flash_attn(
+                    query_layer=query,
+                    key_layer=key,
+                    value_layer=value,
+                    attention_mask=None,
+                    qkv_layout="sb3hd",
+                    attn_mask_type="no_mask",
+                    window_size=(-1, -1),  # Required by flash_attn 2.7+
+                )
+
+                # Output shape: sbhd -> view to sb(h*d)
+                expected_shape = (seq_len, batch_size, num_heads * head_dim)
+                if output.shape != expected_shape:
+                    # Try to reshape reference for comparison
+                    reference_flat = reference.contiguous().reshape(seq_len, batch_size, -1)
+                    self.assert_close(
+                        output.float(), reference_flat.float(), rtol=1e-2, atol=1e-2,
+                        msg=f"Flash Attention forward mismatch for {backend_name}"
+                    )
+                else:
+                    reference_flat = reference.contiguous().reshape(seq_len, batch_size, -1)
+                    self.assert_close(
+                        output.float(), reference_flat.float(), rtol=1e-2, atol=1e-2,
+                        msg=f"Flash Attention forward mismatch for {backend_name}"
+                    )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+                import traceback
+                traceback.print_exc()
+
+    def test_flash_attention_forward_causal(self, seq_len=16, batch_size=2, num_heads=4, head_dim=32):
+        """Test flash attention forward pass with causal mask"""
+        print(f"\n  Testing Flash Attention forward causal sbhd bf16 (seq={seq_len}, batch={batch_size}, heads={num_heads}, dim={head_dim})")
+
+        query = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device
+        )
+        key = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device
+        )
+        value = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device
+        )
+
+        scale = 1.0 / math.sqrt(head_dim)
+
+        # Reference attention with causal mask
+        reference = self._reference_attention(
+            query.float(), key.float(), value.float(),
+            scale=scale, is_causal=True
+        ).to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                FlashAttentionClass = backend.get_flash_attention_class()
+                flash_attn = FlashAttentionClass(
+                    softmax_scale=scale,
+                    attention_dropout=0.0,
+                    attention_type="self",
+                    deterministic=True,
+                )
+
+                output = flash_attn(
+                    query_layer=query,
+                    key_layer=key,
+                    value_layer=value,
+                    attention_mask=None,
+                    qkv_layout="sb3hd",
+                    attn_mask_type="causal",
+                    window_size=(-1, -1),  # Required by flash_attn 2.7+
+                )
+
+                reference_flat = reference.contiguous().reshape(seq_len, batch_size, -1)
+                self.assert_close(
+                    output.float(), reference_flat.float(), rtol=1e-2, atol=1e-2,
+                    msg=f"Flash Attention forward causal mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+                import traceback
+                traceback.print_exc()
+
+    def test_flash_attention_backward(self, seq_len=16, batch_size=2, num_heads=4, head_dim=32):
+        """Test flash attention backward pass with sbhd layout, bf16, and causal mask.
+
+        Note: FlagGems backward currently only supports causal attention.
+        """
+        print(f"\n  Testing Flash Attention backward causal sbhd bf16 (seq={seq_len}, batch={batch_size}, heads={num_heads}, dim={head_dim})")
+
+        query = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device, requires_grad=True
+        )
+        key = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device, requires_grad=True
+        )
+        value = generate_random_tensor(
+            (seq_len, batch_size, num_heads, head_dim),
+            dtype=torch.bfloat16, device=self.device, requires_grad=True
+        )
+        # grad_output shape matches output: sb(h*d)
+        grad_output = generate_random_tensor(
+            (seq_len, batch_size, num_heads * head_dim),
+            dtype=torch.bfloat16, device=self.device
+        )
+
+        scale = 1.0 / math.sqrt(head_dim)
+
+        # Reference backward (compute in float32 for accuracy)
+        # Note: FlagGems backward only supports causal attention
+        query_f32 = query.float().detach().requires_grad_(True)
+        key_f32 = key.float().detach().requires_grad_(True)
+        value_f32 = value.float().detach().requires_grad_(True)
+
+        ref_output = self._reference_attention(query_f32, key_f32, value_f32, scale=scale, is_causal=True)
+        ref_output_flat = ref_output.contiguous().reshape(seq_len, batch_size, -1)
+        ref_output_flat.backward(grad_output.float())
+        ref_grad_q = query_f32.grad.clone().to(torch.bfloat16)
+        ref_grad_k = key_f32.grad.clone().to(torch.bfloat16)
+        ref_grad_v = value_f32.grad.clone().to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                FlashAttentionClass = backend.get_flash_attention_class()
+                flash_attn = FlashAttentionClass(
+                    softmax_scale=scale,
+                    attention_dropout=0.0,
+                    attention_type="self",
+                    deterministic=True,
+                )
+
+                # Forward pass
+                q_copy = query.detach().requires_grad_(True)
+                k_copy = key.detach().requires_grad_(True)
+                v_copy = value.detach().requires_grad_(True)
+
+                output = flash_attn(
+                    query_layer=q_copy,
+                    key_layer=k_copy,
+                    value_layer=v_copy,
+                    attention_mask=None,
+                    qkv_layout="sb3hd",
+                    attn_mask_type="causal",
+                    window_size=(-1, -1),  # Required by flash_attn 2.7+
+                )
+
+                # Backward pass
+                output.backward(grad_output)
+
+                # bf16 backward has higher numerical error due to accumulated precision loss
+                self.assert_close(
+                    q_copy.grad.float(), ref_grad_q.float(), rtol=2e-2, atol=2e-2,
+                    msg=f"Flash Attention backward grad_q mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    k_copy.grad.float(), ref_grad_k.float(), rtol=2e-2, atol=2e-2,
+                    msg=f"Flash Attention backward grad_k mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    v_copy.grad.float(), ref_grad_v.float(), rtol=2e-2, atol=2e-2,
+                    msg=f"Flash Attention backward grad_v mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+                import traceback
+                traceback.print_exc()
+
+    def run_all_tests(self):
+        print("\n" + "="*60)
+        print("Testing Flash Attention")
+        print("="*60)
+        print(f"Available backends: {', '.join(self.backends)}")
+
+        # Basic forward tests with sbhd layout and bf16
+        self.test_flash_attention_forward_basic(seq_len=16, batch_size=2, num_heads=4, head_dim=32)
+        self.test_flash_attention_forward_basic(seq_len=32, batch_size=4, num_heads=8, head_dim=64)
+
+        # Causal mask tests
+        self.test_flash_attention_forward_causal(seq_len=16, batch_size=2, num_heads=4, head_dim=32)
+
+        # Backward tests
+        self.test_flash_attention_backward(seq_len=16, batch_size=2, num_heads=4, head_dim=32)
+
+        return self.report()
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    if device != "cuda":
+        print("Warning: Flash Attention tests require CUDA. Skipping.")
+        return 0
+    test_suite = FlashAttentionTests(device=device)
+    success = test_suite.run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugin/tests/test_normalization.py b/transformer_engine/plugin/tests/test_normalization.py
new file mode 100644
index 0000000000..6a6114a398
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_normalization.py
@@ -0,0 +1,238 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+import torch.nn.functional as F
+import sys
+
+from transformer_engine.plugin.test_utils import (
+    get_available_backends,
+    get_backend,
+    TestCase,
+    generate_random_tensor,
+)
+
+
+class NormalizationTests(TestCase):
+    def __init__(self, device="cpu"):
+        super().__init__(
+            "Normalization Functions",
+            "Test correctness of LayerNorm and RMSNorm across backends"
+        )
+        self.backends = get_available_backends()
+        self.eps = 1e-5
+        self.device = device
+
+    def _reference_layernorm_forward(self, x, weight, bias, eps):
+        mean = x.mean(dim=-1, keepdim=True)
+        var = x.var(dim=-1, keepdim=True, unbiased=False)
+        rsigma = torch.rsqrt(var + eps)
+        normalized = (x - mean) * rsigma
+        output = normalized * weight + bias
+        return output, mean.squeeze(-1), rsigma.squeeze(-1)
+
+    def _reference_rmsnorm_forward(self, x, weight, eps):
+        var = (x ** 2).mean(dim=-1, keepdim=True)
+        rsigma = torch.rsqrt(var + eps)
+        normalized = x * rsigma
+        output = normalized * weight
+        return output, None, rsigma.squeeze(-1)
+
+    def test_layernorm_forward(self, shape=(2, 4, 8)):
+        print(f"\n  Testing LayerNorm forward with shape {shape}")
+
+        hidden_size = shape[-1]
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        weight = torch.ones(hidden_size, dtype=torch.float32, device=self.device)
+        bias = torch.zeros(hidden_size, dtype=torch.float32, device=self.device)
+
+        ref_output, ref_mean, ref_rsigma = self._reference_layernorm_forward(
+            x, weight, bias, self.eps
+        )
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output, mean, rsigma = backend.layernorm_fwd(
+                    x, weight, bias, self.eps,
+                    None, None, torch.float32, 0, False
+                )
+                self.assert_close(
+                    output, ref_output, rtol=1e-5, atol=1e-7,
+                    msg=f"LayerNorm forward output mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    mean, ref_mean, rtol=1e-5, atol=1e-7,
+                    msg=f"LayerNorm forward mean mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    rsigma, ref_rsigma, rtol=1e-4, atol=1e-6,
+                    msg=f"LayerNorm forward rsigma mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_layernorm_backward(self, shape=(2, 4, 8)):
+        print(f"\n  Testing LayerNorm backward with shape {shape}")
+
+        hidden_size = shape[-1]
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        weight = torch.ones(hidden_size, dtype=torch.float32, device=self.device, requires_grad=True)
+        bias = torch.zeros(hidden_size, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+
+        output, mean, rsigma = self._reference_layernorm_forward(x, weight, bias, self.eps)
+        output.backward(grad_output)
+        ref_grad_x = x.grad.clone()
+        ref_grad_weight = weight.grad.clone()
+        ref_grad_bias = bias.grad.clone()
+
+        x.grad = None
+        weight.grad = None
+        bias.grad = None
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                x_copy = x.detach()
+                weight_copy = weight.detach()
+
+                grad_x, grad_weight, grad_bias = backend.layernorm_bwd(
+                    grad_output, x_copy, mean.detach(), rsigma.detach(),
+                    weight_copy, 0, False
+                )
+
+                self.assert_close(
+                    grad_x, ref_grad_x, rtol=1e-4, atol=1e-6,
+                    msg=f"LayerNorm backward grad_x mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_weight, ref_grad_weight, rtol=1e-4, atol=1e-6,
+                    msg=f"LayerNorm backward grad_weight mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-5,
+                    msg=f"LayerNorm backward grad_bias mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_rmsnorm_forward(self, shape=(2, 4, 8)):
+        print(f"\n  Testing RMSNorm forward with shape {shape}")
+
+        hidden_size = shape[-1]
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        weight = torch.ones(hidden_size, dtype=torch.float32, device=self.device)
+
+        ref_output, _, ref_rsigma = self._reference_rmsnorm_forward(x, weight, self.eps)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output, _, rsigma = backend.rmsnorm_fwd(
+                    x, weight, self.eps,
+                    None, None, torch.float32, 0, False
+                )
+                self.assert_close(
+                    output, ref_output, rtol=1e-5, atol=1e-7,
+                    msg=f"RMSNorm forward output mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    rsigma, ref_rsigma, rtol=1e-4, atol=1e-6,
+                    msg=f"RMSNorm forward rsigma mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_rmsnorm_backward(self, shape=(2, 4, 8)):
+        print(f"\n  Testing RMSNorm backward with shape {shape}")
+
+        hidden_size = shape[-1]
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        weight = torch.ones(hidden_size, dtype=torch.float32, device=self.device, requires_grad=True)
+        grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+
+        output, _, rsigma = self._reference_rmsnorm_forward(x, weight, self.eps)
+        output.backward(grad_output)
+        ref_grad_x = x.grad.clone()
+        ref_grad_weight = weight.grad.clone()
+
+        x.grad = None
+        weight.grad = None
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                x_copy = x.detach()
+                weight_copy = weight.detach()
+
+                grad_x, grad_weight = backend.rmsnorm_bwd(
+                    grad_output, x_copy, rsigma.detach(),
+                    weight_copy, 0, False, self.eps
+                )
+
+                self.assert_close(
+                    grad_x, ref_grad_x, rtol=1e-4, atol=1e-6,
+                    msg=f"RMSNorm backward grad_x mismatch for {backend_name}"
+                )
+                self.assert_close(
+                    grad_weight, ref_grad_weight, rtol=1e-4, atol=1e-6,
+                    msg=f"RMSNorm backward grad_weight mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def run_all_tests(self):
+        print("\n" + "="*60)
+        print("Testing Normalization Functions")
+        print("="*60)
+        print(f"Available backends: {', '.join(self.backends)}")
+
+        shapes = [
+            (8, 16),
+            (32, 64),
+            (64, 128),
+            (16, 256),
+        ]
+
+        for shape in shapes:
+            self.test_layernorm_forward(shape)
+            self.test_layernorm_backward(shape)
+            self.test_rmsnorm_forward(shape)
+            self.test_rmsnorm_backward(shape)
+
+        return self.report()
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    test_suite = NormalizationTests(device=device)
+    success = test_suite.run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugin/tests/test_operations.py b/transformer_engine/plugin/tests/test_operations.py
new file mode 100644
index 0000000000..0d64c7e753
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_operations.py
@@ -0,0 +1,255 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+import torch
+import torch.nn.functional as F
+import sys
+
+from transformer_engine.plugin.test_utils import (
+    get_available_backends,
+    get_backend,
+    TestCase,
+    generate_random_tensor,
+)
+
+
+class OperationsTests(TestCase):
+    def __init__(self, device="cpu"):
+        super().__init__(
+            "Operations (GEMM, Softmax, Dropout)",
+            "Test correctness of GEMM, Softmax, and Dropout operations"
+        )
+        self.backends = get_available_backends()
+        self.device = device
+
+    def test_gemm_basic(self, M=32, N=64, K=48):
+        print(f"\n  Testing GEMM ({M}x{K}) @ ({K}x{N})")
+
+        A = generate_random_tensor((K, N), dtype=torch.float32, device=self.device)
+        B = generate_random_tensor((M, K), dtype=torch.float32, device=self.device)
+        reference = B @ A
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                D = torch.empty((M, N), dtype=torch.float32, device=self.device)
+                workspace = torch.empty(1024, dtype=torch.uint8, device=self.device)
+
+                output, _, _, _ = backend.generic_gemm(
+                    A, False, B, False, D,
+                    None, torch.float32, None, None,
+                    False, None, False,
+                    workspace, 1024, False, False
+                )
+
+                self.assert_close(
+                    output, reference, rtol=5e-2, atol=1e-2,
+                    msg=f"GEMM output mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_gemm_transpose_a(self, M=32, N=64, K=48):
+        print(f"\n  Testing GEMM transpose A ({N}x{K}).T @ ({M}x{K})")
+
+        A = generate_random_tensor((N, K), dtype=torch.float32, device=self.device)
+        B = generate_random_tensor((M, K), dtype=torch.float32, device=self.device)
+        reference = B @ A.T
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                D = torch.empty((M, N), dtype=torch.float32, device=self.device)
+                workspace = torch.empty(1024, dtype=torch.uint8, device=self.device)
+
+                output, _, _, _ = backend.generic_gemm(
+                    A, True, B, False, D,
+                    None, torch.float32, None, None,
+                    False, None, False,
+                    workspace, 1024, False, False
+                )
+
+                self.assert_close(
+                    output, reference, rtol=5e-2, atol=1e-2,
+                    msg=f"GEMM transpose A mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_gemm_3d(self, B=2, M=16, N=32, K=24):
+        print(f"\n  Testing 3D GEMM ({B}x{M}x{K}) @ ({K}x{N})")
+
+        A = generate_random_tensor((B, M, K), dtype=torch.float32, device=self.device)
+        B_mat = generate_random_tensor((K, N), dtype=torch.float32, device=self.device)
+        reference = torch.matmul(A, B_mat)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                D = torch.empty((B, M, N), dtype=torch.float32, device=self.device)
+                workspace = torch.empty(1024, dtype=torch.uint8, device=self.device)
+
+                output, _, _, _ = backend.generic_gemm(
+                    B_mat, False, A, False, D,
+                    None, torch.float32, None, None,
+                    False, None, False,
+                    workspace, 1024, False, False
+                )
+
+                self.assert_close(
+                    output, reference, rtol=5e-2, atol=1e-2,
+                    msg=f"3D GEMM mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_softmax(self, shape=(2, 4, 8, 16)):
+        print(f"\n  Testing scaled softmax with shape {shape}")
+
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+        scale = 0.125
+        reference = F.softmax(x.float() * scale, dim=-1).to(x.dtype)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output = backend.scaled_softmax_forward(x, scale)
+                self.assert_close(
+                    output, reference, rtol=1e-2, atol=1e-3,
+                    msg=f"Scaled softmax mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_causal_masked_softmax(self, shape=(8, 16, 16)):
+        print(f"\n  Testing causal masked softmax with shape {shape}")
+
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+        scale = 0.125
+        seq_len = shape[-1]
+
+        causal_mask = torch.triu(
+            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=self.device),
+            diagonal=1
+        )
+        reference = F.softmax(x.float() * scale + causal_mask.float(), dim=-1).to(x.dtype)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output = backend.scaled_upper_triang_masked_softmax_forward(x, scale)
+                self.assert_close(
+                    output, reference, rtol=1e-2, atol=1e-3,
+                    msg=f"Causal masked softmax mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_dropout(self, shape=(4, 8, 16)):
+        print(f"\n  Testing dropout with shape {shape}")
+
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+        dropout_prob = 0.1
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output, mask = backend.dropout_fwd(x, dropout_prob)
+
+                num_nonzero = (output != 0).sum().item()
+                total_elements = output.numel()
+                nonzero_ratio = num_nonzero / total_elements
+                expected_ratio = 1.0 - dropout_prob
+
+                assert abs(nonzero_ratio - expected_ratio) < 0.2, \
+                    f"Dropout ratio mismatch for {backend_name}: {nonzero_ratio:.3f} vs {expected_ratio:.3f}"
+
+                assert torch.all(output[output == 0] == 0), \
+                    f"Dropped elements should be zero for {backend_name}"
+
+                expected_scale = 1.0 / (1.0 - dropout_prob)
+                non_zero_output = output[output != 0]
+                non_zero_input = x[output != 0]
+
+                if len(non_zero_output) > 0:
+                    self.assert_close(
+                        non_zero_output, non_zero_input * expected_scale,
+                        rtol=1e-2, atol=1e-3,
+                        msg=f"Dropout scaling mismatch for {backend_name}"
+                    )
+
+                grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+                grad_input = backend.dropout_bwd(grad_output, mask, dropout_prob)
+
+                grad_nonzero_mask = (grad_input != 0)
+                output_nonzero_mask = (output != 0)
+                assert torch.all(grad_nonzero_mask == output_nonzero_mask), \
+                    f"Dropout backward sparsity mismatch for {backend_name}"
+
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def run_all_tests(self):
+        print("\n" + "="*60)
+        print("Testing Operations (GEMM, Softmax, Dropout)")
+        print("="*60)
+        print(f"Available backends: {', '.join(self.backends)}")
+
+        self.test_gemm_basic(M=32, N=64, K=48)
+        self.test_gemm_basic(M=64, N=128, K=96)
+        self.test_gemm_transpose_a(M=32, N=64, K=48)
+        self.test_gemm_3d(B=2, M=16, N=32, K=24)
+
+        self.test_scaled_softmax((4, 8, 16, 16))
+        self.test_scaled_softmax((2, 4, 32, 32))
+        self.test_causal_masked_softmax((16, 32, 32))
+        self.test_causal_masked_softmax((8, 64, 64))
+
+        self.test_dropout((4, 8, 16))
+        self.test_dropout((8, 16, 32))
+
+        return self.report()
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    test_suite = OperationsTests(device=device)
+    success = test_suite.run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugin/tests/test_optimizer.py b/transformer_engine/plugin/tests/test_optimizer.py
new file mode 100644
index 0000000000..d4f72919ef
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_optimizer.py
@@ -0,0 +1,313 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import torch
+import math
+
+from transformer_engine.plugin.test_utils import (
+    get_available_backends,
+    get_backend,
+    TestCase,
+    generate_random_tensor,
+)
+
+
+class OptimizerTests(TestCase):
+    def __init__(self, device="cpu"):
+        super().__init__(
+            "Optimizer Operations",
+            "Test correctness of multi_tensor optimizer operations across backends"
+        )
+        self.backends = get_available_backends()
+        self.device = device
+
+    def _reference_multi_tensor_l2norm(self, tensors, per_tensor=False):
+        """Reference implementation for multi_tensor_l2norm"""
+        if per_tensor:
+            return [torch.norm(t.float(), p=2) for t in tensors]
+        else:
+            total_norm_sq = sum(torch.norm(t.float(), p=2) ** 2 for t in tensors)
+            return torch.sqrt(total_norm_sq)
+
+    def test_multi_tensor_scale(self, num_tensors=4, shape=(64, 128)):
+        print(f"\n  Testing multi_tensor_scale with {num_tensors} tensors of shape {shape}")
+
+        scale = 0.5
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                # Create input tensors
+                input_tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                                 for _ in range(num_tensors)]
+                # Create output tensors (will be filled by the function)
+                output_tensors = [torch.empty_like(t) for t in input_tensors]
+                # Create reference tensors
+                ref_tensors = [t.clone() * scale for t in input_tensors]
+
+                # Apply backend scaling: tensor_lists = [input_tensors, output_tensors]
+                noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
+                backend.multi_tensor_scale(
+                    chunk_size=2048,
+                    noop_flag=noop_flag,
+                    tensor_lists=[input_tensors, output_tensors],
+                    scale=scale
+                )
+
+                # Compare results
+                for i, (output, reference) in enumerate(zip(output_tensors, ref_tensors)):
+                    self.assert_close(
+                        output, reference, rtol=1e-5, atol=1e-7,
+                        msg=f"multi_tensor_scale tensor {i} mismatch for {backend_name}"
+                    )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_multi_tensor_l2norm(self, num_tensors=4, shape=(64, 128)):
+        print(f"\n  Testing multi_tensor_l2norm with {num_tensors} tensors of shape {shape}")
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                           for _ in range(num_tensors)]
+
+                # Reference computation
+                ref_norm = self._reference_multi_tensor_l2norm(tensors, per_tensor=False)
+
+                # Backend computation
+                noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
+                output_norm = backend.multi_tensor_l2norm(
+                    chunk_size=2048,
+                    noop_flag=noop_flag,
+                    tensor_lists=[tensors],
+                    per_tensor=False
+                )
+
+                # CUDA backend returns tuple (norm, per_tensor_norms), extract the first element
+                if isinstance(output_norm, tuple):
+                    output_norm = output_norm[0]
+
+                self.assert_close(
+                    output_norm, ref_norm, rtol=1e-4, atol=1e-6,
+                    msg=f"multi_tensor_l2norm total norm mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_multi_tensor_l2norm_per_tensor(self, num_tensors=4, shape=(64, 128)):
+        print(f"\n  Testing multi_tensor_l2norm per_tensor with {num_tensors} tensors of shape {shape}")
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                           for _ in range(num_tensors)]
+
+                # Reference computation
+                ref_norms = self._reference_multi_tensor_l2norm(tensors, per_tensor=True)
+
+                # Backend computation
+                noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
+                output_norms = backend.multi_tensor_l2norm(
+                    chunk_size=2048,
+                    noop_flag=noop_flag,
+                    tensor_lists=[tensors],
+                    per_tensor=True
+                )
+
+                # CUDA backend returns tuple (total_norm, per_tensor_norms), extract second element
+                if isinstance(output_norms, tuple):
+                    output_norms = output_norms[1]
+
+                for i, (output, reference) in enumerate(zip(output_norms, ref_norms)):
+                    self.assert_close(
+                        output, reference, rtol=1e-4, atol=1e-6,
+                        msg=f"multi_tensor_l2norm per_tensor {i} mismatch for {backend_name}"
+                    )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_multi_tensor_adam(self, num_tensors=3, shape=(32, 64)):
+        print(f"\n  Testing multi_tensor_adam with {num_tensors} tensors of shape {shape}")
+
+        lr = 0.001
+        beta1 = 0.9
+        beta2 = 0.999
+        eps = 1e-8
+        step = 1
+        weight_decay = 0.01
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                # Create tensors for backend test
+                params = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                          for _ in range(num_tensors)]
+                grads = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                         for _ in range(num_tensors)]
+                exp_avgs = [torch.zeros_like(p) for p in params]
+                exp_avg_sqs = [torch.zeros_like(p) for p in params]
+
+                # Create reference tensors with same values
+                ref_params = [p.clone() for p in params]
+                ref_grads = [g.clone() for g in grads]
+                ref_exp_avgs = [torch.zeros_like(p) for p in params]
+                ref_exp_avg_sqs = [torch.zeros_like(p) for p in params]
+
+                # Apply reference Adam step (matching the torch implementation)
+                bias_correction1 = 1 - beta1 ** step
+                bias_correction2 = 1 - beta2 ** step
+
+                for p, g, m, v in zip(ref_params, ref_grads, ref_exp_avgs, ref_exp_avg_sqs):
+                    # AdamW style: weight decay applied to param first
+                    p.mul_(1 - lr * weight_decay)
+
+                    # Update biased first moment estimate
+                    m.mul_(beta1).add_(g, alpha=1 - beta1)
+                    # Update biased second raw moment estimate
+                    v.mul_(beta2).addcmul_(g, g, value=1 - beta2)
+
+                    # Compute bias-corrected estimates
+                    corrected_m = m / bias_correction1
+                    corrected_v = v / bias_correction2
+
+                    # Update parameters
+                    denom = corrected_v.sqrt().add_(eps)
+                    p.addcdiv_(corrected_m, denom, value=-lr)
+
+                # Apply backend Adam step
+                noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
+                backend.multi_tensor_adam(
+                    chunk_size=2048,
+                    noop_flag=noop_flag,
+                    tensor_lists=[grads, params, exp_avgs, exp_avg_sqs],
+                    lr=lr,
+                    beta1=beta1,
+                    beta2=beta2,
+                    eps=eps,
+                    step=step,
+                    mode=1,  # AdamW mode
+                    bias_correction=1,
+                    weight_decay=weight_decay
+                )
+
+                # Compare results with relaxed tolerance
+                for i, (output, reference) in enumerate(zip(params, ref_params)):
+                    self.assert_close(
+                        output, reference, rtol=1e-3, atol=1e-5,
+                        msg=f"multi_tensor_adam param {i} mismatch for {backend_name}"
+                    )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def _reference_multi_tensor_unscale_l2norm(self, tensors, inv_scale, per_tensor=False):
+        """Reference implementation for multi_tensor_unscale_l2norm.
+
+        Computes L2 norm of tensors after unscaling.
+        Note: scale parameter is actually inv_scale (1/loss_scale).
+        Unscaling means multiplying by inv_scale (= dividing by loss_scale).
+        """
+        inv_scale_value = inv_scale.item() if isinstance(inv_scale, torch.Tensor) else inv_scale
+        # Unscale (multiply by inv_scale) and compute L2 norm
+        if per_tensor:
+            return [torch.norm(t.float() * inv_scale_value, p=2) for t in tensors]
+        else:
+            total_norm_sq = sum(torch.norm(t.float() * inv_scale_value, p=2) ** 2 for t in tensors)
+            return torch.sqrt(total_norm_sq)
+
+    def test_multi_tensor_unscale_l2norm(self, num_tensors=4, shape=(64, 128)):
+        print(f"\n  Testing multi_tensor_unscale_l2norm with {num_tensors} tensors of shape {shape}")
+
+        # Note: scale parameter is actually inv_scale (1/loss_scale)
+        # For AMP with loss_scale=1024, inv_scale would be 1/1024
+        inv_scale_value = 0.5  # equivalent to loss_scale = 2.0
+        tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                   for _ in range(num_tensors)]
+        noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
+        inv_scale = torch.tensor([inv_scale_value], dtype=torch.float32, device=self.device)
+
+        # Compute mathematical reference
+        reference_norm = self._reference_multi_tensor_unscale_l2norm(tensors, inv_scale, per_tensor=False)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output_norm = backend.multi_tensor_unscale_l2norm(
+                    chunk_size=2048,
+                    noop_flag=noop_flag,
+                    tensor_lists=[tensors],
+                    scale=inv_scale,
+                    per_tensor=False
+                )
+
+                # CUDA backend returns tuple (norm, per_tensor_norms), extract the first element
+                if isinstance(output_norm, tuple):
+                    output_norm = output_norm[0]
+
+                self.assert_close(
+                    output_norm, reference_norm, rtol=1e-4, atol=1e-6,
+                    msg=f"multi_tensor_unscale_l2norm mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def run_all_tests(self):
+        print("\n" + "="*60)
+        print("Testing Optimizer Operations")
+        print("="*60)
+        print(f"Available backends: {', '.join(self.backends)}")
+
+        # multi_tensor_scale tests
+        self.test_multi_tensor_scale(num_tensors=4, shape=(64, 128))
+        self.test_multi_tensor_scale(num_tensors=8, shape=(128, 256))
+
+        # multi_tensor_l2norm tests
+        self.test_multi_tensor_l2norm(num_tensors=4, shape=(64, 128))
+        self.test_multi_tensor_l2norm_per_tensor(num_tensors=4, shape=(64, 128))
+
+        # multi_tensor_unscale_l2norm tests
+        self.test_multi_tensor_unscale_l2norm(num_tensors=4, shape=(64, 128))
+
+        # multi_tensor_adam tests
+        self.test_multi_tensor_adam(num_tensors=3, shape=(32, 64))
+
+        return self.report()
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    test_suite = OptimizerTests(device=device)
+    success = test_suite.run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugin/tests/test_softmax.py b/transformer_engine/plugin/tests/test_softmax.py
new file mode 100644
index 0000000000..f1272a4773
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_softmax.py
@@ -0,0 +1,354 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import torch
+import torch.nn.functional as F
+
+from transformer_engine.plugin.test_utils import (
+    get_available_backends,
+    get_backend,
+    TestCase,
+    generate_random_tensor,
+)
+
+
+class SoftmaxTests(TestCase):
+    def __init__(self, device="cpu"):
+        super().__init__(
+            "Softmax Operations",
+            "Test correctness of all softmax operations across backends"
+        )
+        self.backends = get_available_backends()
+        self.device = device
+
+    def test_scaled_softmax_forward(self, shape=(2, 4, 8, 16)):
+        print(f"\n  Testing scaled softmax forward with shape {shape}")
+
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+        scale = 0.125
+        reference = F.softmax(x.float() * scale, dim=-1).to(x.dtype)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output = backend.scaled_softmax_forward(x, scale)
+                self.assert_close(
+                    output, reference, rtol=1e-2, atol=1e-3,
+                    msg=f"Scaled softmax forward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_softmax_backward(self, shape=(2, 4, 8, 16)):
+        print(f"\n  Testing scaled softmax backward with shape {shape}")
+
+        # Use bf16 for all computation to match backend precision
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        scale = 0.125
+        grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+
+        # Compute reference gradient using autograd (in float32 for precision, then convert)
+        x_f32 = x.float().detach().requires_grad_(True)
+        softmax_output_f32 = F.softmax(x_f32 * scale, dim=-1)
+        loss = (softmax_output_f32 * grad_output.float()).sum()
+        loss.backward()
+        reference_grad = x_f32.grad.clone()
+
+        # Get softmax output in bf16 for backend
+        softmax_out_test = softmax_output_f32.detach().to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                # Clone inputs as some backends may modify them in-place
+                grad_input = backend.scaled_softmax_backward(
+                    grad_output.clone(), softmax_out_test.clone(), scale
+                )
+                self.assert_close(
+                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
+                    msg=f"Scaled softmax backward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_masked_softmax_forward(self, shape=(2, 4, 8, 16)):
+        print(f"\n  Testing scaled masked softmax forward with shape {shape}")
+
+        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+        scale = 0.125
+
+        # Create boolean mask and corresponding masks
+        batch = shape[0]
+        seq_q, seq_k = shape[-2], shape[-1]
+        bool_mask = torch.rand((batch, 1, seq_q, seq_k), device=self.device) > 0.5
+
+        # CUDA uses uint8 mask (1=masked, 0=unmasked)
+        uint8_mask = bool_mask.to(torch.uint8)
+
+        # Additive mask for reference computation
+        additive_mask = torch.zeros((batch, 1, seq_q, seq_k), dtype=x.dtype, device=self.device)
+        additive_mask = additive_mask.masked_fill(bool_mask, float('-inf'))
+        additive_mask_expanded = additive_mask.expand(shape)
+
+        # Reference: F.softmax(x * scale + additive_mask, dim=-1)
+        reference = F.softmax(x * scale + additive_mask_expanded, dim=-1)
+
+        # Use bf16 for all backends
+        x_test = x.to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output = backend.scaled_masked_softmax_forward(x_test, uint8_mask, scale)
+                self.assert_close(
+                    output.float(), reference.float(), rtol=1e-2, atol=1e-3,
+                    msg=f"Scaled masked softmax forward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_masked_softmax_backward(self, shape=(2, 4, 8, 16)):
+        print(f"\n  Testing scaled masked softmax backward with shape {shape}")
+
+        # Use bf16 for all computation
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        scale = 0.125
+        grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+
+        # Compute reference gradient using autograd (in float32 for precision)
+        x_f32 = x.float().detach().requires_grad_(True)
+        softmax_output_f32 = F.softmax(x_f32 * scale, dim=-1)
+        loss = (softmax_output_f32 * grad_output.float()).sum()
+        loss.backward()
+        reference_grad = x_f32.grad.clone()
+
+        # Get softmax output in bf16 for backend
+        softmax_out_test = softmax_output_f32.detach().to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                # Clone inputs as some backends may modify them in-place
+                grad_input = backend.scaled_masked_softmax_backward(
+                    grad_output.clone(), softmax_out_test.clone(), scale
+                )
+                self.assert_close(
+                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
+                    msg=f"Scaled masked softmax backward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_upper_triang_masked_softmax_forward(self, shape=(8, 16, 16)):
+        print(f"\n  Testing scaled upper triang masked softmax forward with shape {shape}")
+
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+        scale = 0.125
+        seq_len = shape[-1]
+
+        causal_mask = torch.triu(
+            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=self.device),
+            diagonal=1
+        )
+        reference = F.softmax(x.float() * scale + causal_mask.float(), dim=-1).to(x.dtype)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output = backend.scaled_upper_triang_masked_softmax_forward(x, scale)
+                self.assert_close(
+                    output, reference, rtol=1e-2, atol=1e-3,
+                    msg=f"Scaled upper triang masked softmax forward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_upper_triang_masked_softmax_backward(self, shape=(8, 16, 16)):
+        print(f"\n  Testing scaled upper triang masked softmax backward with shape {shape}")
+
+        # Use bf16 for all computation
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        scale = 0.125
+        seq_len = shape[-1]
+        grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+
+        causal_mask = torch.triu(
+            torch.full((seq_len, seq_len), float('-inf'), dtype=torch.float32, device=self.device),
+            diagonal=1
+        )
+
+        # Compute reference gradient using autograd (in float32 for precision)
+        x_f32 = x.float().detach().requires_grad_(True)
+        softmax_output_f32 = F.softmax(x_f32 * scale + causal_mask, dim=-1)
+        loss = (softmax_output_f32 * grad_output.float()).sum()
+        loss.backward()
+        reference_grad = x_f32.grad.clone()
+
+        # Get softmax output in bf16 for backend
+        softmax_out_test = softmax_output_f32.detach().to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                # Clone inputs as some backends may modify them in-place
+                grad_input = backend.scaled_upper_triang_masked_softmax_backward(
+                    grad_output.clone(), softmax_out_test.clone(), scale
+                )
+                self.assert_close(
+                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
+                    msg=f"Scaled upper triang masked softmax backward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_aligned_causal_masked_softmax_forward(self, shape=(2, 4, 16, 16)):
+        """Test scaled aligned causal masked softmax forward.
+
+        Note: CUDA backend requires 4D tensor (batch, heads, seq, seq).
+        """
+        print(f"\n  Testing scaled aligned causal masked softmax forward with shape {shape}")
+
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+        scale = 0.125
+        seq_len = shape[-1]
+
+        # Aligned causal mask (lower triangular)
+        causal_mask = torch.triu(
+            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=self.device),
+            diagonal=1
+        )
+        reference = F.softmax(x.float() * scale + causal_mask.float(), dim=-1).to(x.dtype)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                output = backend.scaled_aligned_causal_masked_softmax_forward(x, scale)
+                self.assert_close(
+                    output, reference, rtol=1e-2, atol=1e-3,
+                    msg=f"Scaled aligned causal masked softmax forward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def test_scaled_aligned_causal_masked_softmax_backward(self, shape=(2, 4, 16, 16)):
+        """Test scaled aligned causal masked softmax backward.
+
+        Note: All backends use bf16 for consistency.
+        """
+        print(f"\n  Testing scaled aligned causal masked softmax backward with shape {shape}")
+
+        # Use bf16 for all computation
+        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        scale = 0.125
+        seq_len = shape[-1]
+        grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+
+        causal_mask = torch.triu(
+            torch.full((seq_len, seq_len), float('-inf'), dtype=torch.float32, device=self.device),
+            diagonal=1
+        )
+
+        # Compute reference gradient using autograd (in float32 for precision)
+        x_f32 = x.float().detach().requires_grad_(True)
+        softmax_output_f32 = F.softmax(x_f32 * scale + causal_mask, dim=-1)
+        loss = (softmax_output_f32 * grad_output.float()).sum()
+        loss.backward()
+        reference_grad = x_f32.grad.clone()
+
+        # Get softmax output in bf16 for backend
+        softmax_out_test = softmax_output_f32.detach().to(torch.bfloat16)
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                # Clone inputs as some backends may modify them in-place
+                grad_input = backend.scaled_aligned_causal_masked_softmax_backward(
+                    grad_output.clone(), softmax_out_test.clone(), scale
+                )
+                self.assert_close(
+                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
+                    msg=f"Scaled aligned causal masked softmax backward mismatch for {backend_name}"
+                )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
+    def run_all_tests(self):
+        print("\n" + "="*60)
+        print("Testing Softmax Operations")
+        print("="*60)
+        print(f"Available backends: {', '.join(self.backends)}")
+
+        # Scaled softmax tests
+        self.test_scaled_softmax_forward((4, 8, 16, 16))
+        self.test_scaled_softmax_forward((2, 4, 32, 32))
+        self.test_scaled_softmax_backward((4, 8, 16, 16))
+        self.test_scaled_softmax_backward((2, 4, 32, 32))
+
+        # Masked softmax tests
+        self.test_scaled_masked_softmax_forward((4, 8, 16, 16))
+        self.test_scaled_masked_softmax_backward((4, 8, 16, 16))
+
+        # Upper triangular (causal) masked softmax tests
+        self.test_scaled_upper_triang_masked_softmax_forward((16, 32, 32))
+        self.test_scaled_upper_triang_masked_softmax_forward((8, 64, 64))
+        self.test_scaled_upper_triang_masked_softmax_backward((16, 32, 32))
+        self.test_scaled_upper_triang_masked_softmax_backward((8, 64, 64))
+
+        # Aligned causal masked softmax tests (4D tensor required by CUDA)
+        self.test_scaled_aligned_causal_masked_softmax_forward((2, 4, 32, 32))
+        self.test_scaled_aligned_causal_masked_softmax_backward((2, 4, 32, 32))
+
+        return self.report()
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    test_suite = SoftmaxTests(device=device)
+    success = test_suite.run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/plugins/backend.py b/transformer_engine/plugins/backend.py
deleted file mode 100644
index 6a3e9a589a..0000000000
--- a/transformer_engine/plugins/backend.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-import os
-import torch
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-from .register import get_backend, get_selected_backend, register_backend
-from .logger import get_logger
-logger = get_logger()
-
-from .import_utils import have_flag_gems
-
-HAVE_FLAG_GEMS = have_flag_gems()
-
-class BackendDispatch:
-    """
-    Transformer Engine Backend that routes operations to appropriate implementations.
-    
-    Uses caching to avoid repeated flag checks and backend lookups for the same operation.
-    """
-    
-    def __init__(self):
-        """Initialize the backend with an empty implementation cache."""
-        # Cache for operation implementations: {operation: impl}
-        self._impl_cache: Dict[str, Any] = {}
-    
-    def _get_impl(self, operation: str):
-        """
-        Get the implementation for an operation based on flags.
-        Falls back to native if the selected backend doesn't have the operation.
-        Uses caching to avoid repeated lookups.
-        
-        Args:
-            operation: Name of the operation (e.g., "gemm", "rmsnorm_fwd")
-        
-        Returns:
-            The implementation function/class to use
-        
-        Raises:
-            RuntimeError: If native backend doesn't have the operation
-        """
-        # Check cache first
-        if operation in self._impl_cache:
-            return self._impl_cache[operation]
-        
-        # Get selected backend based on global environment variable
-        selected_backend = get_selected_backend()
-        native_backend = get_backend("native")
-        
-        # Try to get implementation from selected backend, fallback to native if not found
-        impl = selected_backend.get(operation)
-        if impl is None:
-            logger.debug(
-                f"Backend '{selected_backend.name}' doesn't have '{operation}', "
-                f"falling back to native"
-            )
-            impl = native_backend.get(operation)
-            if impl is None:
-                raise RuntimeError(
-                    f"Operation '{operation}' is not registered in native backend. "
-                    f"Available operations: {sorted(native_backend._implementations.keys())}"
-                )
-        
-        # Cache the implementation for future use
-        logger.info(f"Backend '{selected_backend.name}' use implementation of '{operation}' for training")
-        self._impl_cache[operation] = impl
-        
-        return impl
-    
-    def _reset_cache_to_native(self, operation: str):
-        # Check cache first
-        if operation in self._impl_cache:
-            # Get native backend
-            native_backend = get_backend("native")
-            impl = native_backend.get(operation)
-            if impl is None:
-                raise RuntimeError(
-                    f"Operation '{operation}' is not registered in native backend. "
-                    f"Available operations: {sorted(native_backend._implementations.keys())}"
-                )
-            # Cache the implementation for future use
-            self._impl_cache[operation] = impl
-
-    def clear_cache(self):
-        """Clear the implementation cache. Useful if flags change at runtime."""
-        self._impl_cache.clear()
-        logger.debug("Cleared implementation cache")
-
-    def gemm(self, *args, **kwargs):
-        """GEMM operation with automatic fallback to native."""
-        impl = self._get_impl("gemm")
-        try:
-            return impl(*args, **kwargs)
-        except Exception as e:
-            logger.warning(f"GEMM implementation failed, falling back to native: {e}")
-            self._reset_cache_to_native("gemm")
-            native_backend = get_backend("native")
-            return native_backend.get("gemm")(*args, **kwargs)
-    
-    def apply_normalization(self, *args, **kwargs):
-        """Apply normalization with automatic fallback to native."""
-        impl = self._get_impl("apply_normalization")
-        try:
-            return impl(*args, **kwargs)
-        except Exception as e:
-            logger.warning(f"Apply Normalization implementation failed, falling back to native: {e}")
-            self._reset_cache_to_native("apply_normalization")
-            native_backend = get_backend("native")
-            return native_backend.get("apply_normalization")(*args, **kwargs)
-    
-    def rmsnorm_fwd(self, *args, **kwargs):
-        """RMSNorm forward pass with automatic fallback to native."""
-        impl = self._get_impl("rmsnorm_fwd")
-        try:
-            return impl(*args, **kwargs)
-        except Exception as e:
-            logger.warning(f"RmsNorm FWD implementation failed, falling back to native: {e}")
-            self._reset_cache_to_native("rmsnorm_fwd")
-            native_backend = get_backend("native")
-            return native_backend.get("rmsnorm_fwd")(*args, **kwargs)
-    
-    def rmsnorm_bwd(self, *args, **kwargs):
-        """RMSNorm backward pass with automatic fallback to native."""
-        impl = self._get_impl("rmsnorm_bwd")
-        try:
-            return impl(*args, **kwargs)
-        except Exception as e:
-            logger.warning(f"RmsNorm BWD implementation failed, falling back to native: {e}")
-            self._reset_cache_to_native("rmsnorm_bwd")
-            native_backend = get_backend("native")
-            trimmed_args = args[:-1]  # cut eps
-            return native_backend.get("rmsnorm_bwd")(*trimmed_args, **kwargs)
-    
-    def multi_tensor_adam(self):
-        """Multi-tensor Adam optimizer with automatic fallback to native."""
-        impl = self._get_impl("adam")
-        try:
-            return impl
-        except Exception as e:
-            logger.warning(f"Adam implementation failed, falling back to native: {e}")
-            self._reset_cache_to_native("adam")
-            native_backend = get_backend("native")
-            return native_backend.get("adam")
-    
-    def flash_attention(self, *args, **kwargs):
-        """Flash Attention with automatic fallback to native."""
-        flash_attention_instance = args[0]
-        trimmed_args = args[1:]
-        native_impl = get_backend("native").get("flash_attention")
-        try:
-            selected_impl = self._get_impl("flash_attention")
-            flash_attention_instance.forward = selected_impl.forward.__get__(flash_attention_instance, native_impl)
-            return flash_attention_instance(*trimmed_args, **kwargs)
-        except Exception as e:
-            logger.warning(f"Flash Attention Forward implementation failed, falling back to native: {e}")
-            self._reset_cache_to_native("flash_attention")
-            flash_attention_instance.forward = native_impl.forward.__get__(flash_attention_instance, native_impl)
-            return flash_attention_instance(*trimmed_args, **kwargs)
-
-
-# Backend initialization state
-_backends_initialized = False
-_backend_instance = None
-
-def _initialize_backends():
-    """
-    Initialize all backend registrations.
-    This function is called automatically on first use.
-    """
-    global _backends_initialized, _backend_instance
-    
-    if _backends_initialized:
-        return
-    
-    from .backend_native import register_backend_native
-    register_backend_native()
-    if HAVE_FLAG_GEMS:
-        from .backend_fl import register_backend_fl
-        register_backend_fl()
-    
-    _backend_instance = BackendDispatch()
-    _backends_initialized = True
-    
-    logger.info("Backend system initialized successfully")
-
-# Create backend instance on module import
-_initialize_backends()
-backend = _backend_instance
diff --git a/transformer_engine/plugins/backend_fl.py b/transformer_engine/plugins/backend_fl.py
deleted file mode 100644
index fb73dff8e8..0000000000
--- a/transformer_engine/plugins/backend_fl.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-import os
-import torch
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-from .import_utils import safety_import
-from .register import register_backend
-from .logger import get_logger
-logger = get_logger()
-
-
-### GEMM
-general_gemm_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'general_gemm_fl')
-### RMSNORM
-apply_normalization_fl = safety_import('transformer_engine.plugins.module._common', 'apply_normalization_fl')
-rmsnorm_bwd_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'rmsnorm_bwd_fl')
-rmsnorm_fwd_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'rmsnorm_fwd_fl')
-### AdamW
-multi_tensor_adam_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'multi_tensor_adam_fl')
-### Flash-Attn
-# Use lazy=True to avoid circular imports
-FlashAttentionFL = safety_import(
-    'transformer_engine.plugins.attention.dot_product_attention.backends',
-    'FlashAttentionFL',
-    lazy=True
-)
-
-def register_backend_fl():
-    # Register TE-FL backend
-    register_backend("te_fl", {
-        "gemm": general_gemm_fl,
-        "apply_normalization": apply_normalization_fl,
-        "rmsnorm_fwd": rmsnorm_fwd_fl,
-        "rmsnorm_bwd": rmsnorm_bwd_fl,
-        "adam": multi_tensor_adam_fl,
-        "flash_attention": FlashAttentionFL,
-    })
diff --git a/transformer_engine/plugins/backend_native.py b/transformer_engine/plugins/backend_native.py
deleted file mode 100644
index b9a4f5b13a..0000000000
--- a/transformer_engine/plugins/backend_native.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-import os
-import torch
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-from .import_utils import safety_import
-from .register import register_backend
-from .logger import get_logger
-logger = get_logger()
-
-
-### GEMM
-general_gemm_native = safety_import('transformer_engine.pytorch.cpp_extensions', 'general_gemm')
-### RMSNORM
-apply_normalization_native = safety_import('transformer_engine.pytorch.module._common', 'apply_normalization')
-rmsnorm_bwd_native = safety_import('transformer_engine_torch', 'rmsnorm_bwd')
-rmsnorm_fwd_native = safety_import('transformer_engine_torch', 'rmsnorm_fwd')
-### AdamW
-multi_tensor_adam_native = safety_import('transformer_engine_torch', 'multi_tensor_adam')
-### Flash-Attn
-# Use lazy=True to avoid circular imports
-FlashAttentionNative = safety_import(
-    'transformer_engine.pytorch.attention.dot_product_attention.backends',
-    'FlashAttention',
-    lazy=True
-)
-
-# Register native backend
-def register_backend_native():
-    # Note: native_rmsnorm_bwd doesn't take eps as the last argument, so we wrap it
-    def rmsnorm_bwd_native_wrapper(*args, **kwargs):
-        return rmsnorm_bwd_native(*args[:-1], **kwargs)
-    register_backend("native", {
-        "gemm": general_gemm_native,
-        "apply_normalization": apply_normalization_native,
-        "rmsnorm_fwd": rmsnorm_fwd_native,
-        "rmsnorm_bwd": rmsnorm_bwd_native_wrapper,
-        "adam": multi_tensor_adam_native,
-        "flash_attention": FlashAttentionNative,
-    })
diff --git a/transformer_engine/plugins/cpp_extensions/fused_adam.py b/transformer_engine/plugins/cpp_extensions/fused_adam.py
deleted file mode 100644
index d7c9a09baa..0000000000
--- a/transformer_engine/plugins/cpp_extensions/fused_adam.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-from itertools import chain
-from typing import Optional, List, Union
-import warnings
-import os
-
-import torch
-
-def multi_tensor_adam_fl(
-    chunk_size: int,
-    noop_flag: torch.Tensor,
-    tensor_lists: List[List[torch.Tensor]],
-    lr: float,
-    beta1: float,
-    beta2: float,
-    eps: float,
-    step: int,
-    mode: int,
-    bias_correction: int,
-    weight_decay: float,
-    inv_scale: Optional[float] = 1.0,
-    out_dtype: Optional[torch.dtype] = None,
-) -> None:
-
-    num_lists = len(tensor_lists)
-    assert num_lists in [4, 5], f"Expected 4 or 5 tensor lists, got {num_lists}"
-
-    num_tensors = len(tensor_lists[0])
-    assert num_tensors > 0, "No tensors provided"
-
-    for i, lst in enumerate(tensor_lists):
-        assert len(lst) == num_tensors, f"List {i} has {len(lst)} tensors, expected {num_tensors}"
-
-    bias_correction1 = 1.0
-    bias_correction2 = 1.0
-    if bias_correction == 1:
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
-
-    is_adamw = (mode == 1)
-
-    for i in range(num_tensors):
-        g = tensor_lists[0][i]  # grad
-        p = tensor_lists[1][i]  # param
-        m = tensor_lists[2][i]  #
-        v = tensor_lists[3][i]  #
-        p_master = tensor_lists[4][i] if num_lists == 5 else None
-
-        if not g.is_contiguous():
-            g = g.contiguous()
-
-        if inv_scale is not None and inv_scale != 1.0:
-            g = g * inv_scale
-
-        m.mul_(beta1).add_(g, alpha=1 - beta1)
-        # v.mul_(beta2).addcmul_(g, g, value=1 - beta2)
-        v.mul_(beta2).add_(g.mul(g).mul_(1 - beta2))
-
-        m_corr = m.clone()
-        v_corr = v.clone()
-        if bias_correction == 1:
-            m_corr = m_corr / bias_correction1
-            v_corr = v_corr / bias_correction2
-
-        update = m_corr / (v_corr.sqrt() + eps)
-
-        if is_adamw:
-            p.data.mul_(1 - lr * weight_decay)
-        else:
-            update.add_(p, alpha=weight_decay)
-
-        p.data.add_(update, alpha=-lr)
-
-        if p_master is not None:
-            p_master.data.copy_(p.data)
-            out_dtype = p_master.dtype if out_dtype is None else out_dtype
-            p.data = p.data.to(out_dtype)
diff --git a/transformer_engine/plugins/cpp_extensions/gemm.py b/transformer_engine/plugins/cpp_extensions/gemm.py
deleted file mode 100644
index bceff8bc63..0000000000
--- a/transformer_engine/plugins/cpp_extensions/gemm.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-from typing import Iterable, Optional, Tuple, Union, List
-import os
-import functools
-import torch
-import transformer_engine_torch as tex
-from transformer_engine.pytorch.constants import TE_DType
-
-from transformer_engine.pytorch.tensor.quantized_tensor import Quantizer
-
-from ..import_utils import have_flag_gems
-
-HAVE_FLAG_GEMS = have_flag_gems()
-if HAVE_FLAG_GEMS:
-    import flag_gems
-
-__all__ = [
-    "general_gemm_fl",
-]
-
-
-def validate_gemm_scale(scale: Optional[float], required: bool) -> float:
-    """Validate whether a GEMM scaling factor is consistent with its usage"""
-    if required:
-        return scale if scale is not None else 1.0
-    if scale not in (0.0, None):
-        raise ValueError("scale must be zero")
-    return 0.0
-
-
-def general_gemm_fl(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    workspace: torch.Tensor,
-    out_dtype: Optional[torch.dtype] = None,
-    quantization_params: Optional[Quantizer] = None,
-    gelu: bool = False,
-    gelu_in: torch.Tensor = None,
-    alpha: float = 1.0,
-    beta: Optional[float] = None,
-    accumulate: bool = False,
-    layout: str = "TN",
-    out: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
-    use_split_accumulator: bool = False,
-    grad: bool = False,
-    ub: Union[tex.CommOverlap, tex.CommOverlapP2P] = None,
-    ub_type: tex.CommOverlapType = None,
-    extra_output: Optional[torch.Tensor] = None,
-    bulk_overlap: bool = False,
-) -> Iterable[Optional[torch.Tensor]]:
-
-    assert HAVE_FLAG_GEMS, "Triton-Based General Gemm needs FlagGems"
-    assert not gelu and gelu_in is None, "Triton-Based General Gemm do not support gelu now"
-    assert ub is None and ub_type is None, "Triton-Based General Gemm do not support ub comm in kernels"
-    assert quantization_params is None, "Triton-Based General Gemm do not support quantization now"
-    assert bias is None, "Triton-Based General Gemm do not support bias now"
-    assert layout in ("TN", "NN", "NT"), f"GEMM layout {layout} not supported."
-    assert alpha == 1.0 and beta is None, "Triton-Based General Gemm do not support scaling with alpha and beta"
-    if accumulate:
-        assert out is not None, "When accumulate is True, 'out' must be provided"
-
-    transa = layout[0] == "T"
-    transb = layout[1] == "T"
-
-    alpha = validate_gemm_scale(alpha, True)
-    beta = validate_gemm_scale(beta, accumulate)
-
-    if out is not None:
-        if not out.is_contiguous():
-            raise ValueError("Output tensor is not contiguous.")
-
-    # Use bfloat16 as default bias_dtype
-    bias_dtype = TE_DType[torch.bfloat16 if bias is None else bias.dtype]
-
-    s = -1
-    b = -1
-    orig_A_shape = A.shape
-    orig_B_shape = B.shape
-    shape_a_changed = False
-    shape_b_changed = False
-
-    if A.ndim == 3:
-        A = A.view(-1, A.shape[-1])
-        shape_a_changed = True
-
-    if B.ndim == 3:
-        s, b, _ = B.shape
-        B = B.view(-1, B.shape[-1])
-        shape_b_changed = True
-
-    A_comp = A.T if transa else A
-    B_comp = B.T if transb else B
-
-    out1 = flag_gems.mm(B_comp, A_comp)
-
-    if shape_b_changed:
-        out1 = out1.view(s, b, -1)
-
-    if out_dtype is not None and out1.dtype != out_dtype:
-        out1 = out1.to(out_dtype)
-
-    bias_grad = None
-    gelu_input = None
-    extra_output = None
-    if out is not None:
-        out.add_(out1)
-        return out, bias_grad, gelu_input, extra_output
-    else:
-        return out1, bias_grad, gelu_input, extra_output
diff --git a/transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py b/transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py
deleted file mode 100644
index 6373b999a8..0000000000
--- a/transformer_engine/plugins/cpp_extensions/multi_tensor_apply.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-import torch
-from torch.distributed._tensor import DTensor
-
-
-def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *args):
-    """
-    Computes l2 norm for a list of contiguous tensors
-    works as a drop-in replacement for amp_C.multi_tensor_l2norm
-    """
-    l2 = [[(torch.norm(tensor)) for tensor in tensor_list] for tensor_list in tensor_lists]
-    l2_reduced = torch.norm(torch.tensor(l2))
-    l2_cuda = torch.tensor([float(l2_reduced)], dtype=torch.float, device="cuda")
-    return l2_cuda, None
-
-
-def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
-    """Works as a drop-in replacement for amp_C.multi_tensor_scale."""
-    for src, dst in zip(tensor_lists[0], tensor_lists[1]):
-        dst.copy_(src * scale)
diff --git a/transformer_engine/plugins/cpp_extensions/rmsnorm.py b/transformer_engine/plugins/cpp_extensions/rmsnorm.py
deleted file mode 100644
index af8b3bf096..0000000000
--- a/transformer_engine/plugins/cpp_extensions/rmsnorm.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-import os
-import torch
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-from ..import_utils import safety_import, have_flag_gems
-
-### RMSNORM
-HAVE_FLAG_GEMS = have_flag_gems()
-
-if HAVE_FLAG_GEMS:
-    import flag_gems
-
-def rmsnorm_fwd_fl(
-    input,
-    weight,
-    eps,
-    ln_out,
-    quantizer,
-    odtype,
-    sm_margin,
-    zero_centered_gamma,
-):
-    assert HAVE_FLAG_GEMS, "GEMS is not installed"
-    y, rstdevs = flag_gems.rms_norm_forward(
-        input,
-        [input.shape[-1]],
-        weight,
-        eps,
-    )
-    return y, None, rstdevs
-
-
-def rmsnorm_bwd_fl(
-    dy,
-    x,
-    rsigma,
-    gamma,
-    sm_margin,
-    zero_centered_gamma,
-    eps,
-):
-    assert HAVE_FLAG_GEMS, "GEMS is not installed"
-    dx, dw = flag_gems.rms_norm_backward(
-        dy,
-        x,
-        rsigma,
-        [x.shape[-1]],
-        gamma,
-        eps,
-    )
-    return dx, dw
diff --git a/transformer_engine/plugins/import_utils.py b/transformer_engine/plugins/import_utils.py
deleted file mode 100644
index 76a8dd8846..0000000000
--- a/transformer_engine/plugins/import_utils.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-import importlib
-from typing import Any, Optional
-
-from .logger import get_logger
-
-logger = get_logger()
-
-# Safety import cache to avoid circular imports and improve performance
-_import_cache: dict[str, Any] = {}
-
-# Cache for HAVE_FLAG_GEMS check to avoid repeated imports
-_HAVE_FLAG_GEMS_CACHE: Optional[bool] = None
-
-
-class _LazyImport:
-    """Lazy import proxy that defers actual import until first use."""
-    
-    def __init__(self, module_path: str, name: Optional[str] = None):
-        self._module_path = module_path
-        self._name = name
-        self._cache_key = f"{module_path}.{name}" if name else module_path
-        self._imported = None
-    
-    def _import(self):
-        """Perform the actual import."""
-        if self._imported is None:
-            if self._cache_key in _import_cache:
-                self._imported = _import_cache[self._cache_key]
-            else:
-                module = importlib.import_module(self._module_path)
-                if self._name:
-                    self._imported = getattr(module, self._name)
-                else:
-                    self._imported = module
-                _import_cache[self._cache_key] = self._imported
-        return self._imported
-    
-    def __getattr__(self, name: str) -> Any:
-        """Delegate attribute access to the imported object."""
-        return getattr(self._import(), name)
-    
-    def __call__(self, *args, **kwargs) -> Any:
-        """Allow calling if the imported object is callable."""
-        return self._import()(*args, **kwargs)
-    
-    def __repr__(self) -> str:
-        """String representation."""
-        if self._imported is None:
-            return f"<LazyImport: {self._cache_key} (not loaded)>"
-        return repr(self._imported)
-
-
-def safety_import(module_path: str, name: Optional[str] = None, lazy: bool = False) -> Any:
-    """
-    Safely import a module or attribute with lazy loading and caching.
-    
-    This function helps avoid circular imports by deferring imports until
-    they are actually needed, and caches the result for performance.
-    
-    Args:
-        module_path: Full module path
-        name: Optional attribute name to import from the module (e.g., 'FLAttention')
-              If None, returns the module itself.
-        lazy: If True, returns a lazy proxy that defers import until first use.
-              If False (default), imports immediately but caches the result.
-              Use lazy=True when there's a risk of circular imports.
-    
-    Returns:
-        The imported module or attribute (or a lazy proxy if lazy=True).
-    """
-    cache_key = f"{module_path}.{name}" if name else module_path
-    
-    if lazy:
-        # Return lazy proxy that defers import
-        return _LazyImport(module_path, name)
-    
-    # Immediate import with caching
-    if cache_key not in _import_cache:
-        module = importlib.import_module(module_path)
-        if name:
-            _import_cache[cache_key] = getattr(module, name)
-        else:
-            _import_cache[cache_key] = module
-    
-    return _import_cache[cache_key]
-
-
-def have_flag_gems() -> bool:
-    """
-    Check if flag_gems is installed and available.
-    
-    This function caches the result to avoid repeated import attempts.
-    On first check, logs whether flag_gems is available.
-    
-    Returns:
-        True if flag_gems is available, False otherwise.
-    """
-    global _HAVE_FLAG_GEMS_CACHE
-    
-    if _HAVE_FLAG_GEMS_CACHE is None:
-        try:
-            import flag_gems
-            _HAVE_FLAG_GEMS_CACHE = True
-            logger.info("flag_gems is available. FL backend implementations can be used.")
-        except ImportError:
-            _HAVE_FLAG_GEMS_CACHE = False
-            logger.info("flag_gems is not installed. Only native backend implementations will be used.")
-    
-    return _HAVE_FLAG_GEMS_CACHE
diff --git a/transformer_engine/plugins/logger.py b/transformer_engine/plugins/logger.py
deleted file mode 100644
index 83a577024f..0000000000
--- a/transformer_engine/plugins/logger.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import logging
-import sys
-import os
-
-
-class Logger:
-    def __init__(self, name, level=logging.INFO):
-        self.logger = logging.getLogger(name)
-        self.logger.setLevel(level)
-        self.logger.propagate = False
-
-        # Clear existing handlers
-        for handler in self.logger.handlers[:]:
-            self.logger.removeHandler(handler)
-
-        formatter = logging.Formatter(
-            "[%(asctime)s %(name)s %(filename)s:%(lineno)d %(levelname)s] %(message)s"
-        )
-
-        stream_handler = logging.StreamHandler(sys.stdout)
-        stream_handler.setFormatter(formatter)
-
-        self.logger.addHandler(stream_handler)
-
-    def info(self, message):
-        self.logger.info(message)
-
-    def warning(self, message):
-        self.logger.warning(message)
-
-    def error(self, message):
-        self.logger.error(message)
-
-    def critical(self, message):
-        self.logger.critical(message)
-
-    def debug(self, message):
-        self.logger.debug(message)
-
-
-GLOBAL_LOGGER = None
-
-
-def get_logger():
-    global GLOBAL_LOGGER
-    if GLOBAL_LOGGER is None:
-        level = os.getenv("TEFL_LOG_LEVEL", "INFO").upper()
-        GLOBAL_LOGGER = Logger("TE-FL", level)
-    return GLOBAL_LOGGER
diff --git a/transformer_engine/plugins/module/_common.py b/transformer_engine/plugins/module/_common.py
deleted file mode 100644
index ac2cbfdf9b..0000000000
--- a/transformer_engine/plugins/module/_common.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-import os
-import torch
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-from ..import_utils import safety_import
-
-### RMSNORM
-rmsnorm_fwd_fl = safety_import('transformer_engine.plugins.cpp_extensions', 'rmsnorm_fwd_fl')
-
-def apply_normalization_fl(
-    inputmat: torch.Tensor,
-    ln_out: torch.Tensor,
-    ln_weight: torch.Tensor,
-    ln_bias: Union[torch.Tensor, None],
-    eps: float,
-    output_quantizer,
-    output_dtype,
-    normalization: str,
-    fwd_ln_sm_margin: int,
-    zero_centered_gamma: bool,
-):
-    assert normalization == "RMSNorm", "Triton-based LayerNorm is not supported in TE-FL"
-    assert ln_bias is None, "Triton-Based RMSNorm do not support bias"
-    normalization_func = rmsnorm_fwd_fl
-    return normalization_func(
-        inputmat,
-        ln_weight,
-        eps,
-        ln_out,
-        output_quantizer,
-        output_dtype,
-        fwd_ln_sm_margin,
-        zero_centered_gamma,
-    )
diff --git a/transformer_engine/plugins/register.py b/transformer_engine/plugins/register.py
deleted file mode 100644
index b92e8617ee..0000000000
--- a/transformer_engine/plugins/register.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (c) 2025, BAAI. All rights reserved.
-#
-# See LICENSE for license information.
-
-"""Backend registry for managing multiple backend implementations."""
-import os
-from typing import Any, Dict, Optional
-
-from .logger import get_logger
-logger = get_logger()
-
-
-class Backend:
-    """
-    A backend that can register and provide implementations for various operations.
-    
-    Each backend can register its own implementations for operations like gemm,
-    rmsnorm_fwd, etc. If an operation is not registered, it will fallback to
-    the native backend.
-    
-    Usage:
-        backend = Backend("my_backend")
-        backend.register("gemm", my_gemm_function)
-        backend.register("rmsnorm_fwd", my_rmsnorm_fwd)
-        
-        # Use the backend
-        result = backend.gemm(...)
-    """
-    
-    def __init__(self, name: str):
-        """
-        Initialize a backend.
-        
-        Args:
-            name: Name of the backend (e.g., "native", "te_fl", "custom")
-        """
-        self.name = name
-        self._implementations: Dict[str, Any] = {}
-    
-    def register(self, operation: str, implementation: Any) -> None:
-        """
-        Register an implementation for an operation.
-        
-        Args:
-            operation: Name of the operation (e.g., "gemm", "rmsnorm_fwd")
-            implementation: Function or class to register
-        """
-        self._implementations[operation] = implementation
-        logger.info(f"Backend '{self.name}' registered implementation for '{operation}'")
-    
-    def has(self, operation: str) -> bool:
-        """Check if this backend has an implementation for the operation."""
-        return operation in self._implementations
-    
-    def get(self, operation: str, default: Optional[Any] = None) -> Optional[Any]:
-        """Get the implementation for an operation, or return default if not found."""
-        return self._implementations.get(operation, default)
-    
-    def __getattr__(self, operation: str) -> Any:
-        """
-        Allow accessing operations as attributes (e.g., backend.gemm).
-        Returns the registered implementation if available.
-        """
-        if operation.startswith("_") or operation in ("name", "register", "has", "get"):
-            return super().__getattribute__(operation)
-        
-        if operation in self._implementations:
-            return self._implementations[operation]
-        
-        raise AttributeError(
-            f"Backend '{self.name}' does not have implementation for '{operation}'. "
-            f"Available operations: {list(self._implementations.keys())}"
-        )
-
-
-def get_selected_backend() -> Backend:
-    """
-    Get the selected backend instance based on global environment variable.
-    No longer depends on operation-specific flags.
-    
-    Returns:
-        Backend instance to use
-    """
-    global_flag = os.environ.get("USE_TRANSFORMER_ENGINE_FL", "0")
-    if global_flag.lower() in ("1", "true", "yes", "on"):
-        backend_name = "te_fl"
-    else:
-        backend_name = "native"
-    return get_backend(backend_name)
-
-
-# Global backends registry
-_backends: Dict[str, Backend] = {}
-
-
-def get_backend(name: str) -> Backend:
-    """
-    Get a backend by name. Creates it if it doesn't exist.
-    
-    Args:
-        name: Name of the backend
-    
-    Returns:
-        Backend instance
-    """
-    if name not in _backends:
-        _backends[name] = Backend(name)
-    return _backends[name]
-
-
-def register_backend(backend_name: str, implementations: Dict[str, Any]):
-    """
-    Register backend implementations.
-    
-    Args:
-        backend_name: Name of the backend (e.g., "native", "te_fl", "custom")
-        implementations: Dictionary mapping operation names to their implementations.
-                        Example: {"gemm": native_gemm, "flash_attention": native_flash_attn}
-    
-    Usage:
-        # Register native backend
-        register_backend("native", {
-            "gemm": gemm_native,
-            "rmsnorm_fwd": rmsnorm_fwd_native,
-            "flash_attention": flash_attn_native,
-        })
-        
-        # Register TE-FL backend
-        register_backend("te_fl", {
-            "gemm": gemm_fl,
-            "rmsnorm_fwd": rmsnorm_fwd_fl,
-            "flash_attention": flash_attn_fl,
-        })
-        
-        # Register custom backend
-        register_backend("custom", {
-            "gemm": custom_gemm,
-            "custom_op": custom_function,
-        })
-    """
-    backend = get_backend(backend_name)
-    
-    for operation, implementation in implementations.items():
-        backend.register(operation, implementation)
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index 2d3fea8754..98b26ba81b 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -58,10 +58,13 @@
 from transformer_engine.pytorch.attention.dot_product_attention.backends import (
     UnfusedDotProductAttention,
     FusedAttention,
-    FlashAttention,
+    FlashAttention
 )
 
-from transformer_engine.plugins.backend import backend
+# Save reference to native FlashAttention for fallback
+_FlashAttentionNative = FlashAttention
+# Use plugin system's flash_attention if available, otherwise use native
+FlashAttention = getattr(tex, 'flash_attention', _FlashAttentionNative)
 
 # Setup Attention Logging
 attn_log.setup_logging()
@@ -1390,8 +1393,7 @@ def forward(
                         max_seqlen_kv,
                         alibi_slopes=alibi_slopes,
                     )
-                return backend.flash_attention(
-                    self.flash_attention,
+                return self.flash_attention(
                     query_layer,
                     key_layer,
                     value_layer,
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
index c660f422ad..6c0f969e47 100644
--- a/transformer_engine/pytorch/module/layernorm_linear.py
+++ b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -75,7 +75,6 @@
     general_gemm,
 )
 
-from transformer_engine.plugins.backend import backend
 
 __all__ = ["LayerNormLinear"]
 
@@ -207,7 +206,7 @@ def forward(
 
         # Apply normalization
         nvtx_range_push(f"{nvtx_label}.norm")
-        ln_out, mu, rsigma = backend.apply_normalization(
+        ln_out, mu, rsigma = apply_normalization(
             inputmat,
             None,  # ln_out
             ln_weight,
@@ -343,7 +342,7 @@ def forward(
         # Note: y = x * w^T
         # ------------------------------------------------------
         nvtx_range_push(f"{nvtx_label}.gemm")
-        gemm_out, *_, reduce_scatter_out = backend.gemm(
+        gemm_out, *_, reduce_scatter_out = general_gemm(
             weightmat,
             ln_out_total,
             get_workspace(),
@@ -717,7 +716,7 @@ def backward(
             # dgrad GEMM
             # Note: dx = dy * w
             nvtx_range_push(f"{nvtx_label}.dgrad_gemm")
-            gemm_out, *_, reduce_scatter_out = backend.gemm(
+            gemm_out, *_, reduce_scatter_out = general_gemm(
                 weight,
                 grad_output,
                 get_workspace(),
@@ -881,7 +880,7 @@ def wgrad_gemm(
 
                     """
                     nvtx_range_push(f"{nvtx_label}.wgrad_gemm")
-                    dw, db, *_ = backend.gemm(x, dy, **wgrad_gemm_kwargs)
+                    dw, db, *_ = general_gemm(x, dy, **wgrad_gemm_kwargs)
                     nvtx_range_pop(f"{nvtx_label}.wgrad_gemm")
                     return dw, db
 
@@ -966,7 +965,7 @@ def wgrad_gemm(
                 )
                 dgrad = dgrad.reshape(inputmat.size())
             elif ctx.normalization == "RMSNorm":
-                dgrad, dgamma = backend.rmsnorm_bwd(
+                dgrad, dgamma = tex.rmsnorm_bwd(
                     dgrad,
                     inputmat,
                     rsigma,
diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index 0b715c7a72..42f29d06ee 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -71,7 +71,6 @@
 from ..cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...debug.pytorch.debug_state import TEDebugState
 
-from transformer_engine.plugins.backend import backend
 
 __all__ = ["Linear"]
 
@@ -308,7 +307,7 @@ def forward(
         # Note: y = x * w^T
         # ------------------------------------------------------
         nvtx_range_push(f"{nvtx_label}.gemm")
-        gemm_out, *_, reduce_scatter_out = backend.gemm(
+        gemm_out, *_, reduce_scatter_out = general_gemm(
             weightmat,
             inputmat_total,
             get_workspace(),
@@ -711,7 +710,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
                 # Note: dx = dy * w
 
                 nvtx_range_push(f"{nvtx_label}.dgrad_gemm")
-                gemm_out, *_, reduce_scatter_out = backend.gemm(
+                gemm_out, *_, reduce_scatter_out = general_gemm(
                     weight_fp8,
                     grad_output,
                     get_workspace(),
@@ -874,7 +873,7 @@ def wgrad_gemm(
 
                     """
                     nvtx_range_push(f"{nvtx_label}.wgrad_gemm")
-                    dw, db, *_ = backend.gemm(x, dy, **wgrad_gemm_kwargs)
+                    dw, db, *_ = general_gemm(x, dy, **wgrad_gemm_kwargs)
                     nvtx_range_pop(f"{nvtx_label}.wgrad_gemm")
                     return dw, db
 
diff --git a/transformer_engine/pytorch/ops/basic/rmsnorm.py b/transformer_engine/pytorch/ops/basic/rmsnorm.py
index 5054b5ea8c..28126fd44f 100644
--- a/transformer_engine/pytorch/ops/basic/rmsnorm.py
+++ b/transformer_engine/pytorch/ops/basic/rmsnorm.py
@@ -26,7 +26,6 @@
 from ..op import BasicOperation, OperationContext
 from .._common import maybe_autocast_dtype, maybe_dequantize
 
-from transformer_engine.plugins.backend import backend
 
 
 class RMSNorm(BasicOperation):
@@ -186,7 +185,7 @@ def op_forward(
 
         # Compute RMSNorm
         sm_margin = self._sm_margins["forward" if ctx.requires_grad else "inference"]
-        y, _, rstdevs = backend.rmsnorm_fwd(
+        y, _, rstdevs = rmsnorm_fwd(
             x,
             w,
             self.eps,
@@ -226,7 +225,7 @@ def op_backward(
         dy = maybe_dequantize(grad_output.contiguous(), dtype).view(x.size())
         w = maybe_dequantize(self.weight, dtype).view((inner_dim,))
 
-        dx, dw = backend.rmsnorm_bwd(
+        dx, dw = rmsnorm_bwd(
             dy,
             x,
             rstdevs,
diff --git a/transformer_engine/pytorch/optimizers/__init__.py b/transformer_engine/pytorch/optimizers/__init__.py
index 6d44a8a6e5..a19c797dea 100644
--- a/transformer_engine/pytorch/optimizers/__init__.py
+++ b/transformer_engine/pytorch/optimizers/__init__.py
@@ -14,6 +14,3 @@
 from .fused_adam import FusedAdam
 from .fused_sgd import FusedSGD
 from .multi_tensor_apply import MultiTensorApply, multi_tensor_applier
-
-from transformer_engine.plugins.cpp_extensions import multi_tensor_l2_norm_fl as multi_tensor_l2norm
-from transformer_engine.plugins.cpp_extensions import multi_tensor_scale_fl as multi_tensor_scale
diff --git a/transformer_engine/pytorch/optimizers/fused_adam.py b/transformer_engine/pytorch/optimizers/fused_adam.py
index 10fd480476..b2ddd0adf8 100644
--- a/transformer_engine/pytorch/optimizers/fused_adam.py
+++ b/transformer_engine/pytorch/optimizers/fused_adam.py
@@ -15,7 +15,6 @@
 from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer
 from .multi_tensor_apply import multi_tensor_applier
 
-from transformer_engine.plugins.backend import backend
 
 def get_fp8_meta(fp8_tensor):
     """FP8 metadata getter."""
@@ -712,7 +711,7 @@ def apply_multi_tensor_adam(adam_func, tensor_lists, inv_scale=None, out_dtype=N
                             self.multi_tensor_adam_param_remainder, tensor_lists
                         )
                     else:
-                        apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
+                        apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
                 if len(p_fp8_model) > 0:
                     tensor_lists = [
                         g_of_fp8_model,
@@ -732,14 +731,14 @@ def apply_multi_tensor_adam(adam_func, tensor_lists, inv_scale=None, out_dtype=N
                         m_of_f32_model,
                         v_of_f32_model,
                     ]
-                    apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
+                    apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
             else:  # self.master_weights=False and self.capturable=False
                 if len(p_f16_model) > 0:
                     tensor_lists = [g_of_f16_model, p_f16_model, m_of_f16_model, v_of_f16_model]
-                    apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
+                    apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
                 if len(p_f32_model) > 0:
                     tensor_lists = [g_of_f32_model, p_f32_model, m_of_f32_model, v_of_f32_model]
-                    apply_multi_tensor_adam(backend.multi_tensor_adam(), tensor_lists)
+                    apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
 
             # Scaling
             for name in ["exp_avg", "exp_avg_sq", "master_param"]:
diff --git a/transformer_engine/pytorch/setup.py b/transformer_engine/pytorch/setup.py
index 7a81550047..9ea45f3fad 100644
--- a/transformer_engine/pytorch/setup.py
+++ b/transformer_engine/pytorch/setup.py
@@ -24,7 +24,6 @@
 
 FORCE_BUILD = os.getenv("NVTE_PYTORCH_FORCE_BUILD", "FALSE") == "TRUE"
 FORCE_CXX11_ABI = os.getenv("NVTE_PYTORCH_FORCE_CXX11_ABI", "FALSE") == "TRUE"
-SKIP_CUDA_BUILD = os.getenv("NVTE_PYTORCH_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
 PACKAGE_NAME = "transformer_engine_torch"
 BASE_WHEEL_URL = (
     "https://github.com/NVIDIA/TransformerEngine/releases/download/{tag_name}/{wheel_name}"

From 57adff459eaa88c1f11f9e058816ebe9983de72c Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Sun, 4 Jan 2026 15:26:17 +0800
Subject: [PATCH 17/59] Add missing __init__.py files and policy test suite
 (#9)

# Description

- Add missing __init__.py files to
transformer_engine/plugin/core/backends/flagos/attention/ directory tree
to fix import errors when accessing these modules as Python packages
- Add comprehensive test suite (test_policy.py) covering the TE-FL
scheduling policy system including:
SelectionPolicy creation and configuration
Environment variable parsing (TE_FL_PREFER, TE_FL_STRICT, etc.)
Policy context managers
Vendor filtering (allow/deny)
Thread safety validation
Minor code style improvements

Fixes # (issue)

## Type of change

- [ ] Documentation change (change only to the documentation, either a
fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Infra/Build change
- [ ] Code refactoring

## Changes

Please list the changes introduced in this PR:

- Change A
- Change B

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes
---
 transformer_engine/__init__.py                |   1 -
 .../backends/flagos/attention/__init__.py     |   3 +
 .../dot_product_attention/__init__.py         |   3 +
 .../plugin/tests/test_policy.py               | 726 ++++++++++++++++++
 .../dot_product_attention.py                  |   2 +-
 5 files changed, 733 insertions(+), 2 deletions(-)
 create mode 100644 transformer_engine/plugin/core/backends/flagos/attention/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py
 create mode 100644 transformer_engine/plugin/tests/test_policy.py

diff --git a/transformer_engine/__init__.py b/transformer_engine/__init__.py
index c9cbe3b257..e51f03e3d8 100644
--- a/transformer_engine/__init__.py
+++ b/transformer_engine/__init__.py
@@ -8,7 +8,6 @@
 
 import os
 from importlib import metadata
-
 import transformer_engine.common
 
 try:
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/__init__.py b/transformer_engine/plugin/core/backends/flagos/attention/__init__.py
new file mode 100644
index 0000000000..88988bab64
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/attention/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py
new file mode 100644
index 0000000000..88988bab64
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
\ No newline at end of file
diff --git a/transformer_engine/plugin/tests/test_policy.py b/transformer_engine/plugin/tests/test_policy.py
new file mode 100644
index 0000000000..f56f5f2833
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_policy.py
@@ -0,0 +1,726 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Test suite for TE-FL scheduling policy system.
+
+This module tests:
+1. SelectionPolicy creation and configuration
+2. Environment variable parsing
+3. Policy context managers
+4. Vendor filtering (allow/deny)
+5. Per-operator custom ordering
+6. PolicyManager singleton and thread safety
+7. Integration with OpManager
+"""
+
+import os
+import sys
+import threading
+import unittest
+from unittest.mock import patch
+from typing import List, Dict
+
+
+class TestSelectionPolicy(unittest.TestCase):
+    """Test SelectionPolicy dataclass and methods"""
+
+    def setUp(self):
+        """Import policy module fresh for each test"""
+        from transformer_engine.plugin.core.policy import (
+            SelectionPolicy,
+            PREFER_DEFAULT,
+            PREFER_VENDOR,
+            PREFER_REFERENCE,
+        )
+        self.SelectionPolicy = SelectionPolicy
+        self.PREFER_DEFAULT = PREFER_DEFAULT
+        self.PREFER_VENDOR = PREFER_VENDOR
+        self.PREFER_REFERENCE = PREFER_REFERENCE
+
+    def test_default_policy_creation(self):
+        """Test creating policy with default values"""
+        policy = self.SelectionPolicy.from_dict()
+
+        self.assertEqual(policy.prefer, self.PREFER_DEFAULT)
+        self.assertFalse(policy.strict)
+        self.assertEqual(policy.per_op_order, ())
+        self.assertEqual(policy.deny_vendors, frozenset())
+        self.assertIsNone(policy.allow_vendors)
+        print("    [PASS] Default policy creation")
+
+    def test_policy_with_prefer_vendor(self):
+        """Test creating policy with vendor preference"""
+        policy = self.SelectionPolicy.from_dict(prefer="vendor")
+
+        self.assertEqual(policy.prefer, "vendor")
+        self.assertEqual(policy.get_default_order(), ["vendor", "flagos", "reference"])
+        print("    [PASS] Policy with vendor preference")
+
+    def test_policy_with_prefer_reference(self):
+        """Test creating policy with reference preference"""
+        policy = self.SelectionPolicy.from_dict(prefer="reference")
+
+        self.assertEqual(policy.prefer, "reference")
+        self.assertEqual(policy.get_default_order(), ["reference", "flagos", "vendor"])
+        print("    [PASS] Policy with reference preference")
+
+    def test_policy_with_prefer_flagos(self):
+        """Test creating policy with flagos preference (default)"""
+        policy = self.SelectionPolicy.from_dict(prefer="flagos")
+
+        self.assertEqual(policy.prefer, "flagos")
+        self.assertEqual(policy.get_default_order(), ["flagos", "vendor", "reference"])
+        print("    [PASS] Policy with flagos preference")
+
+    def test_invalid_prefer_value(self):
+        """Test that invalid prefer value raises error"""
+        with self.assertRaises(ValueError) as context:
+            self.SelectionPolicy.from_dict(prefer="invalid")
+
+        self.assertIn("Invalid prefer value", str(context.exception))
+        print("    [PASS] Invalid prefer value raises error")
+
+    def test_strict_mode(self):
+        """Test strict mode setting"""
+        policy = self.SelectionPolicy.from_dict(strict=True)
+
+        self.assertTrue(policy.strict)
+        print("    [PASS] Strict mode setting")
+
+    def test_deny_vendors(self):
+        """Test deny vendors configuration"""
+        policy = self.SelectionPolicy.from_dict(deny_vendors={"rocm", "dcu"})
+
+        self.assertEqual(policy.deny_vendors, frozenset({"rocm", "dcu"}))
+        self.assertFalse(policy.is_vendor_allowed("rocm"))
+        self.assertFalse(policy.is_vendor_allowed("dcu"))
+        self.assertTrue(policy.is_vendor_allowed("cuda"))
+        print("    [PASS] Deny vendors configuration")
+
+    def test_allow_vendors(self):
+        """Test allow vendors whitelist"""
+        policy = self.SelectionPolicy.from_dict(allow_vendors={"cuda"})
+
+        self.assertEqual(policy.allow_vendors, frozenset({"cuda"}))
+        self.assertTrue(policy.is_vendor_allowed("cuda"))
+        self.assertFalse(policy.is_vendor_allowed("rocm"))
+        print("    [PASS] Allow vendors whitelist")
+
+    def test_deny_overrides_allow(self):
+        """Test that deny takes precedence over allow"""
+        policy = self.SelectionPolicy.from_dict(
+            allow_vendors={"cuda", "rocm"},
+            deny_vendors={"rocm"},
+        )
+
+        self.assertTrue(policy.is_vendor_allowed("cuda"))
+        self.assertFalse(policy.is_vendor_allowed("rocm"))
+        print("    [PASS] Deny overrides allow")
+
+    def test_per_op_order(self):
+        """Test per-operator custom ordering"""
+        policy = self.SelectionPolicy.from_dict(
+            per_op_order={
+                "layernorm_fwd": ["vendor", "flagos"],
+                "rmsnorm_fwd": ["flagos", "reference"],
+            }
+        )
+
+        self.assertEqual(policy.get_per_op_order("layernorm_fwd"), ["vendor", "flagos"])
+        self.assertEqual(policy.get_per_op_order("rmsnorm_fwd"), ["flagos", "reference"])
+        self.assertIsNone(policy.get_per_op_order("unknown_op"))
+        print("    [PASS] Per-operator custom ordering")
+
+    def test_policy_fingerprint(self):
+        """Test policy fingerprint generation"""
+        policy1 = self.SelectionPolicy.from_dict(prefer="vendor", strict=True)
+        policy2 = self.SelectionPolicy.from_dict(prefer="vendor", strict=True)
+        policy3 = self.SelectionPolicy.from_dict(prefer="flagos", strict=True)
+
+        self.assertEqual(policy1.fingerprint(), policy2.fingerprint())
+        self.assertNotEqual(policy1.fingerprint(), policy3.fingerprint())
+        print("    [PASS] Policy fingerprint generation")
+
+    def test_policy_immutability(self):
+        """Test that SelectionPolicy is immutable (frozen dataclass)"""
+        policy = self.SelectionPolicy.from_dict(prefer="vendor")
+
+        with self.assertRaises(AttributeError):
+            policy.prefer = "flagos"  # Should fail - frozen dataclass
+        print("    [PASS] Policy immutability")
+
+    def test_policy_hashable(self):
+        """Test that SelectionPolicy is hashable (can be used in sets/dicts)"""
+        policy1 = self.SelectionPolicy.from_dict(prefer="vendor")
+        policy2 = self.SelectionPolicy.from_dict(prefer="vendor")
+
+        policy_set = {policy1, policy2}
+        self.assertEqual(len(policy_set), 1)  # Same policy, should dedupe
+        print("    [PASS] Policy hashable")
+
+
+class TestPolicyManager(unittest.TestCase):
+    """Test PolicyManager singleton and state management"""
+
+    def setUp(self):
+        """Reset policy manager state before each test"""
+        from transformer_engine.plugin.core.policy import (
+            PolicyManager,
+            reset_global_policy,
+        )
+        reset_global_policy()
+        self.PolicyManager = PolicyManager
+
+    def tearDown(self):
+        """Clean up after each test"""
+        from transformer_engine.plugin.core.policy import reset_global_policy
+        reset_global_policy()
+        # Clear any test environment variables
+        for key in ["TE_FL_PREFER", "TE_FL_PREFER_VENDOR", "TE_FL_STRICT",
+                    "TE_FL_DENY_VENDORS", "TE_FL_ALLOW_VENDORS", "TE_FL_PER_OP"]:
+            os.environ.pop(key, None)
+
+    def test_singleton_pattern(self):
+        """Test PolicyManager is a singleton"""
+        manager1 = self.PolicyManager.get_instance()
+        manager2 = self.PolicyManager.get_instance()
+
+        self.assertIs(manager1, manager2)
+        print("    [PASS] PolicyManager singleton pattern")
+
+    def test_policy_epoch(self):
+        """Test policy epoch tracking"""
+        from transformer_engine.plugin.core.policy import (
+            get_policy_epoch,
+            bump_policy_epoch,
+        )
+
+        initial_epoch = get_policy_epoch()
+        new_epoch = bump_policy_epoch()
+
+        self.assertEqual(new_epoch, initial_epoch + 1)
+        self.assertEqual(get_policy_epoch(), new_epoch)
+        print("    [PASS] Policy epoch tracking")
+
+    def test_global_policy_set_and_get(self):
+        """Test setting and getting global policy"""
+        from transformer_engine.plugin.core.policy import (
+            SelectionPolicy,
+            set_global_policy,
+            get_policy,
+        )
+
+        custom_policy = SelectionPolicy.from_dict(prefer="vendor", strict=True)
+        old_policy = set_global_policy(custom_policy)
+
+        current = get_policy()
+        self.assertEqual(current.prefer, "vendor")
+        self.assertTrue(current.strict)
+        print("    [PASS] Global policy set and get")
+
+    def test_reset_global_policy(self):
+        """Test resetting global policy to env defaults"""
+        from transformer_engine.plugin.core.policy import (
+            SelectionPolicy,
+            set_global_policy,
+            reset_global_policy,
+            get_policy,
+        )
+
+        # Set custom policy
+        custom_policy = SelectionPolicy.from_dict(prefer="vendor")
+        set_global_policy(custom_policy)
+
+        # Reset to defaults
+        reset_global_policy()
+
+        current = get_policy()
+        self.assertEqual(current.prefer, "flagos")  # Default
+        print("    [PASS] Reset global policy")
+
+
+class TestEnvironmentVariables(unittest.TestCase):
+    """Test environment variable parsing"""
+
+    def setUp(self):
+        """Clear environment and reset policy"""
+        from transformer_engine.plugin.core.policy import reset_global_policy
+        reset_global_policy()
+        # Clear all test env vars
+        for key in ["TE_FL_PREFER", "TE_FL_PREFER_VENDOR", "TE_FL_STRICT",
+                    "TE_FL_DENY_VENDORS", "TE_FL_ALLOW_VENDORS", "TE_FL_PER_OP"]:
+            os.environ.pop(key, None)
+
+    def tearDown(self):
+        """Clean up environment"""
+        for key in ["TE_FL_PREFER", "TE_FL_PREFER_VENDOR", "TE_FL_STRICT",
+                    "TE_FL_DENY_VENDORS", "TE_FL_ALLOW_VENDORS", "TE_FL_PER_OP"]:
+            os.environ.pop(key, None)
+        from transformer_engine.plugin.core.policy import reset_global_policy
+        reset_global_policy()
+
+    def test_te_fl_prefer_flagos(self):
+        """Test TE_FL_PREFER=flagos"""
+        os.environ["TE_FL_PREFER"] = "flagos"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.prefer, "flagos")
+        print("    [PASS] TE_FL_PREFER=flagos")
+
+    def test_te_fl_prefer_vendor(self):
+        """Test TE_FL_PREFER=vendor"""
+        os.environ["TE_FL_PREFER"] = "vendor"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.prefer, "vendor")
+        print("    [PASS] TE_FL_PREFER=vendor")
+
+    def test_te_fl_prefer_reference(self):
+        """Test TE_FL_PREFER=reference"""
+        os.environ["TE_FL_PREFER"] = "reference"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.prefer, "reference")
+        print("    [PASS] TE_FL_PREFER=reference")
+
+    def test_te_fl_prefer_vendor_legacy(self):
+        """Test legacy TE_FL_PREFER_VENDOR=1"""
+        os.environ["TE_FL_PREFER_VENDOR"] = "1"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.prefer, "vendor")
+        print("    [PASS] TE_FL_PREFER_VENDOR=1 (legacy)")
+
+    def test_te_fl_prefer_overrides_legacy(self):
+        """Test that TE_FL_PREFER takes precedence over TE_FL_PREFER_VENDOR"""
+        os.environ["TE_FL_PREFER"] = "reference"
+        os.environ["TE_FL_PREFER_VENDOR"] = "1"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.prefer, "reference")  # TE_FL_PREFER wins
+        print("    [PASS] TE_FL_PREFER overrides TE_FL_PREFER_VENDOR")
+
+    def test_te_fl_strict(self):
+        """Test TE_FL_STRICT=1"""
+        os.environ["TE_FL_STRICT"] = "1"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertTrue(policy.strict)
+        print("    [PASS] TE_FL_STRICT=1")
+
+    def test_te_fl_deny_vendors(self):
+        """Test TE_FL_DENY_VENDORS parsing"""
+        os.environ["TE_FL_DENY_VENDORS"] = "rocm,dcu,intel"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.deny_vendors, frozenset({"rocm", "dcu", "intel"}))
+        print("    [PASS] TE_FL_DENY_VENDORS parsing")
+
+    def test_te_fl_allow_vendors(self):
+        """Test TE_FL_ALLOW_VENDORS parsing"""
+        os.environ["TE_FL_ALLOW_VENDORS"] = "cuda,rocm"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.allow_vendors, frozenset({"cuda", "rocm"}))
+        print("    [PASS] TE_FL_ALLOW_VENDORS parsing")
+
+    def test_te_fl_per_op(self):
+        """Test TE_FL_PER_OP parsing"""
+        os.environ["TE_FL_PER_OP"] = "layernorm_fwd=vendor|flagos;rmsnorm_fwd=flagos|reference"
+
+        from transformer_engine.plugin.core.policy import policy_from_env
+        policy = policy_from_env()
+
+        self.assertEqual(policy.get_per_op_order("layernorm_fwd"), ["vendor", "flagos"])
+        self.assertEqual(policy.get_per_op_order("rmsnorm_fwd"), ["flagos", "reference"])
+        print("    [PASS] TE_FL_PER_OP parsing")
+
+
+class TestContextManagers(unittest.TestCase):
+    """Test policy context managers"""
+
+    def setUp(self):
+        """Reset policy before each test"""
+        from transformer_engine.plugin.core.policy import reset_global_policy
+        reset_global_policy()
+
+    def tearDown(self):
+        """Clean up after test"""
+        from transformer_engine.plugin.core.policy import reset_global_policy
+        reset_global_policy()
+
+    def test_policy_context(self):
+        """Test basic policy_context usage"""
+        from transformer_engine.plugin.core.policy import (
+            SelectionPolicy,
+            policy_context,
+            get_policy,
+        )
+
+        original = get_policy()
+        custom = SelectionPolicy.from_dict(prefer="vendor", strict=True)
+
+        with policy_context(custom):
+            inside = get_policy()
+            self.assertEqual(inside.prefer, "vendor")
+            self.assertTrue(inside.strict)
+
+        after = get_policy()
+        self.assertEqual(after.prefer, original.prefer)
+        print("    [PASS] policy_context usage")
+
+    def test_with_preference(self):
+        """Test with_preference context manager"""
+        from transformer_engine.plugin.core.policy import (
+            with_preference,
+            get_policy,
+        )
+
+        original = get_policy()
+
+        with with_preference("vendor"):
+            self.assertEqual(get_policy().prefer, "vendor")
+
+        with with_preference("reference"):
+            self.assertEqual(get_policy().prefer, "reference")
+
+        self.assertEqual(get_policy().prefer, original.prefer)
+        print("    [PASS] with_preference context manager")
+
+    def test_with_strict_mode(self):
+        """Test with_strict_mode context manager"""
+        from transformer_engine.plugin.core.policy import (
+            with_strict_mode,
+            get_policy,
+        )
+
+        original = get_policy()
+
+        with with_strict_mode():
+            self.assertTrue(get_policy().strict)
+
+        self.assertEqual(get_policy().strict, original.strict)
+        print("    [PASS] with_strict_mode context manager")
+
+    def test_with_allowed_vendors(self):
+        """Test with_allowed_vendors context manager"""
+        from transformer_engine.plugin.core.policy import (
+            with_allowed_vendors,
+            get_policy,
+        )
+
+        with with_allowed_vendors("cuda", "rocm"):
+            policy = get_policy()
+            self.assertEqual(policy.allow_vendors, frozenset({"cuda", "rocm"}))
+
+        self.assertIsNone(get_policy().allow_vendors)
+        print("    [PASS] with_allowed_vendors context manager")
+
+    def test_with_denied_vendors(self):
+        """Test with_denied_vendors context manager"""
+        from transformer_engine.plugin.core.policy import (
+            with_denied_vendors,
+            get_policy,
+        )
+
+        with with_denied_vendors("rocm", "dcu"):
+            policy = get_policy()
+            self.assertIn("rocm", policy.deny_vendors)
+            self.assertIn("dcu", policy.deny_vendors)
+
+        self.assertEqual(get_policy().deny_vendors, frozenset())
+        print("    [PASS] with_denied_vendors context manager")
+
+    def test_nested_contexts(self):
+        """Test nested context managers"""
+        from transformer_engine.plugin.core.policy import (
+            with_preference,
+            with_strict_mode,
+            get_policy,
+        )
+
+        with with_preference("vendor"):
+            self.assertEqual(get_policy().prefer, "vendor")
+
+            with with_strict_mode():
+                policy = get_policy()
+                # Note: with_strict_mode creates new policy with current prefer
+                self.assertTrue(policy.strict)
+
+            # Back to vendor preference, not strict
+            self.assertEqual(get_policy().prefer, "vendor")
+
+        # Back to default
+        self.assertEqual(get_policy().prefer, "flagos")
+        print("    [PASS] Nested context managers")
+
+
+class TestTokenMatching(unittest.TestCase):
+    """Test token matching for implementation selection"""
+
+    def test_match_flagos_token(self):
+        """Test matching 'flagos' token"""
+        from transformer_engine.plugin.core.types import OpImpl, BackendImplKind, match_token
+
+        impl = OpImpl(
+            op_name="test_op",
+            impl_id="test.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=lambda: None,
+        )
+
+        self.assertTrue(match_token(impl, "flagos"))
+        self.assertFalse(match_token(impl, "vendor"))
+        self.assertFalse(match_token(impl, "reference"))
+        print("    [PASS] Match flagos token")
+
+    def test_match_vendor_token(self):
+        """Test matching 'vendor' token"""
+        from transformer_engine.plugin.core.types import OpImpl, BackendImplKind, match_token
+
+        impl = OpImpl(
+            op_name="test_op",
+            impl_id="test.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=lambda: None,
+            vendor="cuda",
+        )
+
+        self.assertTrue(match_token(impl, "vendor"))
+        self.assertFalse(match_token(impl, "flagos"))
+        print("    [PASS] Match vendor token")
+
+    def test_match_specific_vendor_token(self):
+        """Test matching 'vendor:<name>' token"""
+        from transformer_engine.plugin.core.types import OpImpl, BackendImplKind, match_token
+
+        impl = OpImpl(
+            op_name="test_op",
+            impl_id="test.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=lambda: None,
+            vendor="cuda",
+        )
+
+        self.assertTrue(match_token(impl, "vendor:cuda"))
+        self.assertFalse(match_token(impl, "vendor:rocm"))
+        print("    [PASS] Match specific vendor token")
+
+    def test_match_impl_token(self):
+        """Test matching 'impl:<id>' token"""
+        from transformer_engine.plugin.core.types import OpImpl, BackendImplKind, match_token
+
+        impl = OpImpl(
+            op_name="test_op",
+            impl_id="layernorm_cuda_v2",
+            kind=BackendImplKind.VENDOR,
+            fn=lambda: None,
+            vendor="cuda",
+        )
+
+        self.assertTrue(match_token(impl, "impl:layernorm_cuda_v2"))
+        self.assertFalse(match_token(impl, "impl:other_impl"))
+        print("    [PASS] Match impl token")
+
+    def test_match_reference_token(self):
+        """Test matching 'reference' token"""
+        from transformer_engine.plugin.core.types import OpImpl, BackendImplKind, match_token
+
+        impl = OpImpl(
+            op_name="test_op",
+            impl_id="test.reference",
+            kind=BackendImplKind.REFERENCE,
+            fn=lambda: None,
+        )
+
+        self.assertTrue(match_token(impl, "reference"))
+        self.assertFalse(match_token(impl, "flagos"))
+        self.assertFalse(match_token(impl, "vendor"))
+        print("    [PASS] Match reference token")
+
+
+class TestThreadSafety(unittest.TestCase):
+    """Test thread safety of PolicyManager"""
+
+    def test_concurrent_policy_access(self):
+        """Test concurrent access to policy"""
+        from transformer_engine.plugin.core.policy import (
+            SelectionPolicy,
+            set_global_policy,
+            get_policy,
+            reset_global_policy,
+        )
+
+        reset_global_policy()
+        errors = []
+        results = []
+
+        def worker(prefer_value: str, worker_id: int):
+            try:
+                for _ in range(100):
+                    policy = SelectionPolicy.from_dict(prefer=prefer_value)
+                    set_global_policy(policy)
+                    current = get_policy()
+                    # Policy should be one of the valid values
+                    if current.prefer not in ["flagos", "vendor", "reference"]:
+                        errors.append(f"Worker {worker_id}: Invalid prefer value {current.prefer}")
+                results.append(worker_id)
+            except Exception as e:
+                errors.append(f"Worker {worker_id}: {e}")
+
+        threads = [
+            threading.Thread(target=worker, args=("flagos", 0)),
+            threading.Thread(target=worker, args=("vendor", 1)),
+            threading.Thread(target=worker, args=("reference", 2)),
+        ]
+
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+
+        self.assertEqual(len(errors), 0, f"Errors: {errors}")
+        self.assertEqual(len(results), 3)
+        print("    [PASS] Concurrent policy access")
+
+    def test_policy_epoch_increment(self):
+        """Test that policy epoch increments correctly under contention"""
+        from transformer_engine.plugin.core.policy import (
+            get_policy_epoch,
+            bump_policy_epoch,
+        )
+
+        initial_epoch = get_policy_epoch()
+        increments = 100
+        threads_count = 4
+
+        def bump_epochs():
+            for _ in range(increments):
+                bump_policy_epoch()
+
+        threads = [threading.Thread(target=bump_epochs) for _ in range(threads_count)]
+
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+
+        final_epoch = get_policy_epoch()
+        expected = initial_epoch + (increments * threads_count)
+
+        self.assertEqual(final_epoch, expected)
+        print("    [PASS] Policy epoch increment under contention")
+
+
+class TestDefaultOrder(unittest.TestCase):
+    """Test default selection order based on preference"""
+
+    def test_flagos_preference_order(self):
+        """Test selection order with flagos preference"""
+        from transformer_engine.plugin.core.policy import SelectionPolicy
+
+        policy = SelectionPolicy.from_dict(prefer="flagos")
+        order = policy.get_default_order()
+
+        self.assertEqual(order, ["flagos", "vendor", "reference"])
+        print("    [PASS] Flagos preference order")
+
+    def test_vendor_preference_order(self):
+        """Test selection order with vendor preference"""
+        from transformer_engine.plugin.core.policy import SelectionPolicy
+
+        policy = SelectionPolicy.from_dict(prefer="vendor")
+        order = policy.get_default_order()
+
+        self.assertEqual(order, ["vendor", "flagos", "reference"])
+        print("    [PASS] Vendor preference order")
+
+    def test_reference_preference_order(self):
+        """Test selection order with reference preference"""
+        from transformer_engine.plugin.core.policy import SelectionPolicy
+
+        policy = SelectionPolicy.from_dict(prefer="reference")
+        order = policy.get_default_order()
+
+        self.assertEqual(order, ["reference", "flagos", "vendor"])
+        print("    [PASS] Reference preference order")
+
+
+def run_all_tests():
+    """Run all policy tests"""
+    print("\n" + "=" * 60)
+    print("TE-FL Scheduling Policy Test Suite")
+    print("=" * 60)
+
+    # Create test suite
+    loader = unittest.TestLoader()
+    suite = unittest.TestSuite()
+
+    # Add test classes
+    test_classes = [
+        TestSelectionPolicy,
+        TestPolicyManager,
+        TestEnvironmentVariables,
+        TestContextManagers,
+        TestTokenMatching,
+        TestThreadSafety,
+        TestDefaultOrder,
+    ]
+
+    for test_class in test_classes:
+        print(f"\n[Testing {test_class.__name__}]")
+        tests = loader.loadTestsFromTestCase(test_class)
+        for test in tests:
+            result = unittest.TestResult()
+            test.run(result)
+            if result.wasSuccessful():
+                pass  # Print statements are in individual tests
+            else:
+                for failure in result.failures + result.errors:
+                    print(f"    [FAIL] {test}: {failure[1]}")
+        suite.addTests(tests)
+
+    # Run the full suite for final summary
+    print("\n" + "=" * 60)
+    print("Final Summary")
+    print("=" * 60)
+
+    runner = unittest.TextTestRunner(verbosity=0)
+    result = runner.run(suite)
+
+    total = result.testsRun
+    failures = len(result.failures)
+    errors = len(result.errors)
+    passed = total - failures - errors
+
+    print(f"\nTotal: {total}, Passed: {passed}, Failed: {failures}, Errors: {errors}")
+
+    return failures == 0 and errors == 0
+
+
+def main():
+    """Main entry point"""
+    success = run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index 98b26ba81b..67d3472e5a 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -58,7 +58,7 @@
 from transformer_engine.pytorch.attention.dot_product_attention.backends import (
     UnfusedDotProductAttention,
     FusedAttention,
-    FlashAttention
+    FlashAttention,
 )
 
 # Save reference to native FlashAttention for fallback

From ec8edfcd80a22aca88b06d2c810c2d8d93faaabb Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Sun, 4 Jan 2026 22:46:11 +0800
Subject: [PATCH 18/59] Polish readme (#11)

---
 README.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.rst b/README.rst
index 50c1dcd807..d82c1f6da8 100644
--- a/README.rst
+++ b/README.rst
@@ -5,6 +5,9 @@
 
 |License|
 
+
+**TransformerEngine-FL is a fork of TransformerEngine that introduces a plugin-based architecture for supporting diverse AI chips, built on top of** `FlagOS <https://github.com/flagos-ai>`_, **a unified open-source AI system software stack.**
+
 Transformer Engine
 ==================
 

From b26b226d055c9f9461446fa6070287a35386537b Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Tue, 6 Jan 2026 15:44:52 +0800
Subject: [PATCH 19/59] Register get_attention_backend for all backends and fix
 FlashAttention fallback (#14)

## Summary
This PR contains two major improvements:

1. **Register `get_attention_backend` function for all backends** (CUDA,
FlagOS, Reference)
   - Added `get_attention_backend` implementation to all backend types
- Ensures consistent attention backend selection across different
hardware platforms

2. **Fix FlashAttention fallback mechanism**
- Removed redundant `_called_impls` dictionary, replaced with simpler
`_last_impl_id` class variable
   - Removed unused `_log_lock` threading lock
   - Simplified implementation tracking and logging logic
- Reduced code complexity and memory overhead while maintaining full
functionality

## Changes
- Updated `FlashAttentionBase` class in `ops.py` to remove redundant
implementation tracking
- Added `get_attention_backend` registration to CUDA, FlagOS, and
Reference backends
- Fixed fallback logic in attention backend selection

## Test Plan
- [x] Code builds successfully
- [x] Existing tests pass
- [x] Manual testing with different backend configurations

## Related Issues
Fixes issues with FlashAttention fallback and improves backend
consistency.
---
 .../dot_product_attention/backends.py         |   4 +-
 .../plugin/core/backends/flagos/flagos.py     |  32 ++
 .../core/backends/flagos/register_ops.py      |   4 +
 .../backends/reference/flash_attention.py     |   2 +-
 .../core/backends/reference/reference.py      |  32 ++
 .../core/backends/reference/register_ops.py   |   3 +
 .../plugin/core/backends/vendor/cuda/cuda.py  |  12 +
 .../backends/vendor/cuda/flash_attention.py   |   2 +-
 .../core/backends/vendor/cuda/register_ops.py |   3 +
 .../plugin/core/logger_manager.py             |   5 +
 transformer_engine/plugin/core/manager.py     |  87 +++--
 transformer_engine/plugin/core/ops.py         | 326 +++++++++++++++++-
 .../dot_product_attention.py                  |   8 +
 13 files changed, 454 insertions(+), 66 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index 699767b7be..39ea3c1e18 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -32,7 +32,6 @@
 import transformer_engine.pytorch.attention.dot_product_attention.utils as dpa_utils
 
 from transformer_engine.plugin.core.ops import FlashAttentionBase
-from transformer_engine.plugin.core.logger_manager import print_once
 
 import flag_gems
 
@@ -231,7 +230,6 @@ def __init__(
             layer_number=layer_number,
             deterministic=deterministic,
         )
-
         self.use_FAv2_bwd = os.getenv(
             "NVTE_FUSED_ATTN_USE_FAv2_BWD", "0"
         ) == "1" and get_device_compute_capability() == (9, 0)
@@ -255,7 +253,7 @@ def backend_name(self) -> str:
         return "flagos"
 
     @no_torch_dynamo()
-    def forward(
+    def _forward_impl(
         self,
         query_layer: torch.Tensor,
         key_layer: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
index f206d7d7f6..22d36e9e21 100644
--- a/transformer_engine/plugin/core/backends/flagos/flagos.py
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -32,6 +32,38 @@ def get_flash_attention_class(self):
         from .attention.dot_product_attention.backends import FlashAttentionFL
         return FlashAttentionFL
 
+    def get_attention_backend(self, attention_params=None):
+        from packaging.version import Version as PkgVersion
+        from ...logger_manager import get_logger
+        logger = get_logger()
+
+        # Read environment variables to determine which backends to enable
+        use_flash_attention = int(os.getenv("NVTE_FLASH_ATTN", "1"))
+        use_fused_attention = int(os.getenv("NVTE_FUSED_ATTN", "1"))
+        use_unfused_attention = int(os.getenv("NVTE_UNFUSED_ATTN", "1"))
+
+        # Log disabled backends
+        if not use_flash_attention:
+            logger.info_once("Disabling FlashAttention due to NVTE_FLASH_ATTN=0")
+        if not use_fused_attention:
+            logger.info_once("Disabling FusedAttention due to NVTE_FUSED_ATTN=0")
+        if not use_unfused_attention:
+            logger.info_once("Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0")
+
+        flash_attention_backend = PkgVersion("2.6.0") if use_flash_attention else None
+        fused_attention_backend = NVTE_Fused_Attn_Backend.NVTE_No_Backend
+
+        available_backends = [use_flash_attention, use_fused_attention, use_unfused_attention]
+
+        return (
+            use_flash_attention,
+            flash_attention_backend,
+            use_fused_attention,
+            fused_attention_backend,
+            use_unfused_attention,
+            available_backends,
+        )
+
     def generic_gemm(
         self,
         A: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/flagos/register_ops.py b/transformer_engine/plugin/core/backends/flagos/register_ops.py
index 5e2242f70a..1286f5b3a9 100644
--- a/transformer_engine/plugin/core/backends/flagos/register_ops.py
+++ b/transformer_engine/plugin/core/backends/flagos/register_ops.py
@@ -49,6 +49,10 @@ def register_builtins(registry) -> None:
 
         # FlashAttention class getter
         OpImpl(op_name="get_flash_attention_class", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor=None, priority=150),
+
+        # Attention backend selection
+        OpImpl(op_name="get_attention_backend", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor=None, priority=150),
+        OpImpl(op_name="get_fused_attn_backend", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor=None, priority=150),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/reference/flash_attention.py b/transformer_engine/plugin/core/backends/reference/flash_attention.py
index 02aa0754fb..833cde97d6 100644
--- a/transformer_engine/plugin/core/backends/reference/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/reference/flash_attention.py
@@ -176,7 +176,7 @@ def _pack_tensor(
 
         return packed_tensor
 
-    def forward(
+    def _forward_impl(
         self,
         query_layer: torch.Tensor,
         key_layer: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/reference/reference.py b/transformer_engine/plugin/core/backends/reference/reference.py
index 56da602f8e..61a0bdaab5 100644
--- a/transformer_engine/plugin/core/backends/reference/reference.py
+++ b/transformer_engine/plugin/core/backends/reference/reference.py
@@ -44,6 +44,38 @@ def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionTorch
         return FlashAttentionTorch
 
+    def get_attention_backend(self, attention_params=None):
+        from packaging.version import Version as PkgVersion
+        from ...logger_manager import get_logger
+        logger = get_logger()
+
+        # Read environment variables to determine which backends to enable
+        use_flash_attention = int(os.getenv("NVTE_FLASH_ATTN", "1"))
+        use_fused_attention = int(os.getenv("NVTE_FUSED_ATTN", "1"))
+        use_unfused_attention = int(os.getenv("NVTE_UNFUSED_ATTN", "1"))
+
+        # Log disabled backends
+        if not use_flash_attention:
+            logger.info_once("Disabling FlashAttention due to NVTE_FLASH_ATTN=0")
+        if not use_fused_attention:
+            logger.info_once("Disabling FusedAttention due to NVTE_FUSED_ATTN=0")
+        if not use_unfused_attention:
+            logger.info_once("Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0")
+
+        flash_attention_backend = PkgVersion("2.6.0") if use_flash_attention else None
+        fused_attention_backend = NVTE_Fused_Attn_Backend.NVTE_No_Backend
+
+        available_backends = [use_flash_attention, use_fused_attention, use_unfused_attention]
+
+        return (
+            use_flash_attention,
+            flash_attention_backend,
+            use_fused_attention,
+            fused_attention_backend,
+            use_unfused_attention,
+            available_backends,
+        )
+
     def generic_gemm(
         self,
         A: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/reference/register_ops.py b/transformer_engine/plugin/core/backends/reference/register_ops.py
index 43a652843d..3d311a6c75 100644
--- a/transformer_engine/plugin/core/backends/reference/register_ops.py
+++ b/transformer_engine/plugin/core/backends/reference/register_ops.py
@@ -192,6 +192,9 @@ def register_builtins(registry) -> None:
 
         # FlashAttention class getter
         OpImpl(op_name="get_flash_attention_class", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor=None, priority=50),
+
+        # Attention backend selection
+        OpImpl(op_name="get_attention_backend", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor=None, priority=50),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 33cc4d5b68..98ef965811 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -202,6 +202,18 @@ def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionCUDA
         return FlashAttentionCUDA
 
+    def get_attention_backend(self, attention_params=None):
+        """
+        CUDA backend uses the default attention backend selection logic.
+        This allows hardware-specific checks and optimizations for CUDA devices.
+        Returns:
+            Tuple of (use_flash_attention, flash_attention_backend, use_fused_attention,
+                     fused_attention_backend, use_unfused_attention, available_backends)
+        """
+        # Import the original get_attention_backend function
+        from transformer_engine.pytorch.attention.dot_product_attention import utils as dpa_utils
+        return dpa_utils._original_get_attention_backend(attention_params)
+
     def quantize(
         self,
         tensor: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
index 9a972a07d2..95b0aca37c 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
@@ -72,7 +72,7 @@ def _ensure_native_flash_attn(self):
     def backend_name(self) -> str:
         return "cuda"
 
-    def forward(
+    def _forward_impl(
         self,
         query_layer: torch.Tensor,
         key_layer: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
index eea8999ae9..3beff6331c 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
@@ -197,6 +197,9 @@ def register_builtins(registry) -> None:
 
         # FlashAttention class getter
         OpImpl(op_name="get_flash_attention_class", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="CUDA", priority=100),
+
+        # Attention backend selection
+        OpImpl(op_name="get_attention_backend", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="CUDA", priority=100),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/logger_manager.py b/transformer_engine/plugin/core/logger_manager.py
index 9d13aa2f63..682122c346 100644
--- a/transformer_engine/plugin/core/logger_manager.py
+++ b/transformer_engine/plugin/core/logger_manager.py
@@ -50,6 +50,11 @@ def warning_once(self, message):
             self._printed_once.add(message)
             self.logger.warning(message, stacklevel=2)
 
+    def error_once(self, message):
+        if message not in self._printed_once:
+            self._printed_once.add(message)
+            self.logger.error(message, stacklevel=2)
+
     def debug_once(self, message):
         if message not in self._printed_once:
             self._printed_once.add(message)
diff --git a/transformer_engine/plugin/core/manager.py b/transformer_engine/plugin/core/manager.py
index 51a532f7ec..cd96b35bb0 100644
--- a/transformer_engine/plugin/core/manager.py
+++ b/transformer_engine/plugin/core/manager.py
@@ -346,30 +346,29 @@ def call(self, op_name: str, *args, **kwargs):
             # Original behavior: use cached resolve() and fast-fail
             fn = self.resolve(op_name)
 
-            # Get current impl_id to check if it changed
+            # Get current impl_id and log
             impl_id = self.get_selected_impl_id(op_name)
             last_impl_id = self._called_ops.get(op_name)
 
-            # Log if first call or implementation changed
-            if last_impl_id != impl_id:
-                with self._lock:
-                    # Double-check after acquiring lock
-                    if self._called_ops.get(op_name) != impl_id:
-                        snap = self._registry.snapshot()
-                        for impl in snap.impls_by_op.get(op_name, []):
-                            if impl.impl_id == impl_id:
-                                if last_impl_id is None:
-                                    logger.info(
-                                        f"Op '{op_name}' using '{impl_id}' "
-                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
-                                    )
-                                else:
-                                    logger.info(
-                                        f"Op '{op_name}' switched from '{last_impl_id}' to '{impl_id}' "
-                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
-                                    )
-                                break
-                        self._called_ops[op_name] = impl_id
+            # Get impl details for logging
+            snap = self._registry.snapshot()
+            for impl in snap.impls_by_op.get(op_name, []):
+                if impl.impl_id == impl_id:
+                    # Only log if first time or implementation actually changed
+                    if last_impl_id is None:
+                        logger.info_once(
+                            f"Op '{op_name}' using '{impl_id}' "
+                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                        )
+                    elif last_impl_id != impl_id:
+                        logger.info_once(
+                            f"Op '{op_name}' switched from '{last_impl_id}' to '{impl_id}' "
+                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                        )
+                    break
+
+            # Update tracking
+            self._called_ops[op_name] = impl_id
 
             return fn(*args, **kwargs)
 
@@ -379,37 +378,31 @@ def call(self, op_name: str, *args, **kwargs):
 
         for idx, impl in enumerate(candidates):
             try:
-                # Log primary implementation or fallback attempts
+                result = impl.fn(*args, **kwargs)
+
+                # Log on success
+                last_impl_id = self._called_ops.get(op_name)
                 if idx == 0:
-                    # Primary implementation
-                    last_impl_id = self._called_ops.get(op_name)
-                    if last_impl_id != impl.impl_id:
-                        with self._lock:
-                            if self._called_ops.get(op_name) != impl.impl_id:
-                                if last_impl_id is None:
-                                    logger.info(
-                                        f"Op '{op_name}' using '{impl.impl_id}' "
-                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
-                                    )
-                                else:
-                                    logger.info(
-                                        f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
-                                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
-                                    )
-                                self._called_ops[op_name] = impl.impl_id
+                    # Primary implementation - only log if first time or changed
+                    if last_impl_id is None:
+                        logger.info_once(
+                            f"Op '{op_name}' using '{impl.impl_id}' "
+                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                        )
+                    elif last_impl_id != impl.impl_id:
+                        logger.info_once(
+                            f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
+                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                        )
                 else:
-                    # Always log fallback attempts (these are important runtime events)
-                    logger.info(
+                    # Fallback succeeded
+                    logger.info_once(
                         f"Op '{op_name}' fallback to '{impl.impl_id}' "
                         f"(kind={impl.kind.value}, vendor={impl.vendor})"
                     )
 
-                result = impl.fn(*args, **kwargs)
-
-                # Update tracked impl_id on success (for fallback case)
-                if idx > 0:
-                    with self._lock:
-                        self._called_ops[op_name] = impl.impl_id
+                # Update tracking on success
+                self._called_ops[op_name] = impl.impl_id
 
                 return result
 
@@ -417,7 +410,7 @@ def call(self, op_name: str, *args, **kwargs):
                 last_error = e
                 if idx < len(candidates) - 1:
                     # Not the last candidate, log warning and try next
-                    logger.warning(
+                    logger.warning_once(
                         f"Implementation '{impl.impl_id}' failed for op '{op_name}': {e}"
                     )
                 else:
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 24d89fb65c..50ed6d72a4 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -6,9 +6,13 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Type
 from enum import IntEnum
 from contextlib import nullcontext
-
+import os
+import traceback
 import torch
 
+from .logger_manager import get_logger
+logger = get_logger()
+
 class DType(IntEnum):
     kByte = 0
     kInt32 = 2
@@ -187,6 +191,9 @@ def is_available(self) -> bool:
     def get_flash_attention_class(self) -> Type["FlashAttentionBase"]:
         raise NotImplementedError
 
+    def get_attention_backend(self, attention_params=None):
+        raise NotImplementedError
+
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -1062,6 +1069,9 @@ def create_comm_overlap_p2p(
         raise NotImplementedError
 
 class FlashAttentionBase(torch.nn.Module, ABC):
+    # Class-level tracking for last logged implementation
+    _last_impl_id: Optional[str] = None
+
     def __init__(
         self,
         softmax_scale: float,
@@ -1080,6 +1090,43 @@ def __init__(
         self.layer_number = 1 if layer_number is None else layer_number
         self.deterministic = deterministic
 
+        # For fallback support
+        self._manager = None
+        self._init_params = None
+
+    @abstractmethod
+    def _forward_impl(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        """
+        Actual forward implementation - subclasses must implement this.
+
+        This method contains the backend-specific logic for flash attention.
+        """
+        raise NotImplementedError("Subclasses must implement _forward_impl()")
+
     def forward(
         self,
         query_layer: torch.Tensor,
@@ -1105,7 +1152,252 @@ def forward(
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
     ) -> torch.Tensor:
-        raise NotImplementedError("Subclasses must implement forward()")
+        """
+        Forward pass with automatic fallback support.
+        If TE_FL_STRICT=1 (default), this will automatically try alternative
+        implementations if the primary one fails.
+        """
+        # Check if fallback is enabled
+        enable_fallback = os.getenv("TE_FL_STRICT", "1") != "0"
+
+        # Key for tracking this operation (use op name)
+        layer_key = "get_flash_attention_class"
+
+        # If no manager or fallback disabled, use direct implementation
+        if self._manager is None or not enable_fallback:
+            # Try to get implementation details from manager if available
+            if self._manager is not None:
+                snap = self._manager.registry.snapshot()
+                # Find the impl that matches this instance's class
+                class_name_lower = self.__class__.__name__.lower()
+                impl_id = None
+
+                for impl in snap.impls_by_op.get(layer_key, []):
+                    if impl.impl_id == class_name_lower or class_name_lower.startswith(impl.impl_id):
+                        impl_id = impl.impl_id
+                        break
+
+                # Log using info_once (it handles deduplication)
+                if impl_id is not None:
+                    for impl in snap.impls_by_op.get(layer_key, []):
+                        if impl.impl_id == impl_id:
+                            # Only log if first time or implementation actually changed
+                            if FlashAttentionBase._last_impl_id is None:
+                                logger.info_once(
+                                    f"Op '{layer_key}' using '{impl_id}' "
+                                    f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                                )
+                            elif FlashAttentionBase._last_impl_id != impl_id:
+                                logger.info_once(
+                                    f"Op '{layer_key}' switched from '{FlashAttentionBase._last_impl_id}' to '{impl_id}' "
+                                    f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                                )
+                            break
+                    # Update tracking
+                    FlashAttentionBase._last_impl_id = impl_id
+
+            return self._forward_impl(
+                query_layer=query_layer,
+                key_layer=key_layer,
+                value_layer=value_layer,
+                attention_mask=attention_mask,
+                qkv_layout=qkv_layout,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                attn_mask_type=attn_mask_type,
+                window_size=window_size,
+                alibi_slopes=alibi_slopes,
+                cp_group=cp_group,
+                cp_global_ranks=cp_global_ranks,
+                cp_stream=cp_stream,
+                cp_comm_type=cp_comm_type,
+                fp8=fp8,
+                fp8_meta=fp8_meta,
+                quantizers=quantizers,
+                inference_params=inference_params,
+                flash_attention_backend=flash_attention_backend,
+                fp8_output=fp8_output,
+            )
+
+        # Fallback mode: try candidates in priority order
+        candidates = []
+        try:
+            candidates = self._manager.resolve_candidates(layer_key)
+        except Exception as resolve_error:
+            logger.error(f"Failed to resolve fallback candidates: {resolve_error}")
+            # If we can't get candidates, just try the primary implementation
+            return self._forward_impl(
+                query_layer=query_layer,
+                key_layer=key_layer,
+                value_layer=value_layer,
+                attention_mask=attention_mask,
+                qkv_layout=qkv_layout,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                attn_mask_type=attn_mask_type,
+                window_size=window_size,
+                alibi_slopes=alibi_slopes,
+                cp_group=cp_group,
+                cp_global_ranks=cp_global_ranks,
+                cp_stream=cp_stream,
+                cp_comm_type=cp_comm_type,
+                fp8=fp8,
+                fp8_meta=fp8_meta,
+                quantizers=quantizers,
+                inference_params=inference_params,
+                flash_attention_backend=flash_attention_backend,
+                fp8_output=fp8_output,
+            )
+
+        # Find current implementation's impl_id
+        snap = self._manager.registry.snapshot()
+        current_impl_id = None
+        current_class = self.__class__
+
+        for impl in snap.impls_by_op.get(layer_key, []):
+            try:
+                # Check if this impl creates our current class
+                impl_class = impl.fn()
+                if impl_class == current_class:
+                    current_impl_id = impl.impl_id
+                    break
+            except:
+                continue
+
+        # Try primary implementation first and capture any error
+        primary_error = None
+        try:
+            result = self._forward_impl(
+                query_layer=query_layer,
+                key_layer=key_layer,
+                value_layer=value_layer,
+                attention_mask=attention_mask,
+                qkv_layout=qkv_layout,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                attn_mask_type=attn_mask_type,
+                window_size=window_size,
+                alibi_slopes=alibi_slopes,
+                cp_group=cp_group,
+                cp_global_ranks=cp_global_ranks,
+                cp_stream=cp_stream,
+                cp_comm_type=cp_comm_type,
+                fp8=fp8,
+                fp8_meta=fp8_meta,
+                quantizers=quantizers,
+                inference_params=inference_params,
+                flash_attention_backend=flash_attention_backend,
+                fp8_output=fp8_output,
+            )
+            # Primary implementation succeeded
+            return result
+        except Exception as e:
+            primary_error = e
+            # Log the primary failure
+            error_summary = f"{type(e).__name__}: {str(e)}"
+            logger.warning_once(
+                f"Implementation '{current_impl_id}' failed for op '{layer_key}' "
+                f" - {error_summary}"
+            )
+            # Log full traceback if verbose mode is enabled
+            if os.getenv("TE_FL_VERBOSE_ERROR", "0") == "1":
+                error_traceback = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                logger.warning(f"Detailed traceback for '{current_impl_id}':\n{error_traceback}")
+
+        last_error = primary_error
+
+        for idx, impl in enumerate(candidates):
+            # Skip the current implementation (already tried above)
+            if impl.impl_id == current_impl_id:
+                continue
+
+            try:
+                # All attempts here are fallbacks (since we skipped current impl)
+                # Get fallback class and create instance
+                fallback_class = impl.fn()
+                fallback_instance = fallback_class(**self._init_params)
+                # Set manager for nested fallback support
+                fallback_instance._manager = self._manager
+                fallback_instance._init_params = self._init_params
+
+                # Call the implementation directly (not forward, to avoid recursion)
+                result = fallback_instance._forward_impl(
+                    query_layer=query_layer,
+                    key_layer=key_layer,
+                    value_layer=value_layer,
+                    attention_mask=attention_mask,
+                    qkv_layout=qkv_layout,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_kv=cu_seqlens_kv,
+                    max_seqlen_q=max_seqlen_q,
+                    max_seqlen_kv=max_seqlen_kv,
+                    attn_mask_type=attn_mask_type,
+                    window_size=window_size,
+                    alibi_slopes=alibi_slopes,
+                    cp_group=cp_group,
+                    cp_global_ranks=cp_global_ranks,
+                    cp_stream=cp_stream,
+                    cp_comm_type=cp_comm_type,
+                    fp8=fp8,
+                    fp8_meta=fp8_meta,
+                    quantizers=quantizers,
+                    inference_params=inference_params,
+                    flash_attention_backend=flash_attention_backend,
+                    fp8_output=fp8_output,
+                )
+
+                # Log on fallback success
+                logger.info_once(
+                    f"Op '{layer_key}' fallback to '{impl.impl_id}' "
+                    f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                )
+
+                # Update tracking on success
+                FlashAttentionBase._last_impl_id = impl.impl_id
+                return result
+
+            except Exception as e:
+                last_error = e
+                # Determine if there are more candidates to try
+                has_more_candidates = any(
+                    c.impl_id != current_impl_id
+                    for c in candidates[idx+1:]
+                )
+
+                # Format error summary
+                error_summary = f"{type(e).__name__}: {str(e)}"
+
+                if has_more_candidates:
+                    logger.warning_once(
+                        f"Implementation '{impl.impl_id}' failed for op '{layer_key}' - {error_summary}"
+                    )
+                else:
+                    # Last candidate failed
+                    logger.error_once(
+                        f"Last implementation '{impl.impl_id}' failed for op '{layer_key}' - {error_summary}"
+                    )
+
+                # Log full traceback if verbose mode is enabled
+                if os.getenv("TE_FL_VERBOSE_ERROR", "0") == "1":
+                    error_traceback = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                    log_func = logger.error if not has_more_candidates else logger.warning
+                    log_func(f"Detailed traceback for '{impl.impl_id}':\n{error_traceback}")
+
+        # All implementations failed
+        logger.error(
+            f"All implementations failed for op '{layer_key}'. "
+            f"Original: '{current_impl_id}'"
+        )
+        raise RuntimeError(
+            f"All implementation(s) failed for op='{layer_key}'. "
+            f"Last error: {last_error}"
+        ) from last_error
 
     @property
     def backend_name(self) -> str:
@@ -1123,10 +1415,7 @@ def __init__(self, manager=None):
         """
         # Import here to avoid circular dependency
         from .manager import get_default_manager
-        from .logger_manager import get_logger
-
         self._manager = manager if manager is not None else get_default_manager()
-        self._logger = get_logger()
 
         self.DType = DType
         self.Float8BlockScaleTensorFormat = Float8BlockScaleTensorFormat
@@ -1216,15 +1505,24 @@ def flash_attention(
         # This provides the same fallback support and logging as other operators
         flash_attn_class = self._manager.call("get_flash_attention_class")
 
-        # Instantiate and return the FlashAttention
-        return flash_attn_class(
-            softmax_scale=softmax_scale,
-            attention_dropout=attention_dropout,
-            attention_dropout_ctx=attention_dropout_ctx,
-            attention_type=attention_type,
-            layer_number=layer_number,
-            deterministic=deterministic,
-        )
+        # Prepare initialization parameters
+        init_params = {
+            'softmax_scale': softmax_scale,
+            'attention_dropout': attention_dropout,
+            'attention_dropout_ctx': attention_dropout_ctx,
+            'attention_type': attention_type,
+            'layer_number': layer_number,
+            'deterministic': deterministic,
+        }
+
+        # Instantiate the FlashAttention
+        instance = flash_attn_class(**init_params)
+
+        # Set manager and init_params for fallback support
+        instance._manager = self._manager
+        instance._init_params = init_params
+
+        return instance
 
     def __repr__(self) -> str:
         op_count = len(self._manager.registry.list_operators())
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index 67d3472e5a..d62bcc92ac 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -61,10 +61,18 @@
     FlashAttention,
 )
 
+#########################################################################
 # Save reference to native FlashAttention for fallback
 _FlashAttentionNative = FlashAttention
 # Use plugin system's flash_attention if available, otherwise use native
 FlashAttention = getattr(tex, 'flash_attention', _FlashAttentionNative)
+# Save the original get_attention_backend for backends that want to use default logic
+# CUDA backend can access this via dpa_utils._original_get_attention_backend
+dpa_utils._original_get_attention_backend = dpa_utils.get_attention_backend
+# Replace dpa_utils.get_attention_backend with tex.get_attention_backend
+# This allows each backend (FlagOS, CUDA, Reference) to control its own backend selection
+dpa_utils.get_attention_backend = tex.get_attention_backend
+#########################################################################
 
 # Setup Attention Logging
 attn_log.setup_logging()

From a423680f90c6079e77be41aff7becc96fd10aef6 Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Wed, 7 Jan 2026 11:51:35 +0800
Subject: [PATCH 20/59] fix nv shared lib bug. (#16)

# Description

fix nv shared lib bug

[CUDA] Import failed: No module named 'transformer_engine_torch_nv'

Fixes # (issue)

## Type of change

- [ ] Documentation change (change only to the documentation, either a
fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Infra/Build change
- [ ] Code refactoring

## Changes

Please list the changes introduced in this PR:

- Change A
- Change B

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes
---
 transformer_engine/common/__init__.py  | 9 ++-------
 transformer_engine/pytorch/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index 649674a281..e3cb298963 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -132,7 +132,7 @@ def _get_shared_object_file(library: str) -> Path:
     """
 
     # Check provided input and determine the correct prefix for .so.
-    assert library in ("core", "torch", "jax"), f"Unsupported TE library {library}."
+    assert library in ("core", "torch_nv", "jax"), f"Unsupported TE library {library}."
     if library == "core":
         so_prefix = "libtransformer_engine"
     else:
@@ -183,12 +183,7 @@ def load_framework_extension(framework: str) -> None:
         return
 
     # Supported frameworks.
-    assert framework in ("jax", "torch"), f"Unsupported framework {framework}"
-
-    # For torch: plugin system already handles transformer_engine_torch
-    # The native module is transformer_engine_torch_nv (imported by NVIDIA backend)
-    if framework == "torch":
-        return  # Nothing to do, plugin system handles this
+    assert framework in ("jax", "torch_nv"), f"Unsupported framework {framework}"
 
     # For jax: load the native module as before
     module_name = f"transformer_engine_{framework}"
diff --git a/transformer_engine/pytorch/__init__.py b/transformer_engine/pytorch/__init__.py
index 77c71b8119..fff2541fa1 100644
--- a/transformer_engine/pytorch/__init__.py
+++ b/transformer_engine/pytorch/__init__.py
@@ -23,7 +23,7 @@ def torch_version() -> tuple[int, ...]:
 assert torch_version() >= (2, 1), f"Minimum torch version 2.1 required. Found {torch_version()}."
 
 
-load_framework_extension("torch")
+load_framework_extension("torch_nv")
 from transformer_engine.pytorch.module import LayerNormLinear
 from transformer_engine.pytorch.module import Linear
 from transformer_engine.pytorch.module import LayerNormMLP

From fbe34bdadd2d31d8a2e463034d6d3d18500f8ef1 Mon Sep 17 00:00:00 2001
From: wendell <wendell@wenobug.com>
Date: Mon, 12 Jan 2026 11:11:44 +0800
Subject: [PATCH 21/59] Add a new vendor implementation named hygon (#15)

# Description

This pr add hygon backend for calling basic ops on hygon dcu.

## Type of change

- [x] New feature (non-breaking change which adds functionality)

## Changes

Please list the changes introduced in this PR:

- Add a new `hygon` folder in `vendor` contains `__init__.py`,
`hygon.py`, `register_ops.py`
- Register hygon ops in `builtin_ops.py`

# Requirements

In order to use hygon backend, the following, the following requirements
need to be met

- The python package `transformer_engine_fl_hygon` needs to be installed

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes

---------

Signed-off-by: wenjh <wenjh@sugon.com>
---
 .../core/backends/vendor/hygon/__init__.py    |   7 +
 .../core/backends/vendor/hygon/hygon.py       | 976 ++++++++++++++++++
 .../backends/vendor/hygon/register_ops.py     | 191 ++++
 transformer_engine/plugin/core/builtin_ops.py |   8 +
 4 files changed, 1182 insertions(+)
 create mode 100644 transformer_engine/plugin/core/backends/vendor/hygon/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py

diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/__init__.py b/transformer_engine/plugin/core/backends/vendor/hygon/__init__.py
new file mode 100644
index 0000000000..331c70c649
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .hygon import HygonBackend
+
+__all__ = ["HygonBackend"]
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
new file mode 100644
index 0000000000..4d74e2f4cf
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -0,0 +1,976 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import sys
+
+from ....ops import TEFLBackendBase, FP8TensorMeta
+
+def _load_hygon_libs():
+    import ctypes
+    from pathlib import Path
+    import importlib
+    import platform
+    import os
+    common_prefix = "libtransformer_engine"
+    csrc_prefix = "transformer_engine_torch_hygon"
+    common_files = []
+    csrc_files = []
+    def _get_sys_extension() -> str:
+        system = platform.system()
+        if system == "Linux":
+            return ".so"
+        if system == "Darwin":
+            return ".dylib"
+        if system == "Windows":
+            return ".dll"
+        raise RuntimeError(f"Unsupported operating system ({system})")
+    try:
+        if bool(int(os.environ.get("TE_FL_SKIP_HYGON", "0"))):
+            return False
+        ext = _get_sys_extension()
+        hygon_spec = importlib.util.find_spec("transformer_engine_hygon")
+        if hygon_spec is None:
+            return False
+        hygon_path = Path(hygon_spec.origin).parent
+        for file_path in hygon_path.iterdir():
+            if file_path.name.startswith(common_prefix) and file_path.suffix == ext:
+                common_files.append(file_path)
+            if file_path.name.startswith(csrc_prefix) and file_path.suffix == ext:
+                csrc_files.append(file_path)
+        if len(common_files) == 0:
+            return False
+        if len(csrc_files) == 0:
+            return False
+        ctypes.CDLL(str(common_files[0]), mode=ctypes.RTLD_GLOBAL)
+        spec = importlib.util.spec_from_file_location(csrc_prefix, csrc_files[0])
+        solib = importlib.util.module_from_spec(spec)
+        sys.modules[csrc_prefix] = solib
+        spec.loader.exec_module(solib)
+        return True
+    except Exception as e:
+        print(f"[HYGON] Failed to load hygon libs: {e}")
+        return False
+
+_hygon_libs_loaded = False
+
+def _ensure_hygon_libs():
+    global _hygon_libs_loaded
+    if not _hygon_libs_loaded:
+        _hygon_libs_loaded = _load_hygon_libs()
+    return _hygon_libs_loaded
+
+def _check_hygon_available() -> bool:
+    try:
+        if not _ensure_hygon_libs():
+            return False
+        import transformer_engine_torch_hygon
+        return True
+    except (ImportError, OSError) as e:
+        print(f"[HYGON] Import failed: {e}")
+        return False
+
+def _get_tex():
+    _ensure_hygon_libs()
+    import transformer_engine_torch_hygon
+    return transformer_engine_torch_hygon
+
+def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
+    if torch_dtype is None:
+        return None
+
+    NativeDType = tex_module.DType
+    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_torch_hygon':
+        return torch_dtype
+
+    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
+        from transformer_engine.plugin.core.ops import DType as PyDType
+        if isinstance(torch_dtype, PyDType):
+            dtype_name = torch_dtype.name
+            if hasattr(NativeDType, dtype_name):
+                return getattr(NativeDType, dtype_name)
+
+    dtype_map = {
+        torch.float32: NativeDType.kFloat32,
+        torch.float16: NativeDType.kFloat16,
+        torch.bfloat16: NativeDType.kBFloat16,
+        torch.int32: NativeDType.kInt32,
+        torch.uint8: NativeDType.kByte,
+    }
+
+    if hasattr(torch, 'float8_e4m3fn'):
+        dtype_map[torch.float8_e4m3fn] = NativeDType.kFloat8E4M3
+    if hasattr(torch, 'float8_e5m2'):
+        dtype_map[torch.float8_e5m2] = NativeDType.kFloat8E5M2
+
+    return dtype_map.get(torch_dtype, torch_dtype)
+
+def _convert_dtype_params(func):
+    import functools
+    import inspect
+
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        dtype_params = ['otype', 'output_dtype', 'bias_type']
+
+        from transformer_engine.plugin.core.ops import DType as PyDType
+
+        def needs_conversion(val):
+            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
+
+        for param_name in dtype_params:
+            if param_name in kwargs:
+                value = kwargs[param_name]
+                if needs_conversion(value):
+                    converted = self._to_te_dtype(value)
+                    kwargs[param_name] = converted
+
+        sig = inspect.signature(func)
+        param_names = list(sig.parameters.keys())[1:]
+
+        args_list = list(args)
+        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
+            if param_name in dtype_params and needs_conversion(arg_value):
+                converted = self._to_te_dtype(arg_value)
+                args_list[i] = converted
+
+        return func(self, *args_list, **kwargs)
+
+    return wrapper
+
+class HygonBackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return _check_hygon_available()
+
+    def __init__(self):
+        self._tex = None
+
+    def _get_tex(self):
+        if self._tex is None:
+            self._tex = _get_tex()
+        return self._tex
+
+    def _to_te_dtype(self, torch_dtype):
+        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
+
+    def is_available(self) -> bool:
+        return _check_hygon_available()
+
+    def get_flash_attention_class(self):
+        raise NotImplementedError("get_flash_attention_class - not implemented in hygon backend")
+
+    def get_attention_backend(self, attention_params=None):
+        raise NotImplementedError("get_attention_backend - not implemented in hygon backend")
+
+    def quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        output: Optional[torch.Tensor] = None,
+        noop: Optional[torch.Tensor] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.quantize(tensor, quantizer, output, noop)
+
+    @_convert_dtype_params
+    def dequantize(
+        self,
+        input: torch.Tensor,
+        otype: torch.dtype,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dequantize(input, otype)
+
+    def bgrad_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.bgrad_quantize(input, quantizer)
+
+    @_convert_dtype_params
+    def generic_gemm(
+        self,
+        A: torch.Tensor,
+        transA: bool,
+        B: torch.Tensor,
+        transB: bool,
+        D: torch.Tensor,
+        quantizer: Any,
+        output_dtype: torch.dtype,
+        bias: Optional[torch.Tensor],
+        bias_type: Any,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[Any] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> Any:
+        tex = self._get_tex()
+
+        if bias_type is None:
+            bias_type = self._to_te_dtype(torch.bfloat16)
+
+        return tex.generic_gemm(
+            A, transA, B, transB, D, quantizer, output_dtype,
+            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
+            accumulate, use_split_accumulator, comm_overlap, comm_type,
+            extra_output, bulk_overlap, alpha, beta
+        )
+
+    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm(*args, **kwargs)
+
+    def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.gelu(input, quantizer)
+
+    def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.geglu(input, quantizer)
+
+    def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgelu(input, quantizer)
+
+    def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgeglu(input, quantizer)
+
+    def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.relu(input, quantizer)
+
+    def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.reglu(input, quantizer)
+
+    def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.srelu(input, quantizer)
+
+    def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.sreglu(input, quantizer)
+
+    def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.silu(input, quantizer)
+
+    def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.swiglu(input, quantizer)
+
+    def clamped_swiglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
+    def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgelu(grad, fwd_input, quantizer)
+
+    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgeglu(grad, fwd_input, quantizer)
+
+    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgelu(grad, fwd_input, quantizer)
+    
+    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgeglu(grad, fwd_input, quantizer)
+
+    def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.drelu(grad, fwd_input, quantizer)
+
+    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dreglu(grad, fwd_input, quantizer)
+
+    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsrelu(grad, fwd_input, quantizer)
+
+    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsreglu(grad, fwd_input, quantizer)
+
+    def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsilu(grad, fwd_input, quantizer)
+
+    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dswiglu(grad, fwd_input, quantizer)
+
+    def clamped_dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_drelu(grad, fwd_input, quantizer)
+
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dqgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsrelu(grad, fwd_input, quantizer)
+
+    @_convert_dtype_params
+    def layernorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, mu, rsigma = tex.layernorm_fwd(
+            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+        return y, mu, rsigma
+
+    def layernorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dgamma, dbeta
+
+    @_convert_dtype_params
+    def rmsnorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, y_quant, rsigma = tex.rmsnorm_fwd(
+            input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+            if y_quant is not None:
+                y_quant = y_quant.view(*orig_shape)
+        return y, y_quant, rsigma
+
+    def rmsnorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+        eps: float = 1e-5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dw
+
+    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.rmsnorm_bwd_add(*args, **kwargs)
+
+    def multi_tensor_quantize(
+        self,
+        tensor_list: List[torch.Tensor],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
+    def split_quantize(
+        self,
+        tensor: torch.Tensor,
+        split_sections: List[int],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.split_quantize(tensor, split_sections, quantizer_list)
+
+    def moe_permute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_permute_fwd(*args, **kwargs)
+
+    def moe_permute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_permute_bwd(*args, **kwargs)
+
+    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_unpermute_fwd(*args, **kwargs)
+
+    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_unpermute_bwd(*args, **kwargs)
+
+    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+
+    def scaled_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale)
+
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
+
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
+
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        raise NotImplementedError("get_fused_attn_backend - not implemented in hygon backend")
+
+    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_attn_fwd - not implemented in hygon backend")
+
+    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+        raise NotImplementedError("fused_attn_bwd - not implemented in hygon backend")
+
+    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_fwd(*args, **kwargs)
+
+    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_bwd(*args, **kwargs)
+
+    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.copy_to_kv_cache(*args, **kwargs)
+
+    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_thd_to_bshd(*args, **kwargs)
+
+    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_bshd_to_thd(*args, **kwargs)
+
+    def fused_rope_forward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_rope_forward(*args, **kwargs)
+
+    def fused_rope_backward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_rope_backward(*args, **kwargs)
+
+    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_forward(*args, **kwargs)
+
+    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_backward(*args, **kwargs)
+
+    def fused_topk_with_score_function_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        num_groups: int,
+        group_topk: int,
+        scaling_factor: float,
+        score_function: Any,
+        expert_bias: Optional[torch.Tensor],
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_fwd(
+            logits, topk, use_pre_softmax, num_groups, group_topk,
+            scaling_factor, score_function, expert_bias
+        )
+
+    def fused_topk_with_score_function_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        routing_map: torch.Tensor,
+        intermediate_output: torch.Tensor,
+        grad_probs: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        scaling_factor: float,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_bwd(
+            num_tokens, num_experts, routing_map, intermediate_output,
+            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+        )
+
+    def fused_score_for_moe_aux_loss_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
+
+    def fused_score_for_moe_aux_loss_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        intermediate_output: torch.Tensor,
+        grad_scores: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_bwd(
+            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+        )
+
+    def fused_moe_aux_loss_fwd(
+        self,
+        probs: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        total_num_tokens: int,
+        num_experts: int,
+        num_rows: int,
+        num_cols: int,
+        topk: int,
+        coeff: float,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_fwd(
+            probs, tokens_per_expert, total_num_tokens, num_experts,
+            num_rows, num_cols, topk, coeff
+        )
+
+    def fused_moe_aux_loss_bwd(
+        self,
+        Const_buf: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        num_rows: int,
+        num_cols: int,
+        grad_aux_loss: torch.Tensor,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
+
+    def dropout_fwd(
+        self,
+        input: torch.Tensor,
+        dropout_probability: float,
+        out: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.dropout_fwd(input, dropout_probability, out)
+
+    def dropout_bwd(
+        self,
+        grad_output: torch.Tensor,
+        mask: torch.Tensor,
+        dropout_probability: float,
+        grad_input: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
+
+    def fp8_transpose(
+        self,
+        input: torch.Tensor,
+        dtype: Any,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_transpose(input, dtype, out=out)
+
+    def swap_first_dims(
+        self,
+        tensor: torch.Tensor,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.swap_first_dims(tensor, out=out)
+
+    def compute_amax(
+        self,
+        input: torch.Tensor,
+        amax: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.compute_amax(input, amax)
+
+    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
+
+    def fp8_block_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
+
+    def fp8_block_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: Any,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
+
+    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_padding(*args, **kwargs)
+
+    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(*args, **kwargs)
+
+    def get_cublasLt_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cublasLt_version()
+
+    def get_cudnn_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cudnn_version()
+
+    def get_num_cublas_streams(self) -> int:
+        tex = self._get_tex()
+        return tex.get_num_cublas_streams()
+
+    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_half_tensor(*args, **kwargs)
+
+    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_second_half_lse_correction(*args, **kwargs)
+
+    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_second_half_lse(*args, **kwargs)
+
+    def thd_out_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_out_correction(*args, **kwargs)
+
+    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_grad_correction(*args, **kwargs)
+
+    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_get_partitioned_indices(*args, **kwargs)
+
+    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+        raise NotImplementedError("init_nvshmem_backend - not implemented in hygon backend")
+
+    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        raise NotImplementedError("create_nvshmem_tensor - not implemented in hygon backend")
+
+    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        raise NotImplementedError("nvshmem_send_on_current_stream - not implemented in hygon backend")
+
+    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        raise NotImplementedError("nvshmem_wait_on_current_stream - not implemented in hygon backend")
+
+    def nvshmem_finalize(self) -> None:
+        raise NotImplementedError("nvshmem_finalize - not implemented in hygon backend")
+
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
+    def multi_tensor_unscale_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
+
+    def multi_tensor_adam(
+        self,
+        chunk_size: int = None,
+        noop_flag: torch.Tensor = None,
+        tensor_lists: List[List[torch.Tensor]] = None,
+        lr: float = None,
+        beta1: float = None,
+        beta2: float = None,
+        eps: float = None,
+        step: int = None,
+        mode: int = None,
+        bias_correction: int = None,
+        weight_decay: float = None,
+    ):
+        tex = self._get_tex()
+        if chunk_size is None:
+            return tex.multi_tensor_adam
+        tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
+            eps, step, mode, bias_correction, weight_decay
+        )
+
+    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
+
+    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_fp8(*args, **kwargs)
+
+    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable(*args, **kwargs)
+
+    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
+
+    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_sgd(*args, **kwargs)
+
+    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+
+    def bulk_overlap_ag_with_external_gemm(
+        self,
+        allgather_communicator: Any,
+        send_stream: Any,
+        recv_stream: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        tex = self._get_tex()
+        return tex.FP8TensorMeta()
+
+    def create_comm_overlap_helper(
+        self,
+        world_group: Optional[Any] = None,
+        intra_node_group: Optional[Any] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        if world_group is None:
+            return tex.CommOverlapHelper()
+        return tex.CommOverlapHelper(world_group, intra_node_group)
+
+    def create_comm_overlap(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        num_splits: int = 3,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 2,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 16,
+        set_sm_margin: bool = True,
+        atomic_gemm: bool = False,
+        rs_overlap_first_gemm: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlap(
+            buffer_shape, buffer_dtype, helper, tp_size,
+            num_splits, num_max_streams, comm_cga_size,
+            gemm_priority, comm_priority, num_comm_sm,
+            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+        )
+
+    def create_comm_overlap_p2p(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        comm_type: Any,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 1,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 1,
+        set_sm_margin: bool = False,
+        atomic_gemm: bool = False,
+        use_ce: bool = True,
+        aggregate: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlapP2P(
+            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
+            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
+            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+        )
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
new file mode 100644
index 0000000000..59cbe0ac5d
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Hygon vendor backend operator registrations.
+
+This module registers all VENDOR (Hygon) implementations from transformer_engine_torch.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from ....types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all Hygon (VENDOR) operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    # Import Hygon backend to get all the wrapped tex functions
+    from .hygon import HygonBackend
+
+    # Create a backend instance to access the methods
+    backend = HygonBackend()
+
+    # Check if Hygon is available before registering
+    if not backend.is_available():
+        return
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        # Normalization
+        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="layernorm_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="layernorm_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="HYGON", priority=100),
+
+        # GEMM
+        OpImpl(op_name="generic_gemm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="HYGON", priority=100),
+
+        # Quantization
+        OpImpl(op_name="quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dequantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="bgrad_quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="split_quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="HYGON", priority=100),
+
+        # Activations - Forward
+        OpImpl(op_name="gelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="geglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="qgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="qgeglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="relu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="reglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="srelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="sreglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="silu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="swiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="clamped_swiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="HYGON", priority=100),
+
+        # Activations - Backward
+        OpImpl(op_name="dgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dgeglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dqgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dqgeglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="drelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dreglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dsrelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dsreglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dsilu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dswiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="HYGON", priority=100),
+
+        # Activations - Bias + Backward
+        OpImpl(op_name="dbias_dgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dbias_dsilu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dbias_drelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="HYGON", priority=100),
+
+        # Softmax
+        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="HYGON", priority=100),
+
+        # MOE operations
+        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="HYGON", priority=100),
+
+        # Fused attention
+
+        # KV cache
+        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="HYGON", priority=100),
+
+        # Tensor format conversions
+        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="HYGON", priority=100),
+
+        # RoPE (Rotary Position Embedding)
+        OpImpl(op_name="fused_rope_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_rope_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="HYGON", priority=100),
+
+        # TopK and MOE aux loss
+        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="HYGON", priority=100),
+
+        # Dropout
+        OpImpl(op_name="dropout_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="dropout_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="HYGON", priority=100),
+
+        # FP8 operations
+        OpImpl(op_name="fp8_transpose", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="swap_first_dims", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="compute_amax", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="HYGON", priority=100),
+
+        # Padding operations
+        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="HYGON", priority=100),
+
+        # Library version getters
+        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="get_cudnn_version", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="HYGON", priority=100),
+
+        # THD (Tensor, Hidden, Dimension) operations
+        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="thd_out_correction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="thd_grad_correction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="HYGON", priority=100),
+
+        # NVSHMEM operations
+
+        # Multi-tensor operations
+        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="HYGON", priority=100),
+
+        # Communication overlap operations
+        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="create_comm_overlap", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="HYGON", priority=100),
+
+        # FlashAttention class getter
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/builtin_ops.py b/transformer_engine/plugin/core/builtin_ops.py
index 408e6ed8c1..a79ca3016a 100644
--- a/transformer_engine/plugin/core/builtin_ops.py
+++ b/transformer_engine/plugin/core/builtin_ops.py
@@ -47,3 +47,11 @@ def register_builtins(registry: OpRegistry) -> None:
     except Exception as e:
         # CUDA may not be available, this is expected
         pass
+
+    # Register HYGON (VENDOR) implementations
+    try:
+        from .backends.vendor.hygon.register_ops import register_builtins as register_hygon
+        register_hygon(registry)
+    except Exception as e:
+        # HYGON may not be available, this is expected
+        pass

From 396794ecb8cea8db8df29d3f902434cc8386d9d1 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Mon, 12 Jan 2026 11:48:04 +0800
Subject: [PATCH 22/59] Update the way the gems context is invoked in the
 FlagOS Backend (#18)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a flag that permanently enables flag_gems with a single switch,
eliminating the need to call flag_gems.use_gems for every single
operator. This removes significant registration overhead and improves
end-to-end throughput.
- When the flag is set, every operator’s implementation is forced to use
flag_os/vendor; the default PyTorch reference backend is unavailable.
- When the flag is not set, operators can freely switch among flag_os,
vendor, and torch backends.
---
 .../dot_product_attention/backends.py         |  6 +++--
 .../core/backends/flagos/impl/fused_adam.py   |  5 ++--
 .../plugin/core/backends/flagos/impl/gemm.py  |  4 ++-
 .../core/backends/flagos/impl/multi_tensor.py |  7 +++--
 .../core/backends/flagos/impl/rmsnorm.py      |  5 ++--
 .../plugin/core/backends/flagos/utils.py      | 27 +++++++++++++++++++
 6 files changed, 45 insertions(+), 9 deletions(-)
 create mode 100644 transformer_engine/plugin/core/backends/flagos/utils.py

diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index 39ea3c1e18..dbed0dc2cf 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -35,6 +35,8 @@
 
 import flag_gems
 
+from transformer_engine.plugin.core.backends.flagos.utils import gems_context
+
 class AttnFuncFL(torch.autograd.Function):
     @staticmethod
     def forward(
@@ -71,7 +73,7 @@ def forward(
 
         is_causal = attn_mask_type == 'causal'
 
-        with flag_gems.use_gems():
+        with gems_context():
             # FlagGems requires contiguous tensors, so we must call contiguous() after permute
             q_permuted = q.permute(1, 2, 0, 3).contiguous()
             k_permuted = k.permute(1, 2, 0, 3).contiguous()
@@ -160,7 +162,7 @@ def backward(ctx, d_out, *_args):
 
             dqkv_te_dtype = TE_DType[d_out.dtype]
 
-            with flag_gems.use_gems():
+            with gems_context():
                 # Ensure all tensors are contiguous for FlagGems backward
                 q_permuted = q_permuted.contiguous() if not q_permuted.is_contiguous() else q_permuted
                 k_permuted = k_permuted.contiguous() if not k_permuted.is_contiguous() else k_permuted
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
index 1edd361f95..867ee1a101 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -5,7 +5,7 @@
 from typing import Optional, List
 import torch
 import flag_gems
-
+from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 def multi_tensor_adam_fl(
     chunk_size: int,
@@ -22,7 +22,8 @@ def multi_tensor_adam_fl(
     inv_scale: Optional[float] = 1.0,
     out_dtype: Optional[torch.dtype] = None,
 ) -> None:
-    with flag_gems.use_gems():
+
+    with gems_context():
         num_lists = len(tensor_lists)
         assert num_lists in [4, 5], f"Expected 4 or 5 tensor lists, got {num_lists}"
 
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
index a52af3d4c2..57f40bfffc 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
@@ -6,6 +6,7 @@
 import torch
 
 import flag_gems
+from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 __all__ = [
     "generic_gemm_fl",
@@ -63,7 +64,8 @@ def generic_gemm_fl(
     alpha: float = 1.0,
     beta: Optional[float] = None,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
-    with flag_gems.use_gems():
+
+    with gems_context():
         assert not gelu and gelu_in is None, "Triton-Based General Gemm do not support gelu now"
         assert quantizer is None, "Triton-Based General Gemm do not support quantization now"
         assert bias is None, "Triton-Based General Gemm do not support bias now"
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
index 9d3e6959b6..4f7e6e907b 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
@@ -6,9 +6,11 @@
 from torch.distributed._tensor import DTensor
 import flag_gems
 
+from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *args):
-    with flag_gems.use_gems():
+
+    with gems_context():
         tensors = tensor_lists[0]
 
         if per_tensor:
@@ -21,6 +23,7 @@ def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *ar
 
 
 def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
-    with flag_gems.use_gems():
+
+    with gems_context():
         for src, dst in zip(tensor_lists[0], tensor_lists[1]):
             dst.copy_(src * scale)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
index ddf70f2c70..a4358c3d7e 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
@@ -5,6 +5,7 @@
 import torch
 import flag_gems
 
+from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 def rmsnorm_fwd_fl(
     input,
@@ -16,7 +17,7 @@ def rmsnorm_fwd_fl(
     sm_margin,
     zero_centered_gamma,
 ):
-    with flag_gems.use_gems():
+    with gems_context():
         if zero_centered_gamma:
             weight_adj = 1 + weight
         else:
@@ -44,7 +45,7 @@ def rmsnorm_bwd_fl(
     zero_centered_gamma,
     eps,
 ):
-    with flag_gems.use_gems():
+    with gems_context():
         # When zero_centered_gamma is True, forward uses (1 + gamma) as weight
         # So backward needs to use (1 + gamma) for computing dx
         if zero_centered_gamma:
diff --git a/transformer_engine/plugin/core/backends/flagos/utils.py b/transformer_engine/plugin/core/backends/flagos/utils.py
new file mode 100644
index 0000000000..cb0547c190
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/utils.py
@@ -0,0 +1,27 @@
+import os
+from contextlib import nullcontext
+
+
+def gems_context():
+    # check if flagos should be enabled permanently via environment variable
+    flag_gems_global_registrar = None
+    try:
+        import flag_gems
+        flag_gems_global_registrar = getattr(flag_gems, 'current_work_registrar', None)
+    except Exception as e:
+        from ...logger_manager import get_logger
+        logger = get_logger()
+        logger.warning(f"Failed to get flag gems registrar: {e}")
+        
+    is_flag_gems_global_enabled = flag_gems_global_registrar is not None
+
+    # Check if flagos should be enabled permanently via environment variable
+    enable_flagos_permanently = os.getenv("TE_FL_ENABLE_FLAGOS_PERMANENTLY", "false").lower() in ("1", "true", "yes")
+    if enable_flagos_permanently and not is_flag_gems_global_enabled:
+        flag_gems.enable(record=True, once=True)
+        is_flag_gems_global_enabled = True
+    
+    # Use nullcontext if flag_gems is already globally enabled, otherwise use use_gems() context
+    context = nullcontext() if is_flag_gems_global_enabled else flag_gems.use_gems()
+    
+    return context

From 3d80e63679d97d2a382a0c13ea70da837d9d81e8 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Mon, 12 Jan 2026 17:16:53 +0800
Subject: [PATCH 23/59] Unify the usage of the gems context (#20)

Unify the usage of the gems context
- only enter or exit the context when switching between the flagos
backend and the torch backend (or vice versa).
- avoids the overhead of repeated enter/exit calls across multiple OPs.
---
 .../plugin/core/backend_switch.py             | 34 +++++++
 .../dot_product_attention/backends.py         | 90 +++++++++----------
 .../core/backends/flagos/impl/fused_adam.py   | 80 ++++++++---------
 .../plugin/core/backends/flagos/impl/gemm.py  | 74 ++++++++-------
 .../core/backends/flagos/impl/multi_tensor.py | 23 +++--
 .../core/backends/flagos/impl/rmsnorm.py      | 61 ++++++-------
 .../plugin/core/backends/flagos/utils.py      | 27 ------
 transformer_engine/plugin/core/manager.py     |  7 ++
 transformer_engine/plugin/core/ops.py         | 13 +++
 9 files changed, 212 insertions(+), 197 deletions(-)
 create mode 100644 transformer_engine/plugin/core/backend_switch.py
 delete mode 100644 transformer_engine/plugin/core/backends/flagos/utils.py

diff --git a/transformer_engine/plugin/core/backend_switch.py b/transformer_engine/plugin/core/backend_switch.py
new file mode 100644
index 0000000000..3ed9c5cae1
--- /dev/null
+++ b/transformer_engine/plugin/core/backend_switch.py
@@ -0,0 +1,34 @@
+import flag_gems
+from .types import BackendImplKind
+
+_flag_gems_context = None
+_flag_gems_context_entered = False
+
+def backend_context_switch(cur_backend):
+    """
+    Switch backend context based on the current backend.
+    """
+    global _flag_gems_context, _flag_gems_context_entered
+    assert cur_backend is not None, "Current Backend name cannot be None"
+    
+    if cur_backend == BackendImplKind.VENDOR:
+        return
+
+    # check if flagos should be enabled permanently via environment variable
+    flag_gems_global_registrar = getattr(flag_gems, 'current_work_registrar', None)
+    is_flag_gems_enabled = flag_gems_global_registrar is not None
+
+    # if flagos is enabled permanently, and flagos context is not entered, skip entering flagos context
+    if is_flag_gems_enabled and not _flag_gems_context_entered:
+        return
+
+    if cur_backend == BackendImplKind.DEFAULT and not _flag_gems_context_entered:
+        _flag_gems_context = flag_gems.use_gems()
+        _flag_gems_context.__enter__()
+        _flag_gems_context_entered = True
+        return
+
+    if cur_backend == BackendImplKind.REFERENCE and _flag_gems_context_entered:
+        _flag_gems_context.__exit__(None, None, None)
+        _flag_gems_context_entered = False
+        return
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index dbed0dc2cf..30596435db 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -35,7 +35,6 @@
 
 import flag_gems
 
-from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 class AttnFuncFL(torch.autograd.Function):
     @staticmethod
@@ -73,25 +72,24 @@ def forward(
 
         is_causal = attn_mask_type == 'causal'
 
-        with gems_context():
-            # FlagGems requires contiguous tensors, so we must call contiguous() after permute
-            q_permuted = q.permute(1, 2, 0, 3).contiguous()
-            k_permuted = k.permute(1, 2, 0, 3).contiguous()
-            v_permuted = v.permute(1, 2, 0, 3).contiguous()
 
-            (out_permuted, m) = flag_gems.scaled_dot_product_attention_forward(
-                q_permuted,
-                k_permuted,
-                v_permuted,
-                attn_mask=None,
-                dropout_p=dropout_p,
-                is_causal=is_causal,
-                scale=attn_scale,
-                enable_gqa=True,
-            )
+        q_permuted = q.permute(1, 2, 0, 3).contiguous()
+        k_permuted = k.permute(1, 2, 0, 3).contiguous()
+        v_permuted = v.permute(1, 2, 0, 3).contiguous()
+
+        (out_permuted, m) = flag_gems.scaled_dot_product_attention_forward(
+            q_permuted,
+            k_permuted,
+            v_permuted,
+            attn_mask=None,
+            dropout_p=dropout_p,
+            is_causal=is_causal,
+            scale=attn_scale,
+            enable_gqa=True,
+        )
+        # Must be contiguous for .view() in FlashAttentionFL.forward
+        out = out_permuted.permute(2, 0, 1, 3).contiguous()
 
-            # Must be contiguous for .view() in FlashAttentionFL.forward
-            out = out_permuted.permute(2, 0, 1, 3).contiguous()
         aux_ctx_tensors = [out_permuted, m]
         out_ret = out
         qkvo_tensors = (q_permuted, k_permuted, v_permuted, out_permuted)
@@ -162,34 +160,34 @@ def backward(ctx, d_out, *_args):
 
             dqkv_te_dtype = TE_DType[d_out.dtype]
 
-            with gems_context():
-                # Ensure all tensors are contiguous for FlagGems backward
-                q_permuted = q_permuted.contiguous() if not q_permuted.is_contiguous() else q_permuted
-                k_permuted = k_permuted.contiguous() if not k_permuted.is_contiguous() else k_permuted
-                v_permuted = v_permuted.contiguous() if not v_permuted.is_contiguous() else v_permuted
-                out_permuted = out_permuted.contiguous() if not out_permuted.is_contiguous() else out_permuted
-                m = m.contiguous() if not m.is_contiguous() else m
-
-                # d_out is (seq, batch, heads, dim) from autograd, permute to (batch, heads, seq, dim)
-                d_out_permuted = d_out.permute(1, 2, 0, 3).contiguous()
-
-                dq_permuted, dk_permuted, dv_permuted = flag_gems.scaled_dot_product_attention_backward(
-                    d_out_permuted,
-                    q_permuted,
-                    k_permuted,
-                    v_permuted,
-                    out_permuted,
-                    m,
-                    attn_mask=None,
-                    dropout_p=ctx.dropout_p,
-                    is_causal=ctx.is_causal,
-                    scale=ctx.attn_scale,
-                    enable_gqa=True,
-                )
-
-                dq = dq_permuted.permute(2, 0, 1, 3)
-                dk = dk_permuted.permute(2, 0, 1, 3)
-                dv = dv_permuted.permute(2, 0, 1, 3)
+
+            q_permuted = q_permuted.contiguous() if not q_permuted.is_contiguous() else q_permuted
+            k_permuted = k_permuted.contiguous() if not k_permuted.is_contiguous() else k_permuted
+            v_permuted = v_permuted.contiguous() if not v_permuted.is_contiguous() else v_permuted
+            out_permuted = out_permuted.contiguous() if not out_permuted.is_contiguous() else out_permuted
+            m = m.contiguous() if not m.is_contiguous() else m
+
+            # d_out is (seq, batch, heads, dim) from autograd, permute to (batch, heads, seq, dim)
+            d_out_permuted = d_out.permute(1, 2, 0, 3).contiguous()
+
+            dq_permuted, dk_permuted, dv_permuted = flag_gems.scaled_dot_product_attention_backward(
+                d_out_permuted,
+                q_permuted,
+                k_permuted,
+                v_permuted,
+                out_permuted,
+                m,
+                attn_mask=None,
+                dropout_p=ctx.dropout_p,
+                is_causal=ctx.is_causal,
+                scale=ctx.attn_scale,
+                enable_gqa=True,
+            )
+
+            dq = dq_permuted.permute(2, 0, 1, 3)
+            dk = dk_permuted.permute(2, 0, 1, 3)
+            dv = dv_permuted.permute(2, 0, 1, 3)
+
             rest = None
 
         return (
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
index 867ee1a101..bd63f75e67 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -5,7 +5,6 @@
 from typing import Optional, List
 import torch
 import flag_gems
-from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 def multi_tensor_adam_fl(
     chunk_size: int,
@@ -23,56 +22,55 @@ def multi_tensor_adam_fl(
     out_dtype: Optional[torch.dtype] = None,
 ) -> None:
 
-    with gems_context():
-        num_lists = len(tensor_lists)
-        assert num_lists in [4, 5], f"Expected 4 or 5 tensor lists, got {num_lists}"
+    num_lists = len(tensor_lists)
+    assert num_lists in [4, 5], f"Expected 4 or 5 tensor lists, got {num_lists}"
 
-        num_tensors = len(tensor_lists[0])
-        assert num_tensors > 0, "No tensors provided"
+    num_tensors = len(tensor_lists[0])
+    assert num_tensors > 0, "No tensors provided"
 
-        for i, lst in enumerate(tensor_lists):
-            assert len(lst) == num_tensors, f"List {i} has {len(lst)} tensors, expected {num_tensors}"
+    for i, lst in enumerate(tensor_lists):
+        assert len(lst) == num_tensors, f"List {i} has {len(lst)} tensors, expected {num_tensors}"
 
-        bias_correction1 = 1.0
-        bias_correction2 = 1.0
-        if bias_correction == 1:
-            bias_correction1 = 1 - beta1 ** step
-            bias_correction2 = 1 - beta2 ** step
+    bias_correction1 = 1.0
+    bias_correction2 = 1.0
+    if bias_correction == 1:
+        bias_correction1 = 1 - beta1 ** step
+        bias_correction2 = 1 - beta2 ** step
 
-        is_adamw = (mode == 1)
+    is_adamw = (mode == 1)
 
-        for i in range(num_tensors):
-            g = tensor_lists[0][i]
-            p = tensor_lists[1][i]
-            m = tensor_lists[2][i]
-            v = tensor_lists[3][i]
-            p_master = tensor_lists[4][i] if num_lists == 5 else None
+    for i in range(num_tensors):
+        g = tensor_lists[0][i]
+        p = tensor_lists[1][i]
+        m = tensor_lists[2][i]
+        v = tensor_lists[3][i]
+        p_master = tensor_lists[4][i] if num_lists == 5 else None
 
-            if not g.is_contiguous():
-                g = g.contiguous()
+        if not g.is_contiguous():
+            g = g.contiguous()
 
-            if inv_scale is not None and inv_scale != 1.0:
-                g = g * inv_scale
+        if inv_scale is not None and inv_scale != 1.0:
+            g = g * inv_scale
 
-            m.mul_(beta1).add_(g, alpha=1 - beta1)
-            v.mul_(beta2).add_(g.mul(g).mul_(1 - beta2))
+        m.mul_(beta1).add_(g, alpha=1 - beta1)
+        v.mul_(beta2).add_(g.mul(g).mul_(1 - beta2))
 
-            m_corr = m.clone()
-            v_corr = v.clone()
-            if bias_correction == 1:
-                m_corr = m_corr / bias_correction1
-                v_corr = v_corr / bias_correction2
+        m_corr = m.clone()
+        v_corr = v.clone()
+        if bias_correction == 1:
+            m_corr = m_corr / bias_correction1
+            v_corr = v_corr / bias_correction2
 
-            update = m_corr / (v_corr.sqrt() + eps)
+        update = m_corr / (v_corr.sqrt() + eps)
 
-            if is_adamw:
-                p.data.mul_(1 - lr * weight_decay)
-            else:
-                update.add_(p, alpha=weight_decay)
+        if is_adamw:
+            p.data.mul_(1 - lr * weight_decay)
+        else:
+            update.add_(p, alpha=weight_decay)
 
-            p.data.add_(update, alpha=-lr)
+        p.data.add_(update, alpha=-lr)
 
-            if p_master is not None:
-                p_master.data.copy_(p.data)
-                out_dtype = p_master.dtype if out_dtype is None else out_dtype
-                p.data = p.data.to(out_dtype)
+        if p_master is not None:
+            p_master.data.copy_(p.data)
+            out_dtype = p_master.dtype if out_dtype is None else out_dtype
+            p.data = p.data.to(out_dtype)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
index 57f40bfffc..4d22b88d68 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
@@ -6,7 +6,6 @@
 import torch
 
 import flag_gems
-from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 __all__ = [
     "generic_gemm_fl",
@@ -65,51 +64,50 @@ def generic_gemm_fl(
     beta: Optional[float] = None,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
 
-    with gems_context():
-        assert not gelu and gelu_in is None, "Triton-Based General Gemm do not support gelu now"
-        assert quantizer is None, "Triton-Based General Gemm do not support quantization now"
-        assert bias is None, "Triton-Based General Gemm do not support bias now"
+    assert not gelu and gelu_in is None, "Triton-Based General Gemm do not support gelu now"
+    assert quantizer is None, "Triton-Based General Gemm do not support quantization now"
+    assert bias is None, "Triton-Based General Gemm do not support bias now"
 
-        alpha = validate_gemm_scale(alpha, True)
-        beta = validate_gemm_scale(beta, accumulate)
+    alpha = validate_gemm_scale(alpha, True)
+    beta = validate_gemm_scale(beta, accumulate)
 
-        s = -1
-        b = -1
-        orig_A_shape = A.shape
-        orig_B_shape = B.shape
-        shape_a_changed = False
-        shape_b_changed = False
+    s = -1
+    b = -1
+    orig_A_shape = A.shape
+    orig_B_shape = B.shape
+    shape_a_changed = False
+    shape_b_changed = False
 
-        if A.ndim == 3:
-            A = A.view(-1, A.shape[-1])
-            shape_a_changed = True
+    if A.ndim == 3:
+        A = A.view(-1, A.shape[-1])
+        shape_a_changed = True
 
-        if B.ndim == 3:
-            s, b, _ = B.shape
-            B = B.view(-1, B.shape[-1])
-            shape_b_changed = True
+    if B.ndim == 3:
+        s, b, _ = B.shape
+        B = B.view(-1, B.shape[-1])
+        shape_b_changed = True
 
-        A_comp = A.T if transA else A
-        B_comp = B.T if transB else B
+    A_comp = A.T if transA else A
+    B_comp = B.T if transB else B
 
-        out1 = flag_gems.mm(B_comp, A_comp)
+    out1 = flag_gems.mm(B_comp, A_comp)
 
-        if shape_b_changed:
-            out1 = out1.view(s, b, -1)
+    if shape_b_changed:
+        out1 = out1.view(s, b, -1)
 
-        torch_out_dtype = _convert_dtype(output_dtype)
-        if torch_out_dtype is not None and out1.dtype != torch_out_dtype:
-            out1 = out1.to(torch_out_dtype)
+    torch_out_dtype = _convert_dtype(output_dtype)
+    if torch_out_dtype is not None and out1.dtype != torch_out_dtype:
+        out1 = out1.to(torch_out_dtype)
 
-        bias_grad = None
-        gelu_input = None
-        extra_output_ret = None
+    bias_grad = None
+    gelu_input = None
+    extra_output_ret = None
 
-        if D is not None:
-            if accumulate:
-                D.add_(out1)
-            else:
-                D.copy_(out1)
-            return D, bias_grad, gelu_input, extra_output_ret
+    if D is not None:
+        if accumulate:
+            D.add_(out1)
         else:
-            return out1, bias_grad, gelu_input, extra_output_ret
+            D.copy_(out1)
+        return D, bias_grad, gelu_input, extra_output_ret
+    else:
+        return out1, bias_grad, gelu_input, extra_output_ret
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
index 4f7e6e907b..5a81b02dd2 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
@@ -6,24 +6,21 @@
 from torch.distributed._tensor import DTensor
 import flag_gems
 
-from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *args):
 
-    with gems_context():
-        tensors = tensor_lists[0]
+    tensors = tensor_lists[0]
 
-        if per_tensor:
-            norms = [torch.norm(t.float(), p=2) for t in tensors]
-            return norms, None
-        else:
-            total_norm_sq = sum(torch.sum(t.float() ** 2) for t in tensors)
-            total_norm = torch.sqrt(total_norm_sq)
-            return total_norm, None
+    if per_tensor:
+        norms = [torch.norm(t.float(), p=2) for t in tensors]
+        return norms, None
+    else:
+        total_norm_sq = sum(torch.sum(t.float() ** 2) for t in tensors)
+        total_norm = torch.sqrt(total_norm_sq)
+        return total_norm, None
 
 
 def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
 
-    with gems_context():
-        for src, dst in zip(tensor_lists[0], tensor_lists[1]):
-            dst.copy_(src * scale)
+    for src, dst in zip(tensor_lists[0], tensor_lists[1]):
+        dst.copy_(src * scale)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
index a4358c3d7e..92366adc1f 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
@@ -5,7 +5,6 @@
 import torch
 import flag_gems
 
-from transformer_engine.plugin.core.backends.flagos.utils import gems_context
 
 def rmsnorm_fwd_fl(
     input,
@@ -17,23 +16,22 @@ def rmsnorm_fwd_fl(
     sm_margin,
     zero_centered_gamma,
 ):
-    with gems_context():
-        if zero_centered_gamma:
-            weight_adj = 1 + weight
-        else:
-            weight_adj = weight
+    if zero_centered_gamma:
+        weight_adj = 1 + weight
+    else:
+        weight_adj = weight
 
-        y, rstdevs = flag_gems.rms_norm_forward(
-            input,
-            [input.shape[-1]],
-            weight_adj,
-            eps,
-        )
+    y, rstdevs = flag_gems.rms_norm_forward(
+        input,
+        [input.shape[-1]],
+        weight_adj,
+        eps,
+    )
 
-        if rstdevs.shape != input.shape[:-1]:
-            rstdevs = rstdevs.view(input.shape[:-1])
+    if rstdevs.shape != input.shape[:-1]:
+        rstdevs = rstdevs.view(input.shape[:-1])
 
-        return y, None, rstdevs
+    return y, None, rstdevs
 
 
 def rmsnorm_bwd_fl(
@@ -45,20 +43,19 @@ def rmsnorm_bwd_fl(
     zero_centered_gamma,
     eps,
 ):
-    with gems_context():
-        # When zero_centered_gamma is True, forward uses (1 + gamma) as weight
-        # So backward needs to use (1 + gamma) for computing dx
-        if zero_centered_gamma:
-            gamma_adj = 1 + gamma
-        else:
-            gamma_adj = gamma
-
-        dx, dw = flag_gems.rms_norm_backward(
-            dy,
-            x,
-            rsigma,
-            [x.shape[-1]],
-            gamma_adj,
-            eps,
-        )
-        return dx, dw
+    # When zero_centered_gamma is True, forward uses (1 + gamma) as weight
+    # So backward needs to use (1 + gamma) for computing dx
+    if zero_centered_gamma:
+        gamma_adj = 1 + gamma
+    else:
+        gamma_adj = gamma
+
+    dx, dw = flag_gems.rms_norm_backward(
+        dy,
+        x,
+        rsigma,
+        [x.shape[-1]],
+        gamma_adj,
+        eps,
+    )
+    return dx, dw
diff --git a/transformer_engine/plugin/core/backends/flagos/utils.py b/transformer_engine/plugin/core/backends/flagos/utils.py
deleted file mode 100644
index cb0547c190..0000000000
--- a/transformer_engine/plugin/core/backends/flagos/utils.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import os
-from contextlib import nullcontext
-
-
-def gems_context():
-    # check if flagos should be enabled permanently via environment variable
-    flag_gems_global_registrar = None
-    try:
-        import flag_gems
-        flag_gems_global_registrar = getattr(flag_gems, 'current_work_registrar', None)
-    except Exception as e:
-        from ...logger_manager import get_logger
-        logger = get_logger()
-        logger.warning(f"Failed to get flag gems registrar: {e}")
-        
-    is_flag_gems_global_enabled = flag_gems_global_registrar is not None
-
-    # Check if flagos should be enabled permanently via environment variable
-    enable_flagos_permanently = os.getenv("TE_FL_ENABLE_FLAGOS_PERMANENTLY", "false").lower() in ("1", "true", "yes")
-    if enable_flagos_permanently and not is_flag_gems_global_enabled:
-        flag_gems.enable(record=True, once=True)
-        is_flag_gems_global_enabled = True
-    
-    # Use nullcontext if flag_gems is already globally enabled, otherwise use use_gems() context
-    context = nullcontext() if is_flag_gems_global_enabled else flag_gems.use_gems()
-    
-    return context
diff --git a/transformer_engine/plugin/core/manager.py b/transformer_engine/plugin/core/manager.py
index cd96b35bb0..3f6bbc1cff 100644
--- a/transformer_engine/plugin/core/manager.py
+++ b/transformer_engine/plugin/core/manager.py
@@ -17,6 +17,7 @@
 
 logger = get_logger()
 
+from .backend_switch import backend_context_switch
 
 @dataclass
 class _OpManagerState:
@@ -354,6 +355,9 @@ def call(self, op_name: str, *args, **kwargs):
             snap = self._registry.snapshot()
             for impl in snap.impls_by_op.get(op_name, []):
                 if impl.impl_id == impl_id:
+                    # control context switch for different backends for every op impl call
+                    backend_context_switch(impl.kind)
+
                     # Only log if first time or implementation actually changed
                     if last_impl_id is None:
                         logger.info_once(
@@ -378,6 +382,9 @@ def call(self, op_name: str, *args, **kwargs):
 
         for idx, impl in enumerate(candidates):
             try:
+                # control context switch for different backends for every op impl call
+                backend_context_switch(impl.kind)
+
                 result = impl.fn(*args, **kwargs)
 
                 # Log on success
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 50ed6d72a4..c1d067537f 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -13,6 +13,8 @@
 from .logger_manager import get_logger
 logger = get_logger()
 
+from .backend_switch import backend_context_switch
+
 class DType(IntEnum):
     kByte = 0
     kInt32 = 2
@@ -1174,6 +1176,9 @@ def forward(
 
                 for impl in snap.impls_by_op.get(layer_key, []):
                     if impl.impl_id == class_name_lower or class_name_lower.startswith(impl.impl_id):
+                        # control context switch for different backends for every op impl call
+                        backend_context_switch(impl.kind)
+
                         impl_id = impl.impl_id
                         break
 
@@ -1262,7 +1267,11 @@ def forward(
             try:
                 # Check if this impl creates our current class
                 impl_class = impl.fn()
+
                 if impl_class == current_class:
+                    # control context switch for different backends for every op impl call
+                    backend_context_switch(impl.kind)
+
                     current_impl_id = impl.impl_id
                     break
             except:
@@ -1320,6 +1329,10 @@ def forward(
             try:
                 # All attempts here are fallbacks (since we skipped current impl)
                 # Get fallback class and create instance
+
+                # control context switch for different backends for every op impl call
+                backend_context_switch(impl.kind)
+
                 fallback_class = impl.fn()
                 fallback_instance = fallback_class(**self._init_params)
                 # Set manager for nested fallback support

From f101d2c4053a71a88ad9959e22ae924c545827db Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Mon, 12 Jan 2026 17:17:10 +0800
Subject: [PATCH 24/59] fix: torch SDPA backend multi-batch support (#17)

## Summary
- Support combined qkv_layout formats like `sbhd_sbhd_sbhd` by
extracting the first part for layout conversion
- Distinguish between standard 4D tensor format (sbhd/bshd) and true
packed format (thd). For 4D tensors, directly convert layout like flagos
backend does, instead of incorrectly trying to unpack

## Problem
When using torch SDPA backend with `batch_size > 1`, the following error
occurs:
```
ValueError: Unexpected 4D tensor shape torch.Size([4096, 4, 16, 128]). Expected [total_tokens, 1, num_heads, head_dim]
```

The original code incorrectly tried to unpack 4D tensors when
`cu_seqlens` was provided, but 4D tensors in `sbhd`/`bshd` format should
be handled with simple layout conversion (like flagos backend does).

## Test plan
- [x] Tested with batch_size=4, verified no ValueError
- [x] Results match flagos backend output
---
 .../backends/reference/flash_attention.py     | 63 ++++++++++++++-----
 1 file changed, 46 insertions(+), 17 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/reference/flash_attention.py b/transformer_engine/plugin/core/backends/reference/flash_attention.py
index 833cde97d6..60e4bc1bb9 100644
--- a/transformer_engine/plugin/core/backends/reference/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/reference/flash_attention.py
@@ -42,12 +42,19 @@ def _convert_layout_to_bhsd(
         """Convert tensor from various layouts to [batch, heads, seq, dim] format."""
         layout = layout.lower()
 
+        # Handle combined layouts like "sbhd_sbhd_sbhd" - extract the first part
+        if "_" in layout:
+            layout = layout.split("_")[0]
+
         if layout in ("sbhd", "sbh3d", "sb3hd"):
             return tensor.permute(1, 2, 0, 3)
         elif layout in ("bshd", "bsh3d", "bs3hd"):
             return tensor.permute(0, 2, 1, 3)
-        elif layout == "bhsd":
+        elif layout in ("bhsd",):
             return tensor
+        elif layout in ("thd",):
+            # thd is packed format, should not reach here for 4D tensors
+            raise ValueError(f"thd layout requires 3D tensor, got {tensor.dim()}D")
         else:
             raise ValueError(f"Unsupported qkv_layout: {layout}")
 
@@ -59,12 +66,18 @@ def _convert_bhsd_to_layout(
         """Convert tensor from [batch, heads, seq, dim] back to original layout."""
         layout = layout.lower()
 
+        # Handle combined layouts like "sbhd_sbhd_sbhd" - extract the first part
+        if "_" in layout:
+            layout = layout.split("_")[0]
+
         if layout in ("sbhd", "sbh3d", "sb3hd"):
             return tensor.permute(2, 0, 1, 3)
         elif layout in ("bshd", "bsh3d", "bs3hd"):
             return tensor.permute(0, 2, 1, 3)
-        elif layout == "bhsd":
+        elif layout in ("bhsd",):
             return tensor
+        elif layout in ("thd",):
+            raise ValueError(f"thd layout requires 3D tensor, got {tensor.dim()}D")
         else:
             raise ValueError(f"Unsupported qkv_layout: {layout}")
 
@@ -209,27 +222,43 @@ def _forward_impl(
         if alibi_slopes is not None:
             raise NotImplementedError("ALiBi slopes are not supported in PyTorch SDPA backend")
 
-        use_packed_format = cu_seqlens_q is not None or cu_seqlens_kv is not None
-        padding_mask_q = None
-        padding_mask_kv = None
         query_original_shape = query_layer.shape
 
-        if use_packed_format:
-            if cu_seqlens_q is not None:
-                query, padding_mask_q = self._unpack_tensor(query_layer, cu_seqlens_q, max_seqlen_q)
-            else:
-                query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+        # Check if input is in standard 4D format - same as flagos backend
+        # If tensor is 4D, treat it as standard format and just do layout conversion
+        # Only use unpack logic for true packed format (3D tensors with thd layout)
+        is_standard_4d = query_layer.dim() == 4
 
-            if cu_seqlens_kv is not None:
-                key, padding_mask_kv = self._unpack_tensor(key_layer, cu_seqlens_kv, max_seqlen_kv)
-                value, _ = self._unpack_tensor(value_layer, cu_seqlens_kv, max_seqlen_kv)
-            else:
-                key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
-                value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
-        else:
+        if is_standard_4d:
+            # Standard 4D tensor format - just convert layout like flagos does
             query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
             key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
             value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
+            use_packed_format = False
+            padding_mask_q = None
+            padding_mask_kv = None
+        else:
+            # True packed format (thd layout, 3D tensor) - use unpack logic
+            use_packed_format = cu_seqlens_q is not None or cu_seqlens_kv is not None
+            padding_mask_q = None
+            padding_mask_kv = None
+
+            if use_packed_format:
+                if cu_seqlens_q is not None:
+                    query, padding_mask_q = self._unpack_tensor(query_layer, cu_seqlens_q, max_seqlen_q)
+                else:
+                    query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+
+                if cu_seqlens_kv is not None:
+                    key, padding_mask_kv = self._unpack_tensor(key_layer, cu_seqlens_kv, max_seqlen_kv)
+                    value, _ = self._unpack_tensor(value_layer, cu_seqlens_kv, max_seqlen_kv)
+                else:
+                    key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
+                    value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
+            else:
+                query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+                key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
+                value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
 
         batch_size, num_heads_q, seq_len_q, head_dim = query.shape
         num_heads_kv = key.shape[1]

From 832a7976b273a35e0db6e3f75fae8e4c5fab5bec Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Tue, 13 Jan 2026 20:42:28 +0800
Subject: [PATCH 25/59] Remove use_gems context and call flag_gems.xxx directly
  (#22)

- Remove the flag_gems.use_gems() context to avoid context-switching
overhead
- Call flag_gems.xxx directly wherever possible.
---
 .../plugin/core/backend_switch.py             | 34 -------------------
 .../core/backends/flagos/impl/fused_adam.py   | 21 ++++++------
 .../plugin/core/backends/flagos/impl/gemm.py  |  5 +--
 .../core/backends/flagos/impl/multi_tensor.py |  6 ++--
 .../core/backends/flagos/impl/rmsnorm.py      |  5 +--
 transformer_engine/plugin/core/manager.py     |  7 ----
 transformer_engine/plugin/core/ops.py         | 13 -------
 7 files changed, 20 insertions(+), 71 deletions(-)
 delete mode 100644 transformer_engine/plugin/core/backend_switch.py

diff --git a/transformer_engine/plugin/core/backend_switch.py b/transformer_engine/plugin/core/backend_switch.py
deleted file mode 100644
index 3ed9c5cae1..0000000000
--- a/transformer_engine/plugin/core/backend_switch.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import flag_gems
-from .types import BackendImplKind
-
-_flag_gems_context = None
-_flag_gems_context_entered = False
-
-def backend_context_switch(cur_backend):
-    """
-    Switch backend context based on the current backend.
-    """
-    global _flag_gems_context, _flag_gems_context_entered
-    assert cur_backend is not None, "Current Backend name cannot be None"
-    
-    if cur_backend == BackendImplKind.VENDOR:
-        return
-
-    # check if flagos should be enabled permanently via environment variable
-    flag_gems_global_registrar = getattr(flag_gems, 'current_work_registrar', None)
-    is_flag_gems_enabled = flag_gems_global_registrar is not None
-
-    # if flagos is enabled permanently, and flagos context is not entered, skip entering flagos context
-    if is_flag_gems_enabled and not _flag_gems_context_entered:
-        return
-
-    if cur_backend == BackendImplKind.DEFAULT and not _flag_gems_context_entered:
-        _flag_gems_context = flag_gems.use_gems()
-        _flag_gems_context.__enter__()
-        _flag_gems_context_entered = True
-        return
-
-    if cur_backend == BackendImplKind.REFERENCE and _flag_gems_context_entered:
-        _flag_gems_context.__exit__(None, None, None)
-        _flag_gems_context_entered = False
-        return
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
index bd63f75e67..bd4b916010 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -6,6 +6,7 @@
 import torch
 import flag_gems
 
+
 def multi_tensor_adam_fl(
     chunk_size: int,
     noop_flag: torch.Tensor,
@@ -50,27 +51,27 @@ def multi_tensor_adam_fl(
             g = g.contiguous()
 
         if inv_scale is not None and inv_scale != 1.0:
-            g = g * inv_scale
+            g = flag_gems.mul(g, inv_scale)
 
-        m.mul_(beta1).add_(g, alpha=1 - beta1)
-        v.mul_(beta2).add_(g.mul(g).mul_(1 - beta2))
+        m = flag_gems.add_(flag_gems.mul_(m, beta1), g, alpha=1-beta1)
+        v = flag_gems.add_(flag_gems.mul_(v, beta2), flag_gems.mul_(flag_gems.mul_(g, g), 1 - beta2))
 
         m_corr = m.clone()
         v_corr = v.clone()
         if bias_correction == 1:
-            m_corr = m_corr / bias_correction1
-            v_corr = v_corr / bias_correction2
+            m_corr = flag_gems.true_divide(m_corr, bias_correction1)
+            v_corr = flag_gems.true_divide(v_corr, bias_correction2)
 
-        update = m_corr / (v_corr.sqrt() + eps)
+        update = flag_gems.true_divide(m_corr, flag_gems.add(flag_gems.sqrt(v_corr), eps))
 
         if is_adamw:
-            p.data.mul_(1 - lr * weight_decay)
+            p = flag_gems.mul_(p, 1 - lr * weight_decay)
         else:
-            update.add_(p, alpha=weight_decay)
+            update = flag_gems.add_(update, p, alpha=weight_decay)
 
-        p.data.add_(update, alpha=-lr)
+        p = flag_gems.add_(p, update, alpha=-lr)
 
         if p_master is not None:
-            p_master.data.copy_(p.data)
+            flag_gems.copy_(p_master, p)
             out_dtype = p_master.dtype if out_dtype is None else out_dtype
             p.data = p.data.to(out_dtype)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
index 4d22b88d68..709c107a57 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
@@ -7,6 +7,7 @@
 
 import flag_gems
 
+
 __all__ = [
     "generic_gemm_fl",
 ]
@@ -105,9 +106,9 @@ def generic_gemm_fl(
 
     if D is not None:
         if accumulate:
-            D.add_(out1)
+            flag_gems.add_(D, out1)
         else:
-            D.copy_(out1)
+            flag_gems.copy_(D, out1)
         return D, bias_grad, gelu_input, extra_output_ret
     else:
         return out1, bias_grad, gelu_input, extra_output_ret
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
index 5a81b02dd2..4421487ff1 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
@@ -15,12 +15,12 @@ def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *ar
         norms = [torch.norm(t.float(), p=2) for t in tensors]
         return norms, None
     else:
-        total_norm_sq = sum(torch.sum(t.float() ** 2) for t in tensors)
-        total_norm = torch.sqrt(total_norm_sq)
+        total_norm_sq = sum(flag_gems.sum(flag_gems.pow_func(t.float(), 2)) for t in tensors)
+        total_norm = flag_gems.sqrt(total_norm_sq)
         return total_norm, None
 
 
 def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
 
     for src, dst in zip(tensor_lists[0], tensor_lists[1]):
-        dst.copy_(src * scale)
+        flag_gems.copy_(dst, src * scale)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
index 92366adc1f..ffa382147f 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
@@ -17,7 +17,8 @@ def rmsnorm_fwd_fl(
     zero_centered_gamma,
 ):
     if zero_centered_gamma:
-        weight_adj = 1 + weight
+        # weight_adj = 1 + weight
+        weight_adj = flag_gems.add(1, weight)
     else:
         weight_adj = weight
 
@@ -46,7 +47,7 @@ def rmsnorm_bwd_fl(
     # When zero_centered_gamma is True, forward uses (1 + gamma) as weight
     # So backward needs to use (1 + gamma) for computing dx
     if zero_centered_gamma:
-        gamma_adj = 1 + gamma
+        gamma_adj = flag_gems.add(1, gamma)
     else:
         gamma_adj = gamma
 
diff --git a/transformer_engine/plugin/core/manager.py b/transformer_engine/plugin/core/manager.py
index 3f6bbc1cff..cd96b35bb0 100644
--- a/transformer_engine/plugin/core/manager.py
+++ b/transformer_engine/plugin/core/manager.py
@@ -17,7 +17,6 @@
 
 logger = get_logger()
 
-from .backend_switch import backend_context_switch
 
 @dataclass
 class _OpManagerState:
@@ -355,9 +354,6 @@ def call(self, op_name: str, *args, **kwargs):
             snap = self._registry.snapshot()
             for impl in snap.impls_by_op.get(op_name, []):
                 if impl.impl_id == impl_id:
-                    # control context switch for different backends for every op impl call
-                    backend_context_switch(impl.kind)
-
                     # Only log if first time or implementation actually changed
                     if last_impl_id is None:
                         logger.info_once(
@@ -382,9 +378,6 @@ def call(self, op_name: str, *args, **kwargs):
 
         for idx, impl in enumerate(candidates):
             try:
-                # control context switch for different backends for every op impl call
-                backend_context_switch(impl.kind)
-
                 result = impl.fn(*args, **kwargs)
 
                 # Log on success
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index c1d067537f..50ed6d72a4 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -13,8 +13,6 @@
 from .logger_manager import get_logger
 logger = get_logger()
 
-from .backend_switch import backend_context_switch
-
 class DType(IntEnum):
     kByte = 0
     kInt32 = 2
@@ -1176,9 +1174,6 @@ def forward(
 
                 for impl in snap.impls_by_op.get(layer_key, []):
                     if impl.impl_id == class_name_lower or class_name_lower.startswith(impl.impl_id):
-                        # control context switch for different backends for every op impl call
-                        backend_context_switch(impl.kind)
-
                         impl_id = impl.impl_id
                         break
 
@@ -1267,11 +1262,7 @@ def forward(
             try:
                 # Check if this impl creates our current class
                 impl_class = impl.fn()
-
                 if impl_class == current_class:
-                    # control context switch for different backends for every op impl call
-                    backend_context_switch(impl.kind)
-
                     current_impl_id = impl.impl_id
                     break
             except:
@@ -1329,10 +1320,6 @@ def forward(
             try:
                 # All attempts here are fallbacks (since we skipped current impl)
                 # Get fallback class and create instance
-
-                # control context switch for different backends for every op impl call
-                backend_context_switch(impl.kind)
-
                 fallback_class = impl.fn()
                 fallback_instance = fallback_class(**self._init_params)
                 # Set manager for nested fallback support

From 08cabba6e28cd77f469a946b36074a1c3960c82f Mon Sep 17 00:00:00 2001
From: dinghaodhd <986165956@qq.com>
Date: Fri, 16 Jan 2026 10:59:45 +0800
Subject: [PATCH 26/59] Add new vendor backend METAX (#21)

# Description

Add the new vendor backend METAX

## Type of change

- [ ] New feature (non-breaking change which adds functionality)

## Changes

Please list the changes introduced in this PR:

-  Add metax ops register
-  Add metax backend implementation
-  Register metax ops in builtin_ops.py

## Requirements

- The module transformer_engine_torch_metax is needed, to use this
module, need to install package transformer_engine_metax

# Checklist:

- [x] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [x] The functionality is complete
- [x] I have commented my code, particularly in hard-to-understand areas
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
---
 .../core/backends/vendor/metax/__init__.py    |    7 +
 .../backends/vendor/metax/flash_attention.py  |  127 ++
 .../core/backends/vendor/metax/metax.py       | 1060 +++++++++++++++++
 .../backends/vendor/metax/register_ops.py     |  202 ++++
 transformer_engine/plugin/core/builtin_ops.py |    9 +
 5 files changed, 1405 insertions(+)
 create mode 100644 transformer_engine/plugin/core/backends/vendor/metax/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/metax/metax.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/metax/register_ops.py

diff --git a/transformer_engine/plugin/core/backends/vendor/metax/__init__.py b/transformer_engine/plugin/core/backends/vendor/metax/__init__.py
new file mode 100644
index 0000000000..f4e55f62e0
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/metax/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .metax import MetaxBackend
+
+__all__ = ["MetaxBackend"]
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
new file mode 100644
index 0000000000..14044cef6a
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from contextlib import nullcontext
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from transformer_engine.plugin.core.ops import FlashAttentionBase
+
+
+class FlashAttentionMETAX(FlashAttentionBase):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
+
+        # Store initialization parameters for lazy loading
+        self._init_params = {
+            'softmax_scale': softmax_scale,
+            'attention_dropout': attention_dropout,
+            'attention_dropout_ctx': attention_dropout_ctx or nullcontext,
+            'attention_type': attention_type,
+            'layer_number': layer_number,
+            'deterministic': deterministic,
+        }
+        self._metax_flash_attn = None
+
+    def _ensure_metax_flash_attn(self):
+        """Lazy initialization of metax FlashAttention."""
+        if self._metax_flash_attn is not None:
+            return
+
+        try:
+            # Import here to avoid circular dependency issues
+            # transformer_engine_torch must be registered before this import
+            from transformer_engine_metax.pytorch.attention.dot_product_attention.backends import (
+                FlashAttention as FlashAttentionMetax,
+            )
+
+            if FlashAttentionMetax is None:
+                raise RuntimeError("FlashAttention class is None - flash-attn may not be installed correctly")
+
+            self._metax_flash_attn = FlashAttentionMetax(**self._init_params)
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"Failed to import metax FlashAttention: {e}. "
+                "Please ensure flash-attn is installed and transformer_engine_torch is available."
+            )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to initialize metax FlashAttention: {e}. "
+                f"Init params: {self._init_params}"
+            )
+
+    @property
+    def backend_name(self) -> str:
+        return "metax"
+
+    def _forward_impl(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        # Ensure metax flash attention is initialized
+        self._ensure_metax_flash_attn()
+
+        return self._metax_flash_attn(
+            query_layer=query_layer,
+            key_layer=key_layer,
+            value_layer=value_layer,
+            attention_mask=attention_mask,
+            qkv_layout=qkv_layout,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_kv=cu_seqlens_kv,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_kv=max_seqlen_kv,
+            attn_mask_type=attn_mask_type,
+            window_size=window_size,
+            alibi_slopes=alibi_slopes,
+            cp_group=cp_group,
+            cp_global_ranks=cp_global_ranks,
+            cp_stream=cp_stream,
+            cp_comm_type=cp_comm_type,
+            fp8=fp8,
+            fp8_meta=fp8_meta,
+            quantizers=quantizers,
+            inference_params=inference_params,
+            flash_attention_backend=flash_attention_backend,
+            fp8_output=fp8_output,
+        )
+
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
new file mode 100644
index 0000000000..0baea24a2e
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -0,0 +1,1060 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import ctypes
+from pathlib import Path
+import importlib.util
+import platform
+import os
+import functools
+import inspect
+
+import torch
+
+from ....ops import TEFLBackendBase, FP8TensorMeta
+
+def _load_metax_libs():
+
+    def get_ext():
+        system = platform.system()
+        return ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
+
+    ext = get_ext()
+
+    try:
+        import transformer_engine_metax
+        te_path = Path(importlib.util.find_spec("transformer_engine_metax").origin).parent.parent
+        for search_dir in [te_path, te_path / "transformer_engine_metax"]:
+            if search_dir.exists():
+                matches = list(search_dir.glob(f"libtransformer_engine{ext}*"))
+                if matches:
+                    ctypes.CDLL(str(matches[0]), mode=ctypes.RTLD_GLOBAL)
+                    return True
+        return False
+    except Exception as e:
+        print(f"[Metax] Failed to load Metax libs: {e}")
+        return False
+
+_metax_libs_loaded = False
+
+def _ensure_metax_libs():
+    global _metax_libs_loaded
+    if not _metax_libs_loaded:
+        _metax_libs_loaded = _load_metax_libs()
+    return _metax_libs_loaded
+
+def _check_metax_available() -> bool:
+    if not torch.cuda.is_available():
+        return False
+
+    try:
+        from ...._build_config import SKIP_METAX_BUILD
+        if SKIP_METAX_BUILD:
+            print("[Metax] Disabled: Metax was skipped at build time")
+            return False
+    except ImportError:
+        if bool(int(os.environ.get("TE_FL_SKIP_METAX", "0"))):
+            print("[Metax] Disabled: TE_FL_SKIP_METAX=1")
+            return False
+
+    try:
+        if not _ensure_metax_libs():
+            return False
+        import transformer_engine_torch_metax
+        return True
+    except (ImportError, OSError) as e:
+        print(f"[Metax] Import failed: {e}")
+        return False
+
+def _get_tex():
+    _ensure_metax_libs()
+    import transformer_engine_torch_metax
+    return transformer_engine_torch_metax
+
+def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
+    if torch_dtype is None:
+        return None
+
+    NativeDType = tex_module.DType
+    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_torch_metax':
+        return torch_dtype
+
+    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
+        from transformer_engine.plugin.core.ops import DType as PyDType
+        if isinstance(torch_dtype, PyDType):
+            dtype_name = torch_dtype.name
+            if hasattr(NativeDType, dtype_name):
+                return getattr(NativeDType, dtype_name)
+
+    dtype_map = {
+        torch.float32: NativeDType.kFloat32,
+        torch.float16: NativeDType.kFloat16,
+        torch.bfloat16: NativeDType.kBFloat16,
+        torch.int32: NativeDType.kInt32,
+        torch.uint8: NativeDType.kByte,
+    }
+
+    if hasattr(torch, 'float8_e4m3fn'):
+        dtype_map[torch.float8_e4m3fn] = NativeDType.kFloat8E4M3
+    if hasattr(torch, 'float8_e5m2'):
+        dtype_map[torch.float8_e5m2] = NativeDType.kFloat8E5M2
+
+    return dtype_map.get(torch_dtype, torch_dtype)
+
+def _convert_dtype_params(func):
+
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        dtype_params = ['otype', 'output_dtype', 'bias_type']
+
+        from transformer_engine.plugin.core.ops import DType as PyDType
+
+        def needs_conversion(val):
+            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
+
+        for param_name in dtype_params:
+            if param_name in kwargs:
+                value = kwargs[param_name]
+                if needs_conversion(value):
+                    converted = self._to_te_dtype(value)
+                    kwargs[param_name] = converted
+
+        sig = inspect.signature(func)
+        param_names = list(sig.parameters.keys())[1:]
+
+        args_list = list(args)
+        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
+            if param_name in dtype_params and needs_conversion(arg_value):
+                converted = self._to_te_dtype(arg_value)
+                args_list[i] = converted
+
+        return func(self, *args_list, **kwargs)
+
+    return wrapper
+
+class MetaxBackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return _check_metax_available()
+
+    def __init__(self):
+        self._tex = None
+
+    def _get_tex(self):
+        if self._tex is None:
+            self._tex = _get_tex()
+        return self._tex
+
+    def _to_te_dtype(self, torch_dtype):
+        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
+
+    def is_available(self) -> bool:
+        return _check_metax_available()
+
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionMETAX
+        return FlashAttentionMETAX
+
+    def quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        output: Optional[torch.Tensor] = None,
+        noop: Optional[torch.Tensor] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.quantize(tensor, quantizer, output, noop)
+
+    @_convert_dtype_params
+    def dequantize(
+        self,
+        input: torch.Tensor,
+        otype: torch.dtype,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dequantize(input, otype)
+
+    def bgrad_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.bgrad_quantize(input, quantizer)
+
+    @_convert_dtype_params
+    def generic_gemm(
+        self,
+        A: torch.Tensor,
+        transA: bool,
+        B: torch.Tensor,
+        transB: bool,
+        D: torch.Tensor,
+        quantizer: Any,
+        output_dtype: torch.dtype,
+        bias: Optional[torch.Tensor],
+        bias_type: Any,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[Any] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> Any:
+        tex = self._get_tex()
+
+        if bias_type is None:
+            bias_type = self._to_te_dtype(torch.bfloat16)
+
+        return tex.generic_gemm(
+            A, transA, B, transB, D, quantizer, output_dtype,
+            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
+            accumulate, use_split_accumulator, comm_overlap, comm_type,
+            extra_output, bulk_overlap, alpha, beta
+        )
+
+    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm(*args, **kwargs)
+
+    def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.gelu(input, quantizer)
+
+    def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.geglu(input, quantizer)
+    def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgelu(input, quantizer)
+
+    def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgeglu(input, quantizer)
+    def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.relu(input, quantizer)
+
+    def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.reglu(input, quantizer)
+    def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.srelu(input, quantizer)
+
+    def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.sreglu(input, quantizer)
+
+    def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.silu(input, quantizer)
+
+    def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.swiglu(input, quantizer)
+    def clamped_swiglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
+    def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgelu(grad, fwd_input, quantizer)
+    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgeglu(grad, fwd_input, quantizer)
+
+    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgelu(grad, fwd_input, quantizer)
+    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgeglu(grad, fwd_input, quantizer)
+
+    def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.drelu(grad, fwd_input, quantizer)
+    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dreglu(grad, fwd_input, quantizer)
+
+    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsrelu(grad, fwd_input, quantizer)
+    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsreglu(grad, fwd_input, quantizer)
+
+    def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsilu(grad, fwd_input, quantizer)
+    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dswiglu(grad, fwd_input, quantizer)
+
+    def clamped_dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_drelu(grad, fwd_input, quantizer)
+
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dqgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsrelu(grad, fwd_input, quantizer)
+
+    @_convert_dtype_params
+    def layernorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, mu, rsigma = tex.layernorm_fwd(
+            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+        return y, mu, rsigma
+
+    def layernorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dgamma, dbeta
+
+    @_convert_dtype_params
+    def rmsnorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, y_quant, rsigma = tex.rmsnorm_fwd(
+            input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+            if y_quant is not None:
+                y_quant = y_quant.view(*orig_shape)
+        return y, y_quant, rsigma
+
+    def rmsnorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+        eps: float = 1e-5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dw
+
+    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.rmsnorm_bwd_add(*args, **kwargs)
+
+    def multi_tensor_quantize(
+        self,
+        tensor_list: List[torch.Tensor],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
+    def split_quantize(
+        self,
+        tensor: torch.Tensor,
+        split_sections: List[int],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.split_quantize(tensor, split_sections, quantizer_list)
+
+    def moe_permute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_permute_fwd(*args, **kwargs)
+
+    def moe_permute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_permute_bwd(*args, **kwargs)
+
+    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_unpermute_fwd(*args, **kwargs)
+
+    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.moe_unpermute_bwd(*args, **kwargs)
+
+    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+
+    def scaled_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale)
+
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
+
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
+
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        tex = self._get_tex()
+
+        args_list = list(args)
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+
+            if type(py_enum).__module__ == 'transformer_engine_torch_metax':
+                return py_enum
+
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+
+            if hasattr(py_enum, 'value'):
+                enum_value = int(py_enum.value)
+                for member_name in dir(native_enum_class):
+                    if not member_name.startswith('_'):
+                        try:
+                            member = getattr(native_enum_class, member_name)
+                            if hasattr(member, 'value') and int(member.value) == enum_value:
+                                return member
+                        except:
+                            pass
+
+            if hasattr(py_enum, 'value'):
+                return int(py_enum.value)
+
+            return py_enum
+
+        if len(args) > 1:
+            args_list[1] = self._to_te_dtype(args[1])
+        if len(args) > 2:
+            args_list[2] = self._to_te_dtype(args[2])
+        if len(args) > 3:
+            args_list[3] = convert_enum(args[3], tex.NVTE_QKV_Layout)
+        if len(args) > 4:
+            args_list[4] = convert_enum(args[4], tex.NVTE_Bias_Type)
+        if len(args) > 5:
+            args_list[5] = convert_enum(args[5], tex.NVTE_Mask_Type)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_Softmax_Type)
+
+        return tex.get_fused_attn_backend(*args_list, **kwargs)
+
+    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+            if type(py_enum).__module__ == 'transformer_engine_torch_metax':
+                return py_enum
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+            return py_enum
+
+        args_list = list(args)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_QKV_Layout)
+        if len(args) > 7:
+            args_list[7] = convert_enum(args[7], tex.NVTE_Bias_Type)
+        if len(args) > 8:
+            args_list[8] = convert_enum(args[8], tex.NVTE_Mask_Type)
+        if len(args) > 9:
+            args_list[9] = convert_enum(args[9], tex.NVTE_Softmax_Type)
+
+        return tex.fused_attn_fwd(*args_list, **kwargs)
+
+    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+            if type(py_enum).__module__ == 'transformer_engine_torch_metax':
+                return py_enum
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+            return py_enum
+
+        args_list = list(args)
+        if len(args) > 5:
+            args_list[5] = convert_enum(args[5], tex.NVTE_QKV_Layout)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_Bias_Type)
+        if len(args) > 7:
+            args_list[7] = convert_enum(args[7], tex.NVTE_Mask_Type)
+        if len(args) > 8:
+            args_list[8] = convert_enum(args[8], tex.NVTE_Softmax_Type)
+        if len(args) > 19:
+            args_list[19] = self._to_te_dtype(args[19])
+
+        if 'dqkv_dtype' in kwargs:
+            kwargs['dqkv_dtype'] = self._to_te_dtype(kwargs['dqkv_dtype'])
+
+        return tex.fused_attn_bwd(*args_list, **kwargs)
+
+    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_fwd(*args, **kwargs)
+
+    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_bwd(*args, **kwargs)
+
+    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.copy_to_kv_cache(*args, **kwargs)
+
+    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_thd_to_bshd(*args, **kwargs)
+
+    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_bshd_to_thd(*args, **kwargs)
+
+    def fused_rope_forward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_rope_forward(*args, **kwargs)
+
+    def fused_rope_backward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_rope_backward(*args, **kwargs)
+
+    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_forward(*args, **kwargs)
+
+    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_backward(*args, **kwargs)
+
+    def fused_topk_with_score_function_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        num_groups: int,
+        group_topk: int,
+        scaling_factor: float,
+        score_function: Any,
+        expert_bias: Optional[torch.Tensor],
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_fwd(
+            logits, topk, use_pre_softmax, num_groups, group_topk,
+            scaling_factor, score_function, expert_bias
+        )
+
+    def fused_topk_with_score_function_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        routing_map: torch.Tensor,
+        intermediate_output: torch.Tensor,
+        grad_probs: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        scaling_factor: float,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_bwd(
+            num_tokens, num_experts, routing_map, intermediate_output,
+            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+        )
+
+    def fused_score_for_moe_aux_loss_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
+
+    def fused_score_for_moe_aux_loss_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        intermediate_output: torch.Tensor,
+        grad_scores: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_bwd(
+            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+        )
+
+    def fused_moe_aux_loss_fwd(
+        self,
+        probs: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        total_num_tokens: int,
+        num_experts: int,
+        num_rows: int,
+        num_cols: int,
+        topk: int,
+        coeff: float,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_fwd(
+            probs, tokens_per_expert, total_num_tokens, num_experts,
+            num_rows, num_cols, topk, coeff
+        )
+
+    def fused_moe_aux_loss_bwd(
+        self,
+        Const_buf: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        num_rows: int,
+        num_cols: int,
+        grad_aux_loss: torch.Tensor,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
+
+    def dropout_fwd(
+        self,
+        input: torch.Tensor,
+        dropout_probability: float,
+        out: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.dropout_fwd(input, dropout_probability, out)
+
+    def dropout_bwd(
+        self,
+        grad_output: torch.Tensor,
+        mask: torch.Tensor,
+        dropout_probability: float,
+        grad_input: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
+
+    def fp8_transpose(
+        self,
+        input: torch.Tensor,
+        dtype: Any,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_transpose(input, dtype, out=out)
+
+    def swap_first_dims(
+        self,
+        tensor: torch.Tensor,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.swap_first_dims(tensor, out=out)
+
+    def compute_amax(
+        self,
+        input: torch.Tensor,
+        amax: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.compute_amax(input, amax)
+
+    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
+
+    def fp8_block_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
+
+    def fp8_block_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: Any,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
+
+    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_padding(*args, **kwargs)
+
+    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(*args, **kwargs)
+
+    def get_cublasLt_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cublasLt_version()
+
+    def get_cudnn_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cudnn_version()
+
+    def get_num_cublas_streams(self) -> int:
+        tex = self._get_tex()
+        return tex.get_num_cublas_streams()
+
+    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_half_tensor(*args, **kwargs)
+
+    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_second_half_lse_correction(*args, **kwargs)
+
+    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_second_half_lse(*args, **kwargs)
+
+    def thd_out_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_out_correction(*args, **kwargs)
+
+    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_grad_correction(*args, **kwargs)
+
+    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_get_partitioned_indices(*args, **kwargs)
+
+    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.init_nvshmem_backend(*args, **kwargs)
+
+    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.create_nvshmem_tensor(*args, **kwargs)
+
+    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_send_on_current_stream(*args, **kwargs)
+
+    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_wait_on_current_stream(*args, **kwargs)
+
+    def nvshmem_finalize(self) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_finalize()
+
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
+    def multi_tensor_unscale_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
+
+    def multi_tensor_adam(
+        self,
+        chunk_size: int = None,
+        noop_flag: torch.Tensor = None,
+        tensor_lists: List[List[torch.Tensor]] = None,
+        lr: float = None,
+        beta1: float = None,
+        beta2: float = None,
+        eps: float = None,
+        step: int = None,
+        mode: int = None,
+        bias_correction: int = None,
+        weight_decay: float = None,
+    ):
+        tex = self._get_tex()
+        if chunk_size is None:
+            return tex.multi_tensor_adam
+        tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
+            eps, step, mode, bias_correction, weight_decay
+        )
+
+    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
+
+    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_fp8(*args, **kwargs)
+
+    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable(*args, **kwargs)
+
+    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
+
+    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_sgd(*args, **kwargs)
+
+    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+
+    def bulk_overlap_ag_with_external_gemm(
+        self,
+        allgather_communicator: Any,
+        send_stream: Any,
+        recv_stream: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        tex = self._get_tex()
+        return tex.FP8TensorMeta()
+
+    def create_comm_overlap_helper(
+        self,
+        world_group: Optional[Any] = None,
+        intra_node_group: Optional[Any] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        if world_group is None:
+            return tex.CommOverlapHelper()
+        return tex.CommOverlapHelper(world_group, intra_node_group)
+
+    def create_comm_overlap(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        num_splits: int = 3,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 2,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 16,
+        set_sm_margin: bool = True,
+        atomic_gemm: bool = False,
+        rs_overlap_first_gemm: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlap(
+            buffer_shape, buffer_dtype, helper, tp_size,
+            num_splits, num_max_streams, comm_cga_size,
+            gemm_priority, comm_priority, num_comm_sm,
+            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+        )
+
+    def create_comm_overlap_p2p(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        comm_type: Any,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 1,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 1,
+        set_sm_margin: bool = False,
+        atomic_gemm: bool = False,
+        use_ce: bool = True,
+        aggregate: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlapP2P(
+            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
+            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
+            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+        )
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
new file mode 100644
index 0000000000..10ccc83c99
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Metax vendor backend operator registrations.
+
+This module registers all VENDOR (Metax) implementations from transformer_engine_torch.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from ....types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all Metax (VENDOR) operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    # Import Metax backend to get all the wrapped tex functions
+    from .metax import MetaxBackend
+
+    # Create a backend instance to access the methods
+    backend = MetaxBackend()
+
+    # Check if Metax is available before registering
+    if not backend.is_available():
+        return
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        # Normalization
+        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="layernorm_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="layernorm_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="METAX", priority=100),
+
+        # GEMM
+        OpImpl(op_name="generic_gemm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="METAX", priority=100),
+
+        # Quantization
+        OpImpl(op_name="quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dequantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="bgrad_quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="split_quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="METAX", priority=100),
+
+        # Activations - Forward
+        OpImpl(op_name="gelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="geglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="qgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="qgeglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="relu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="reglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="srelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="sreglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="silu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="swiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="clamped_swiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="METAX", priority=100),
+
+        # Activations - Backward
+        OpImpl(op_name="dgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dgeglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dqgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dqgeglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="drelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dreglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dsrelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dsreglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dsilu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dswiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="METAX", priority=100),
+
+        # Activations - Bias + Backward
+        OpImpl(op_name="dbias_dgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dbias_dsilu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dbias_drelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="METAX", priority=100),
+
+        # Softmax
+        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="METAX", priority=100),
+
+        # MOE operations
+        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="METAX", priority=100),
+
+        # Fused attention
+        OpImpl(op_name="get_fused_attn_backend", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_attn_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_attn_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="METAX", priority=100),
+
+        # KV cache
+        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="METAX", priority=100),
+
+        # Tensor format conversions
+        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="METAX", priority=100),
+
+        # RoPE (Rotary Position Embedding)
+        OpImpl(op_name="fused_rope_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_rope_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="METAX", priority=100),
+
+        # TopK and MOE aux loss
+        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="METAX", priority=100),
+
+        # Dropout
+        OpImpl(op_name="dropout_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="dropout_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="METAX", priority=100),
+
+        # FP8 operations
+        OpImpl(op_name="fp8_transpose", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="swap_first_dims", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="compute_amax", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="METAX", priority=100),
+
+        # Padding operations
+        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="METAX", priority=100),
+
+        # Library version getters
+        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="get_cudnn_version", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="METAX", priority=100),
+
+        # THD (Tensor, Hidden, Dimension) operations
+        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="thd_out_correction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="thd_grad_correction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="METAX", priority=100),
+
+        # NVSHMEM operations
+        OpImpl(op_name="init_nvshmem_backend", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="create_nvshmem_tensor", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="nvshmem_finalize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor="METAX", priority=100),
+
+        # Multi-tensor operations
+        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="METAX", priority=100),
+
+        # Communication overlap operations
+        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="create_comm_overlap", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="METAX", priority=100),
+        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="METAX", priority=100),
+
+        # FlashAttention class getter
+        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="METAX", priority=100),
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/builtin_ops.py b/transformer_engine/plugin/core/builtin_ops.py
index a79ca3016a..7270173f4b 100644
--- a/transformer_engine/plugin/core/builtin_ops.py
+++ b/transformer_engine/plugin/core/builtin_ops.py
@@ -55,3 +55,12 @@ def register_builtins(registry: OpRegistry) -> None:
     except Exception as e:
         # HYGON may not be available, this is expected
         pass
+
+    # Register Metax (VENDOR) implementations
+    try:
+        from .backends.vendor.metax.register_ops import register_builtins as register_metax
+        register_metax(registry)
+    except Exception as e:
+        # Metax may not be available, this is expected
+        pass
+

From 03d199828356797255d13076cc904db50e74f18a Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Wed, 21 Jan 2026 16:26:10 +0800
Subject: [PATCH 27/59] Add multi_tensor_adam_param_remainder and context
 parallel support (#23)

## Summary
- flagos: Add multi_tensor_adam_param_remainder implementation
- reference: Add multi_tensor_adam_param_remainder implementation
- reference: Add context parallel support for Flash Attention
- manager: Add cache mechanism with _impl_cache and _impl_cache_meta for
conditional op selection

## Changes
### flagos backend
- Implemented multi_tensor_adam_param_remainder operation for handling
parameter remainders in multi-tensor Adam optimizer

### reference backend
- Implemented multi_tensor_adam_param_remainder operation
- Added context parallel support for Flash Attention implementation

### Core manager
- Added cache mechanism using _impl_cache and _impl_cache_meta
- Improved op selection with conditional caching based on policy
fingerprint and epoch

---------

Signed-off-by: wenone766 <wenone766@wenobug.com>
Co-authored-by: wenone766 <wenone766@wenobug.com>
---
 .../plugin/core/backends/fa_utils.py          | 184 ++++++++++++++
 .../dot_product_attention/backends.py         |   2 +-
 .../plugin/core/backends/flagos/flagos.py     |  23 ++
 .../core/backends/flagos/impl/fused_adam.py   | 113 +++++++++
 .../core/backends/flagos/register_ops.py      |   1 +
 .../backends/reference/flash_attention.py     |  79 ++++--
 .../core/backends/reference/impl/__init__.py  |   2 +
 .../core/backends/reference/impl/optimizer.py | 110 +++++++++
 .../core/backends/reference/reference.py      |   9 +-
 .../backends/vendor/hygon/flash_attention.py  | 125 ++++++++++
 .../core/backends/vendor/hygon/hygon.py       |  40 ++-
 .../backends/vendor/hygon/register_ops.py     |   6 +
 transformer_engine/plugin/core/manager.py     | 226 +++++++++++++----
 transformer_engine/plugin/core/ops.py         | 232 +++---------------
 .../dot_product_attention/context_parallel.py |   3 +
 15 files changed, 884 insertions(+), 271 deletions(-)
 create mode 100644 transformer_engine/plugin/core/backends/fa_utils.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py

diff --git a/transformer_engine/plugin/core/backends/fa_utils.py b/transformer_engine/plugin/core/backends/fa_utils.py
new file mode 100644
index 0000000000..1107de757a
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/fa_utils.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Common utilities for Flash Attention backends with Context Parallelism support."""
+
+from typing import Any, Tuple
+
+import torch
+import torch.distributed as dist
+
+
+class AllGatherFunc(torch.autograd.Function):
+    """Autograd function for all-gather along sequence dimension with proper backward."""
+
+    @staticmethod
+    def forward(ctx, input_tensor: torch.Tensor, cp_group: Any, seq_dim: int) -> torch.Tensor:
+        world_size = dist.get_world_size(cp_group)
+        gathered_list = [torch.empty_like(input_tensor) for _ in range(world_size)]
+        dist.all_gather(gathered_list, input_tensor, group=cp_group)
+        ctx.cp_group = cp_group
+        ctx.world_size = world_size
+        ctx.seq_dim = seq_dim
+        return torch.cat(gathered_list, dim=seq_dim)
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
+        # Split the gradient and reduce_scatter
+        grad_chunks = torch.chunk(grad_output, ctx.world_size, dim=ctx.seq_dim)
+        local_grad = torch.zeros_like(grad_chunks[0])
+        grad_list = [chunk.contiguous() for chunk in grad_chunks]
+        dist.reduce_scatter(local_grad, grad_list, group=ctx.cp_group)
+        return local_grad, None, None
+
+
+def all_gather_along_seq(
+    tensor: torch.Tensor,
+    cp_group: Any,
+    seq_dim: int = 2,
+) -> torch.Tensor:
+    """All-gather tensor along sequence dimension across CP group.
+
+    Args:
+        tensor: Input tensor to gather.
+        cp_group: Context parallelism process group.
+        seq_dim: Sequence dimension (default: 2 for BHSD format).
+
+    Returns:
+        Gathered tensor with sequence dimension scaled by CP world size.
+    """
+    world_size = dist.get_world_size(cp_group)
+    if world_size == 1:
+        return tensor
+
+    tensor = tensor.contiguous()
+    return AllGatherFunc.apply(tensor, cp_group, seq_dim)
+
+
+def reduce_scatter_along_seq(
+    tensor: torch.Tensor,
+    cp_group: Any,
+    seq_dim: int = 2,
+) -> torch.Tensor:
+    """Reduce-scatter tensor along sequence dimension across CP group.
+
+    Args:
+        tensor: Input tensor to reduce-scatter.
+        cp_group: Context parallelism process group.
+        seq_dim: Sequence dimension (default: 2 for BHSD format).
+
+    Returns:
+        Reduced tensor with sequence dimension divided by CP world size.
+    """
+    world_size = dist.get_world_size(cp_group)
+    if world_size == 1:
+        return tensor
+
+    tensor = tensor.contiguous()
+    seq_len = tensor.shape[seq_dim]
+    chunk_size = seq_len // world_size
+
+    output = torch.empty(
+        *tensor.shape[:seq_dim], chunk_size, *tensor.shape[seq_dim + 1:],
+        dtype=tensor.dtype, device=tensor.device
+    )
+
+    dist.reduce_scatter_tensor(output, tensor, group=cp_group)
+    return output
+
+
+def create_cp_causal_mask(
+    local_seq_len_q: int,
+    full_seq_len_kv: int,
+    cp_rank: int,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Create causal mask for context parallelism.
+
+    In CP mode, each rank processes a different chunk of the query sequence,
+    so the causal mask needs to account for global positions.
+
+    Args:
+        local_seq_len_q: Local query sequence length (per rank).
+        full_seq_len_kv: Full key/value sequence length (after all-gather).
+        cp_rank: Current rank in CP group.
+        device: Device to create mask on.
+        dtype: Data type for mask.
+
+    Returns:
+        Causal mask tensor of shape [local_seq_len_q, full_seq_len_kv].
+    """
+    # Calculate global query position offset
+    q_start = cp_rank * local_seq_len_q
+
+    # Create position indices
+    q_indices = torch.arange(local_seq_len_q, device=device, dtype=torch.long).unsqueeze(1) + q_start
+    kv_indices = torch.arange(full_seq_len_kv, device=device, dtype=torch.long).unsqueeze(0)
+
+    # Create causal mask: mask out positions where kv_idx > q_idx
+    causal_mask = torch.zeros(local_seq_len_q, full_seq_len_kv, dtype=dtype, device=device)
+    causal_mask.masked_fill_(kv_indices > q_indices, float('-inf'))
+
+    return causal_mask
+
+
+def create_cp_window_mask(
+    local_seq_len_q: int,
+    full_seq_len_kv: int,
+    cp_rank: int,
+    window_size: Tuple[int, int],
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Create sliding window mask for context parallelism.
+
+    Args:
+        local_seq_len_q: Local query sequence length (per rank).
+        full_seq_len_kv: Full key/value sequence length (after all-gather).
+        cp_rank: Current rank in CP group.
+        window_size: Tuple of (left_window, right_window). -1 means no limit.
+        device: Device to create mask on.
+        dtype: Data type for mask.
+
+    Returns:
+        Window mask tensor of shape [local_seq_len_q, full_seq_len_kv].
+    """
+    left_window, right_window = window_size
+
+    # Calculate global query position offset
+    q_start = cp_rank * local_seq_len_q
+
+    # Create position indices
+    q_indices = torch.arange(local_seq_len_q, device=device, dtype=torch.long).unsqueeze(1) + q_start
+    kv_indices = torch.arange(full_seq_len_kv, device=device, dtype=torch.long).unsqueeze(0)
+
+    # Create window mask
+    window_mask = torch.zeros(local_seq_len_q, full_seq_len_kv, dtype=dtype, device=device)
+
+    if left_window >= 0:
+        window_mask.masked_fill_(kv_indices < q_indices - left_window, float('-inf'))
+    if right_window >= 0:
+        window_mask.masked_fill_(kv_indices > q_indices + right_window, float('-inf'))
+
+    return window_mask
+
+
+def get_cp_info(cp_group: Any) -> Tuple[int, int, bool]:
+    """Get context parallelism information from process group.
+
+    Args:
+        cp_group: Context parallelism process group.
+
+    Returns:
+        Tuple of (cp_size, cp_rank, use_cp).
+    """
+    if cp_group is None:
+        return 1, 0, False
+
+    cp_size = dist.get_world_size(cp_group)
+    cp_rank = dist.get_rank(cp_group)
+    use_cp = cp_size > 1
+
+    return cp_size, cp_rank, use_cp
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index 30596435db..ea3c9c002a 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -381,4 +381,4 @@ def _forward_impl(
                 self.layer_number,
             )
 
-        return output.view(*output.shape[:-2], -1)
+        return output.view(*output.shape[:-2], -1)
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
index 22d36e9e21..ecdc73b33a 100644
--- a/transformer_engine/plugin/core/backends/flagos/flagos.py
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -12,6 +12,7 @@
 from .impl import (
     rmsnorm_fwd_fl, rmsnorm_bwd_fl,
     multi_tensor_scale_fl, multi_tensor_adam_fl,
+    multi_tensor_adam_param_remainder_fl,
     multi_tensor_l2_norm_fl,
     generic_gemm_fl
 )
@@ -171,6 +172,28 @@ def multi_tensor_adam(
             step=step, mode=mode, bias_correction=bias_correction, weight_decay=weight_decay,
         )
 
+    def multi_tensor_adam_param_remainder(
+        self,
+        chunk_size: int = None,
+        noop_flag: torch.Tensor = None,
+        tensor_lists: List[List[torch.Tensor]] = None,
+        lr: float = None,
+        beta1: float = None,
+        beta2: float = None,
+        eps: float = None,
+        step: int = None,
+        mode: int = None,
+        bias_correction: int = None,
+        weight_decay: float = None,
+    ):
+        if chunk_size is None:
+            return multi_tensor_adam_param_remainder_fl
+        return multi_tensor_adam_param_remainder_fl(
+            chunk_size=chunk_size, noop_flag=noop_flag, tensor_lists=tensor_lists,
+            lr=lr, beta1=beta1, beta2=beta2, eps=eps,
+            step=step, mode=mode, bias_correction=bias_correction, weight_decay=weight_decay,
+        )
+
     def get_cublasLt_version(self) -> int:
         return 110000
 
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
index bd4b916010..93ba067e93 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -75,3 +75,116 @@ def multi_tensor_adam_fl(
             flag_gems.copy_(p_master, p)
             out_dtype = p_master.dtype if out_dtype is None else out_dtype
             p.data = p.data.to(out_dtype)
+
+
+def multi_tensor_adam_param_remainder_fl(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    lr: float,
+    beta1: float,
+    beta2: float,
+    eps: float,
+    step: int,
+    mode: int,
+    bias_correction: int,
+    weight_decay: float,
+    inv_scale: Optional[float] = 1.0,
+) -> None:
+    """
+    Adam optimizer with parameter remainders for BF16 precision (FlagOS implementation).
+
+    This variant stores BF16 parameters + int16 remainders to reconstruct FP32 master weights.
+    Used when you have BF16 params and need FP32 master params without storing full FP32 copies.
+
+    Args:
+        chunk_size: Chunk size for processing (unused in this implementation)
+        noop_flag: If non-zero, skip computation
+        tensor_lists: [grads, params (bf16), exp_avgs (fp32), exp_avg_sqs (fp32), param_remainders (int16)]
+        lr: Learning rate
+        beta1: First moment decay rate
+        beta2: Second moment decay rate
+        eps: Epsilon for numerical stability
+        step: Current optimization step
+        mode: 0 = L2 regularization, 1 = AdamW (decoupled weight decay)
+        bias_correction: Whether to apply bias correction (1 = yes, 0 = no)
+        weight_decay: Weight decay coefficient
+        inv_scale: Inverse gradient scale for mixed precision training
+    """
+    if noop_flag.item() != 0:
+        return
+
+    num_lists = len(tensor_lists)
+    assert num_lists == 5, f"Expected 5 tensor lists, got {num_lists}"
+
+    num_tensors = len(tensor_lists[0])
+    assert num_tensors > 0, "No tensors provided"
+
+    for i, lst in enumerate(tensor_lists):
+        assert len(lst) == num_tensors, f"List {i} has {len(lst)} tensors, expected {num_tensors}"
+
+    bias_correction1 = 1.0
+    bias_correction2 = 1.0
+    if bias_correction == 1:
+        bias_correction1 = 1 - beta1 ** step
+        bias_correction2 = 1 - beta2 ** step
+
+    is_adamw = (mode == 1)
+
+    for i in range(num_tensors):
+        g = tensor_lists[0][i]
+        p = tensor_lists[1][i]  # BF16 parameter
+        m = tensor_lists[2][i]  # FP32 first moment
+        v = tensor_lists[3][i]  # FP32 second moment
+        p_remainder = tensor_lists[4][i]  # int16 remainder
+
+        if not g.is_contiguous():
+            g = g.contiguous()
+
+        # Apply gradient unscaling if needed
+        if inv_scale is not None and inv_scale != 1.0:
+            g = flag_gems.mul(g, inv_scale)
+
+        # Reconstruct FP32 master weight from BF16 param + int16 remainder
+        # The remainder represents the lower 16 bits lost in BF16 conversion
+        param_fp32 = p.float()
+        param_master = flag_gems.add(param_fp32, flag_gems.mul(p_remainder.float(), 2.0 ** -16))
+
+        # Compute gradient with weight decay (if L2 mode)
+        grad_with_decay = g.float()
+        if not is_adamw:  # L2 regularization mode
+            grad_with_decay = flag_gems.add(grad_with_decay, flag_gems.mul(param_master, weight_decay))
+
+        # Update moments
+        m = flag_gems.add_(flag_gems.mul_(m, beta1), grad_with_decay, alpha=1 - beta1)
+        v = flag_gems.add_(flag_gems.mul_(v, beta2), flag_gems.mul_(flag_gems.mul_(grad_with_decay, grad_with_decay), 1 - beta2))
+
+        # Apply bias correction
+        m_corr = m.clone()
+        v_corr = v.clone()
+        if bias_correction == 1:
+            m_corr = flag_gems.true_divide(m_corr, bias_correction1)
+            v_corr = flag_gems.true_divide(v_corr, bias_correction2)
+
+        # Compute update
+        update = flag_gems.true_divide(m_corr, flag_gems.add(flag_gems.sqrt(v_corr), eps))
+
+        # Apply weight decay (if AdamW mode)
+        if is_adamw:
+            param_master = flag_gems.mul_(param_master, 1 - lr * weight_decay)
+
+        # Update master weight
+        param_master = flag_gems.add_(param_master, update, alpha=-lr)
+
+        # Split back into BF16 param + int16 remainder
+        # Convert to BF16 (this is the rounded version)
+        param_bf16 = param_master.to(dtype=p.dtype)
+
+        # Compute remainder: difference between FP32 master and BF16 representation
+        # Scale and quantize to int16 range
+        remainder_fp32 = flag_gems.mul(flag_gems.sub(param_master, param_bf16.float()), 2.0 ** 16)
+        remainder_int16 = flag_gems.clamp(torch.round(remainder_fp32), -32768, 32767).to(dtype=torch.int16)
+
+        # Write back
+        flag_gems.copy_(p, param_bf16)
+        flag_gems.copy_(p_remainder, remainder_int16)
diff --git a/transformer_engine/plugin/core/backends/flagos/register_ops.py b/transformer_engine/plugin/core/backends/flagos/register_ops.py
index 1286f5b3a9..e92e0864e0 100644
--- a/transformer_engine/plugin/core/backends/flagos/register_ops.py
+++ b/transformer_engine/plugin/core/backends/flagos/register_ops.py
@@ -45,6 +45,7 @@ def register_builtins(registry) -> None:
         OpImpl(op_name="generic_gemm", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor=None, priority=150),
         OpImpl(op_name="multi_tensor_scale", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor=None, priority=150),
         OpImpl(op_name="multi_tensor_adam", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor=None, priority=150),
+        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor=None, priority=150),
         OpImpl(op_name="multi_tensor_l2norm", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor=None, priority=150),
 
         # FlashAttention class getter
diff --git a/transformer_engine/plugin/core/backends/reference/flash_attention.py b/transformer_engine/plugin/core/backends/reference/flash_attention.py
index 60e4bc1bb9..62c652b856 100644
--- a/transformer_engine/plugin/core/backends/reference/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/reference/flash_attention.py
@@ -7,8 +7,16 @@
 
 import torch
 import torch.nn.functional as F
+import torch.distributed as dist
 
 from transformer_engine.plugin.core.ops import FlashAttentionBase
+from transformer_engine.plugin.core.backends.fa_utils import (
+    all_gather_along_seq,
+    reduce_scatter_along_seq,
+    create_cp_causal_mask,
+    create_cp_window_mask,
+    get_cp_info,
+)
 
 
 class FlashAttentionTorch(FlashAttentionBase):
@@ -151,9 +159,11 @@ def _unpack_tensor(
 
         padding_mask = torch.ones(batch_size, max_seqlen, dtype=torch.bool, device=device)
 
+        # Vectorized unpacking - avoid Python loop and .item() calls
+        cu_seqlens_cpu = cu_seqlens.cpu()
         for i in range(batch_size):
-            start = cu_seqlens[i].item()
-            end = cu_seqlens[i + 1].item()
+            start = cu_seqlens_cpu[i].item()
+            end = cu_seqlens_cpu[i + 1].item()
             seq_len = end - start
 
             seq_data = tensor[start:end].permute(1, 0, 2)
@@ -179,9 +189,11 @@ def _pack_tensor(
             dtype=tensor.dtype, device=device
         )
 
+        # Vectorized packing - avoid repeated .item() calls
+        cu_seqlens_cpu = cu_seqlens.cpu()
         for i in range(batch_size):
-            start = cu_seqlens[i].item()
-            end = cu_seqlens[i + 1].item()
+            start = cu_seqlens_cpu[i].item()
+            end = cu_seqlens_cpu[i + 1].item()
             seq_len = end - start
 
             seq_data = tensor[i, :, :seq_len, :].permute(1, 0, 2)
@@ -214,23 +226,22 @@ def _forward_impl(
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
     ) -> torch.Tensor:
-        """Flash Attention implementation using PyTorch's scaled_dot_product_attention."""
+        """Flash Attention implementation using PyTorch's scaled_dot_product_attention.
+
+        Supports Context Parallelism (CP) by all-gathering key/value across the CP group.
+        """
         if fp8:
             raise NotImplementedError("FP8 is not supported in PyTorch SDPA backend")
-        if cp_group is not None:
-            raise NotImplementedError("Context parallelism is not supported in PyTorch SDPA backend")
+
         if alibi_slopes is not None:
             raise NotImplementedError("ALiBi slopes are not supported in PyTorch SDPA backend")
 
         query_original_shape = query_layer.shape
+        cp_size, cp_rank, use_cp = get_cp_info(cp_group)
 
-        # Check if input is in standard 4D format - same as flagos backend
-        # If tensor is 4D, treat it as standard format and just do layout conversion
-        # Only use unpack logic for true packed format (3D tensors with thd layout)
         is_standard_4d = query_layer.dim() == 4
 
         if is_standard_4d:
-            # Standard 4D tensor format - just convert layout like flagos does
             query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
             key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
             value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
@@ -238,7 +249,6 @@ def _forward_impl(
             padding_mask_q = None
             padding_mask_kv = None
         else:
-            # True packed format (thd layout, 3D tensor) - use unpack logic
             use_packed_format = cu_seqlens_q is not None or cu_seqlens_kv is not None
             padding_mask_q = None
             padding_mask_kv = None
@@ -261,6 +271,13 @@ def _forward_impl(
                 value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
 
         batch_size, num_heads_q, seq_len_q, head_dim = query.shape
+        local_seq_len_q = seq_len_q
+
+        if use_cp:
+            # All-gather key/value along sequence dimension for full context
+            key = all_gather_along_seq(key, cp_group, seq_dim=2)
+            value = all_gather_along_seq(value, cp_group, seq_dim=2)
+
         num_heads_kv = key.shape[1]
         seq_len_kv = key.shape[2]
 
@@ -285,7 +302,19 @@ def _forward_impl(
             attn_mask.masked_fill_(padding_broadcast, float('-inf'))
 
         if attn_mask_type == "causal":
-            if window_size is None and not use_packed_format:
+            if use_cp:
+                # Use shared utility for CP causal mask creation
+                causal_mask = create_cp_causal_mask(
+                    local_seq_len_q, seq_len_kv, cp_rank, query.device, query.dtype
+                )
+                if attn_mask is not None:
+                    if attn_mask.dim() == 2:
+                        attn_mask = attn_mask + causal_mask
+                    else:
+                        attn_mask = attn_mask + causal_mask.unsqueeze(0)
+                else:
+                    attn_mask = causal_mask
+            elif window_size is None and not use_packed_format:
                 is_causal = True
             else:
                 causal_mask = torch.zeros(
@@ -306,16 +335,22 @@ def _forward_impl(
                     attn_mask = causal_mask
 
         if window_size is not None and not is_causal:
-            window_mask = self._create_sliding_window_mask(
-                seq_len_q=seq_len_q,
-                seq_len_kv=seq_len_kv,
-                window_size=window_size,
-                device=query.device,
-                dtype=query.dtype,
-            )
+            if use_cp:
+                # Use shared utility for CP window mask creation
+                window_mask = create_cp_window_mask(
+                    local_seq_len_q, seq_len_kv, cp_rank, window_size, query.device, query.dtype
+                )
+            else:
+                window_mask = self._create_sliding_window_mask(
+                    seq_len_q=seq_len_q,
+                    seq_len_kv=seq_len_kv,
+                    window_size=window_size,
+                    device=query.device,
+                    dtype=query.dtype,
+                )
 
             if attn_mask is not None:
-                attn_mask = attn_mask + window_mask.unsqueeze(0)
+                attn_mask = attn_mask + window_mask.unsqueeze(0) if window_mask.dim() == 2 else attn_mask + window_mask
             else:
                 attn_mask = window_mask
 
@@ -375,8 +410,6 @@ def _forward_impl(
                 output = output.contiguous().view(total_tokens, 1, hidden_size)
         else:
             output = self._convert_bhsd_to_layout(output, qkv_layout)
-            # Flatten the last two dimensions (heads, dim) -> (heads * dim)
-            # to match the output format of other backends
             output = output.contiguous().view(*output.shape[:-2], -1)
 
         return output
diff --git a/transformer_engine/plugin/core/backends/reference/impl/__init__.py b/transformer_engine/plugin/core/backends/reference/impl/__init__.py
index 6eb29b6f90..43d73e95c5 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/__init__.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/__init__.py
@@ -35,6 +35,7 @@
     multi_tensor_scale_torch,
     multi_tensor_l2norm_torch,
     multi_tensor_adam_torch,
+    multi_tensor_adam_param_remainder_torch,
     multi_tensor_sgd_torch,
     multi_tensor_compute_scale_and_scale_inv_torch,
 )
@@ -85,6 +86,7 @@
     "multi_tensor_scale_torch",
     "multi_tensor_l2norm_torch",
     "multi_tensor_adam_torch",
+    "multi_tensor_adam_param_remainder_torch",
     "multi_tensor_sgd_torch",
     "multi_tensor_compute_scale_and_scale_inv_torch",
 ]
diff --git a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
index 100c6c9ef3..0ae0809dcc 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
@@ -9,6 +9,7 @@
     "multi_tensor_scale_torch",
     "multi_tensor_l2norm_torch",
     "multi_tensor_adam_torch",
+    "multi_tensor_adam_param_remainder_torch",
     "multi_tensor_sgd_torch",
     "multi_tensor_compute_scale_and_scale_inv_torch",
 ]
@@ -111,6 +112,115 @@ def multi_tensor_adam_torch(
         param.addcdiv_(corrected_exp_avg, denom, value=-lr)
 
 
+def multi_tensor_adam_param_remainder_torch(
+    chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    lr: float,
+    beta1: float,
+    beta2: float,
+    eps: float,
+    step: int,
+    mode: int,
+    bias_correction: int,
+    weight_decay: float,
+) -> None:
+    """
+    Adam optimizer with parameter remainders for BF16 precision.
+
+    This variant stores BF16 parameters + int16 remainders to reconstruct FP32 master weights.
+    Used when you have BF16 params and need FP32 master params without storing full FP32 copies.
+
+    Args:
+        chunk_size: Chunk size for processing (unused in PyTorch implementation)
+        noop_flag: If non-zero, skip computation
+        tensor_lists: [grads, params (bf16), exp_avgs (fp32), exp_avg_sqs (fp32), param_remainders (int16)]
+        lr: Learning rate
+        beta1: First moment decay rate
+        beta2: Second moment decay rate
+        eps: Epsilon for numerical stability
+        step: Current optimization step
+        mode: 0 = L2 regularization, 1 = AdamW (decoupled weight decay)
+        bias_correction: Whether to apply bias correction (1 = yes, 0 = no)
+        weight_decay: Weight decay coefficient
+    """
+    if noop_flag.item() != 0:
+        return
+
+    if len(tensor_lists) != 5:
+        raise ValueError(
+            "tensor_lists should contain [grads, params, exp_avgs, exp_avg_sqs, param_remainders]"
+        )
+
+    grads, params, exp_avgs, exp_avg_sqs, param_remainders = tensor_lists
+
+    if not (len(params) == len(grads) == len(exp_avgs) == len(exp_avg_sqs) == len(param_remainders)):
+        raise ValueError("All tensor lists must have the same length")
+
+    if bias_correction:
+        bias_correction1 = 1 - beta1 ** step
+        bias_correction2 = 1 - beta2 ** step
+    else:
+        bias_correction1 = 1.0
+        bias_correction2 = 1.0
+
+    for grad, param, exp_avg, exp_avg_sq, param_remainder in zip(
+        grads, params, exp_avgs, exp_avg_sqs, param_remainders
+    ):
+        if grad is None:
+            continue
+
+        # Reconstruct FP32 master weight from BF16 param + int16 remainder
+        # The CUDA implementation uses bit manipulation to combine them
+        # In PyTorch, we approximate this by:
+        # 1. Convert param (bf16) to fp32 - this gives us the high-precision bits
+        # 2. Add the remainder scaled appropriately
+        param_fp32 = param.float()
+
+        # The remainder represents the lower 16 bits lost in BF16 conversion
+        # We need to scale it back to the proper magnitude
+        # BF16 has 16 bits total (1 sign, 8 exponent, 7 mantissa)
+        # The remainder compensates for the lost precision
+        param_master = param_fp32 + param_remainder.float() * (2.0 ** -16)
+
+        # Standard Adam update on FP32 master weight
+        if mode == 0:  # L2 regularization
+            grad_with_decay = grad.float() + weight_decay * param_master
+        else:  # mode == 1, AdamW
+            grad_with_decay = grad.float()
+
+        # Update moments
+        exp_avg.mul_(beta1).add_(grad_with_decay, alpha=1 - beta1)
+        exp_avg_sq.mul_(beta2).addcmul_(grad_with_decay, grad_with_decay, value=1 - beta2)
+
+        # Apply bias correction
+        corrected_exp_avg = exp_avg / bias_correction1
+        corrected_exp_avg_sq = exp_avg_sq / bias_correction2
+
+        # Compute update
+        denom = corrected_exp_avg_sq.sqrt().add_(eps)
+        update = corrected_exp_avg / denom
+
+        if mode == 1:  # AdamW: apply weight decay directly
+            update = update + weight_decay * param_master
+
+        # Update master weight
+        param_master.add_(update, alpha=-lr)
+
+        # Split back into BF16 param + int16 remainder
+        # Convert to BF16 (this is the rounded version)
+        param_bf16 = param_master.to(dtype=param.dtype)
+
+        # Compute remainder: difference between FP32 master and BF16 representation
+        # Scale and quantize to int16 range
+        remainder_fp32 = (param_master - param_bf16.float()) * (2.0 ** 16)
+        remainder_int16 = remainder_fp32.round().clamp(-32768, 32767).to(dtype=torch.int16)
+
+        # Write back
+        param.copy_(param_bf16)
+        param_remainder.copy_(remainder_int16)
+
+
 def multi_tensor_sgd_torch(
     chunk_size: int,
     noop_flag: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/reference/reference.py b/transformer_engine/plugin/core/backends/reference/reference.py
index 61a0bdaab5..3f29cf89be 100644
--- a/transformer_engine/plugin/core/backends/reference/reference.py
+++ b/transformer_engine/plugin/core/backends/reference/reference.py
@@ -29,7 +29,8 @@
     scaled_aligned_causal_masked_softmax_backward_torch,
     dropout_fwd_torch, dropout_bwd_torch,
     multi_tensor_scale_torch, multi_tensor_l2norm_torch,
-    multi_tensor_adam_torch, multi_tensor_sgd_torch,
+    multi_tensor_adam_torch, multi_tensor_adam_param_remainder_torch,
+    multi_tensor_sgd_torch,
 )
 
 class ReferenceBackend(TEFLBackendBase):
@@ -506,8 +507,10 @@ def multi_tensor_adam(self, *args, **kwargs):
             return multi_tensor_adam_torch
         return multi_tensor_adam_torch(*args, **kwargs)
 
-    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
-        raise NotImplementedError("multi_tensor_adam_param_remainder - not implemented in reference backend")
+    def multi_tensor_adam_param_remainder(self, *args, **kwargs):
+        if not args and not kwargs:
+            return multi_tensor_adam_param_remainder_torch
+        return multi_tensor_adam_param_remainder_torch(*args, **kwargs)
 
     def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
         raise NotImplementedError("multi_tensor_adam_fp8 - not implemented in reference backend")
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
new file mode 100644
index 0000000000..831a83181c
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from contextlib import nullcontext
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from transformer_engine.plugin.core.ops import FlashAttentionBase
+
+class FlashAttentionHYGON(FlashAttentionBase):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
+
+        # Store initialization parameters for lazy loading
+        self._init_params = {
+            'softmax_scale': softmax_scale,
+            'attention_dropout': attention_dropout,
+            'attention_dropout_ctx': attention_dropout_ctx or nullcontext,
+            'attention_type': attention_type,
+            'layer_number': layer_number,
+            'deterministic': deterministic,
+        }
+        self._native_flash_attn = None
+
+    def _ensure_native_flash_attn(self):
+        """Lazy initialization of native FlashAttention."""
+        if self._native_flash_attn is not None:
+            return
+
+        try:
+            # Import here to avoid circular dependency issues
+            # transformer_engine_torch must be registered before this import
+            from transformer_engine.pytorch.attention.dot_product_attention.backends import (
+                FlashAttention as FlashAttentionNative,
+            )
+
+            if FlashAttentionNative is None:
+                raise RuntimeError("FlashAttention class is None - flash-attn may not be installed correctly")
+
+            self._native_flash_attn = FlashAttentionNative(**self._init_params)
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"Failed to import native FlashAttention: {e}. "
+                "Please ensure flash-attn is installed and transformer_engine_torch is available."
+            )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to initialize native FlashAttention: {e}. "
+                f"Init params: {self._init_params}"
+            )
+
+    @property
+    def backend_name(self) -> str:
+        return "hygon"
+
+    def _forward_impl(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        # Ensure native flash attention is initialized
+        self._ensure_native_flash_attn()
+
+        return self._native_flash_attn(
+            query_layer=query_layer,
+            key_layer=key_layer,
+            value_layer=value_layer,
+            attention_mask=attention_mask,
+            qkv_layout=qkv_layout,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_kv=cu_seqlens_kv,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_kv=max_seqlen_kv,
+            attn_mask_type=attn_mask_type,
+            window_size=window_size,
+            alibi_slopes=alibi_slopes,
+            cp_group=cp_group,
+            cp_global_ranks=cp_global_ranks,
+            cp_stream=cp_stream,
+            cp_comm_type=cp_comm_type,
+            fp8=fp8,
+            fp8_meta=fp8_meta,
+            quantizers=quantizers,
+            inference_params=inference_params,
+            flash_attention_backend=flash_attention_backend,
+            fp8_output=fp8_output,
+        )
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index 4d74e2f4cf..92e8868ed9 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -2,19 +2,19 @@
 #
 # See LICENSE for license information.
 
+import os
+import sys
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
-import sys
 
-from ....ops import TEFLBackendBase, FP8TensorMeta
+from ....ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
 
 def _load_hygon_libs():
     import ctypes
     from pathlib import Path
     import importlib
     import platform
-    import os
     common_prefix = "libtransformer_engine"
     csrc_prefix = "transformer_engine_torch_hygon"
     common_files = []
@@ -161,10 +161,40 @@ def is_available(self) -> bool:
         return _check_hygon_available()
 
     def get_flash_attention_class(self):
-        raise NotImplementedError("get_flash_attention_class - not implemented in hygon backend")
+        from .flash_attention import FlashAttentionHYGON
+        return FlashAttentionHYGON
 
     def get_attention_backend(self, attention_params=None):
-        raise NotImplementedError("get_attention_backend - not implemented in hygon backend")
+        from packaging.version import Version as PkgVersion
+        from ....logger_manager import get_logger
+        logger = get_logger()
+
+        # Read environment variables to determine which backends to enable
+        use_flash_attention = int(os.getenv("NVTE_FLASH_ATTN", "1"))
+        use_fused_attention = int(os.getenv("NVTE_FUSED_ATTN", "1"))
+        use_unfused_attention = int(os.getenv("NVTE_UNFUSED_ATTN", "1"))
+
+        # Log disabled backends
+        if not use_flash_attention:
+            logger.info_once("Disabling FlashAttention due to NVTE_FLASH_ATTN=0")
+        if not use_fused_attention:
+            logger.info_once("Disabling FusedAttention due to NVTE_FUSED_ATTN=0")
+        if not use_unfused_attention:
+            logger.info_once("Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0")
+
+        flash_attention_backend = PkgVersion("2.6.0") if use_flash_attention else None
+        fused_attention_backend = NVTE_Fused_Attn_Backend.NVTE_No_Backend
+
+        available_backends = [use_flash_attention, use_fused_attention, use_unfused_attention]
+
+        return (
+            use_flash_attention,
+            flash_attention_backend,
+            use_fused_attention,
+            fused_attention_backend,
+            use_unfused_attention,
+            available_backends,
+        )
 
     def quantize(
         self,
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
index 59cbe0ac5d..6000eff69c 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
@@ -112,6 +112,8 @@ def register_builtins(registry) -> None:
         OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="HYGON", priority=100),
 
         # Fused attention
+        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="HYGON", priority=100),
+        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="HYGON", priority=100),
 
         # KV cache
         OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="HYGON", priority=100),
@@ -186,6 +188,10 @@ def register_builtins(registry) -> None:
         OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="HYGON", priority=100),
 
         # FlashAttention class getter
+        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="HYGON", priority=100),
+
+        # Attention backend selection
+        OpImpl(op_name="get_attention_backend", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="HYGON", priority=100),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/manager.py b/transformer_engine/plugin/core/manager.py
index cd96b35bb0..66a9ad8d9b 100644
--- a/transformer_engine/plugin/core/manager.py
+++ b/transformer_engine/plugin/core/manager.py
@@ -7,7 +7,7 @@
 import os
 import threading
 from dataclasses import dataclass
-from typing import Callable, Dict, Optional, Tuple
+from typing import Callable, Dict, Optional, Tuple, Any
 
 from .discovery import discover_plugin
 from .registry import OpRegistry
@@ -42,7 +42,8 @@ def __init__(self, registry: Optional[OpRegistry] = None) -> None:
         self._registry = registry or OpRegistry()
         self._state = _OpManagerState()
         self._dispatch_cache: Dict[Tuple[str, str, int], Callable] = {}
-        self._called_ops: Dict[str, str] = {}  # Map op_name -> last_used_impl_id (for logging)
+        self._impl_cache: Dict[str, OpImpl] = {}
+        self._impl_cache_meta: Dict[str, Tuple[str, int]] = {}
 
         # Register at_fork handler for multi-process safety
         try:
@@ -63,7 +64,8 @@ def _reset_after_fork(self) -> None:
             self._state.init_pid = -1
             self._state.policy_epoch += 1
             self._dispatch_cache.clear()
-            self._called_ops.clear()
+            self._impl_cache.clear()
+            self._impl_cache_meta.clear()
             logger.debug("OpManager reset after fork")
 
     def bump_policy_epoch(self) -> None:
@@ -320,14 +322,36 @@ def resolve_candidates(self, op_name: str) -> list[OpImpl]:
 
         return unique_candidates
 
+    def _is_cache_valid(self, op_name: str) -> bool:
+        """Check if cached impl is still valid for current policy"""
+        meta = self._impl_cache_meta.get(op_name)
+        if meta is None:
+            return False
+        cached_fp, cached_epoch = meta
+        policy = get_policy()
+        return cached_fp == policy.fingerprint() and cached_epoch == self._state.policy_epoch
+
+    def _update_cache(self, op_name: str, impl: OpImpl) -> None:
+        """Update cache with new impl"""
+        policy = get_policy()
+        self._impl_cache[op_name] = impl
+        self._impl_cache_meta[op_name] = (policy.fingerprint(), self._state.policy_epoch)
+
+    def _invalidate_cache(self, op_name: str) -> None:
+        """Invalidate cache for an op"""
+        self._impl_cache.pop(op_name, None)
+        self._impl_cache_meta.pop(op_name, None)
+
+    def _get_last_impl_id(self, op_name: str) -> Optional[str]:
+        """Get last used impl_id (even if cache is stale)"""
+        impl = self._impl_cache.get(op_name)
+        return impl.impl_id if impl else None
+
     def call(self, op_name: str, *args, **kwargs):
         """
         Resolve and call an operator implementation with optional fallback support.
 
-        When TE_FL_STRICT=1, this method will try alternative implementations
-        if the primary one fails. Otherwise, it behaves like the original implementation.
-
-        Logs on first call or when the implementation changes (e.g., backend switch).
+        Logs on first call or when the implementation changes.
 
         Args:
             op_name: Name of the operator
@@ -337,42 +361,49 @@ def call(self, op_name: str, *args, **kwargs):
             Result from the implementation
 
         Raises:
-            RuntimeError: If all implementations fail (when fallback enabled) or
-                         if the primary implementation fails (when fallback disabled)
+            RuntimeError: If all implementations fail
         """
         enable_fallback = os.getenv("TE_FL_STRICT", "1") != "0"
 
+        cached_impl = self._impl_cache.get(op_name)
+        cache_valid = self._is_cache_valid(op_name)
+
+        if cache_valid and cached_impl is not None:
+            try:
+                return cached_impl.fn(*args, **kwargs)
+            except Exception as e:
+                if enable_fallback:
+                    logger.warning_once(
+                        f"Cached implementation '{cached_impl.impl_id}' failed for op '{op_name}': {e}"
+                    )
+                    self._invalidate_cache(op_name)
+                else:
+                    raise
+
+        last_impl_id = self._get_last_impl_id(op_name)
+
         if not enable_fallback:
-            # Original behavior: use cached resolve() and fast-fail
             fn = self.resolve(op_name)
 
-            # Get current impl_id and log
-            impl_id = self.get_selected_impl_id(op_name)
-            last_impl_id = self._called_ops.get(op_name)
-
-            # Get impl details for logging
             snap = self._registry.snapshot()
-            for impl in snap.impls_by_op.get(op_name, []):
-                if impl.impl_id == impl_id:
-                    # Only log if first time or implementation actually changed
+            for candidate in snap.impls_by_op.get(op_name, []):
+                if candidate.fn is fn:
+                    self._update_cache(op_name, candidate)
+
                     if last_impl_id is None:
                         logger.info_once(
-                            f"Op '{op_name}' using '{impl_id}' "
-                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                            f"Op '{op_name}' using '{candidate.impl_id}' "
+                            f"(kind={candidate.kind.value}, vendor={candidate.vendor})"
                         )
-                    elif last_impl_id != impl_id:
+                    elif last_impl_id != candidate.impl_id:
                         logger.info_once(
-                            f"Op '{op_name}' switched from '{last_impl_id}' to '{impl_id}' "
-                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                            f"Op '{op_name}' switched from '{last_impl_id}' to '{candidate.impl_id}' "
+                            f"(kind={candidate.kind.value}, vendor={candidate.vendor})"
                         )
                     break
 
-            # Update tracking
-            self._called_ops[op_name] = impl_id
-
             return fn(*args, **kwargs)
 
-        # Fallback mode: try candidates in priority order
         candidates = self.resolve_candidates(op_name)
         last_error = None
 
@@ -380,46 +411,155 @@ def call(self, op_name: str, *args, **kwargs):
             try:
                 result = impl.fn(*args, **kwargs)
 
-                # Log on success
-                last_impl_id = self._called_ops.get(op_name)
-                if idx == 0:
-                    # Primary implementation - only log if first time or changed
-                    if last_impl_id is None:
+                self._update_cache(op_name, impl)
+
+                if last_impl_id is None:
+                    logger.info_once(
+                        f"Op '{op_name}' using '{impl.impl_id}' "
+                        f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                    )
+                elif last_impl_id != impl.impl_id:
+                    if idx == 0:
                         logger.info_once(
-                            f"Op '{op_name}' using '{impl.impl_id}' "
+                            f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
                             f"(kind={impl.kind.value}, vendor={impl.vendor})"
                         )
-                    elif last_impl_id != impl.impl_id:
+                    else:
                         logger.info_once(
-                            f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
+                            f"Op '{op_name}' fallback to '{impl.impl_id}' "
                             f"(kind={impl.kind.value}, vendor={impl.vendor})"
                         )
+
+                return result
+
+            except Exception as e:
+                last_error = e
+                if idx < len(candidates) - 1:
+                    logger.warning_once(
+                        f"Implementation '{impl.impl_id}' failed for op '{op_name}': {e}"
+                    )
+                else:
+                    logger.error(
+                        f"Last implementation '{impl.impl_id}' failed for op '{op_name}': {e}"
+                    )
+
+        raise RuntimeError(
+            f"All {len(candidates)} implementation(s) failed for op='{op_name}'. "
+            f"Last error: {last_error}"
+        ) from last_error
+
+    def call_with_custom_impl(
+        self,
+        op_name: str,
+        current_impl_class: type,
+        call_impl_fn: Callable[[type], Any],
+    ):
+        """
+        Call an operator with custom implementation class support (for FlashAttention).
+
+        Args:
+            op_name: Name of the operator
+            current_impl_class: The current implementation class
+            call_impl_fn: Function that takes impl_class and calls it
+
+        Returns:
+            Result from the implementation
+        """
+        enable_fallback = os.getenv("TE_FL_STRICT", "1") != "0"
+
+        cached_impl = self._impl_cache.get(op_name)
+        cache_valid = self._is_cache_valid(op_name)
+
+        if cache_valid and cached_impl is not None:
+            try:
+                cached_class = cached_impl.fn()
+                return call_impl_fn(cached_class)
+            except Exception as e:
+                if enable_fallback:
+                    logger.warning_once(
+                        f"Cached implementation '{cached_impl.impl_id}' failed for op '{op_name}': {e}"
+                    )
+                    self._invalidate_cache(op_name)
                 else:
-                    # Fallback succeeded
+                    raise
+
+        last_impl_id = self._get_last_impl_id(op_name)
+
+        if not enable_fallback:
+            snap = self._registry.snapshot()
+            for impl in snap.impls_by_op.get(op_name, []):
+                try:
+                    impl_class = impl.fn()
+                    if impl_class == current_impl_class:
+                        result = call_impl_fn(impl_class)
+
+                        self._update_cache(op_name, impl)
+
+                        if last_impl_id is None:
+                            logger.info_once(
+                                f"Op '{op_name}' using '{impl.impl_id}' "
+                                f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                            )
+                        elif last_impl_id != impl.impl_id:
+                            logger.info_once(
+                                f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
+                                f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                            )
+                        return result
+                except Exception:
+                    continue
+
+            return call_impl_fn(current_impl_class)
+
+        candidates = self.resolve_candidates(op_name)
+        last_error = None
+        current_impl_id = None
+
+        for impl in candidates:
+            try:
+                if impl.fn() == current_impl_class:
+                    current_impl_id = impl.impl_id
+                    break
+            except:
+                continue
+
+        for idx, impl in enumerate(candidates):
+            try:
+                impl_class = impl.fn()
+                result = call_impl_fn(impl_class)
+
+                self._update_cache(op_name, impl)
+
+                if last_impl_id is None:
                     logger.info_once(
-                        f"Op '{op_name}' fallback to '{impl.impl_id}' "
+                        f"Op '{op_name}' using '{impl.impl_id}' "
                         f"(kind={impl.kind.value}, vendor={impl.vendor})"
                     )
-
-                # Update tracking on success
-                self._called_ops[op_name] = impl.impl_id
+                elif last_impl_id != impl.impl_id:
+                    if impl.impl_id == current_impl_id or idx == 0:
+                        logger.info_once(
+                            f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
+                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                        )
+                    else:
+                        logger.info_once(
+                            f"Op '{op_name}' fallback to '{impl.impl_id}' "
+                            f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                        )
 
                 return result
 
             except Exception as e:
                 last_error = e
                 if idx < len(candidates) - 1:
-                    # Not the last candidate, log warning and try next
                     logger.warning_once(
                         f"Implementation '{impl.impl_id}' failed for op '{op_name}': {e}"
                     )
                 else:
-                    # Last candidate failed, log error
                     logger.error(
                         f"Last implementation '{impl.impl_id}' failed for op '{op_name}': {e}"
                     )
 
-        # All implementations failed
         raise RuntimeError(
             f"All {len(candidates)} implementation(s) failed for op='{op_name}'. "
             f"Last error: {last_error}"
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 50ed6d72a4..1a11a46674 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -6,8 +6,6 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Type
 from enum import IntEnum
 from contextlib import nullcontext
-import os
-import traceback
 import torch
 
 from .logger_manager import get_logger
@@ -1069,8 +1067,6 @@ def create_comm_overlap_p2p(
         raise NotImplementedError
 
 class FlashAttentionBase(torch.nn.Module, ABC):
-    # Class-level tracking for last logged implementation
-    _last_impl_id: Optional[str] = None
 
     def __init__(
         self,
@@ -1153,49 +1149,10 @@ def forward(
         fp8_output: bool = False,
     ) -> torch.Tensor:
         """
-        Forward pass with automatic fallback support.
-        If TE_FL_STRICT=1 (default), this will automatically try alternative
-        implementations if the primary one fails.
+        Forward pass with automatic fallback support and caching.
+        Delegates to OpManager.call_with_custom_impl for unified dispatch.
         """
-        # Check if fallback is enabled
-        enable_fallback = os.getenv("TE_FL_STRICT", "1") != "0"
-
-        # Key for tracking this operation (use op name)
-        layer_key = "get_flash_attention_class"
-
-        # If no manager or fallback disabled, use direct implementation
-        if self._manager is None or not enable_fallback:
-            # Try to get implementation details from manager if available
-            if self._manager is not None:
-                snap = self._manager.registry.snapshot()
-                # Find the impl that matches this instance's class
-                class_name_lower = self.__class__.__name__.lower()
-                impl_id = None
-
-                for impl in snap.impls_by_op.get(layer_key, []):
-                    if impl.impl_id == class_name_lower or class_name_lower.startswith(impl.impl_id):
-                        impl_id = impl.impl_id
-                        break
-
-                # Log using info_once (it handles deduplication)
-                if impl_id is not None:
-                    for impl in snap.impls_by_op.get(layer_key, []):
-                        if impl.impl_id == impl_id:
-                            # Only log if first time or implementation actually changed
-                            if FlashAttentionBase._last_impl_id is None:
-                                logger.info_once(
-                                    f"Op '{layer_key}' using '{impl_id}' "
-                                    f"(kind={impl.kind.value}, vendor={impl.vendor})"
-                                )
-                            elif FlashAttentionBase._last_impl_id != impl_id:
-                                logger.info_once(
-                                    f"Op '{layer_key}' switched from '{FlashAttentionBase._last_impl_id}' to '{impl_id}' "
-                                    f"(kind={impl.kind.value}, vendor={impl.vendor})"
-                                )
-                            break
-                    # Update tracking
-                    FlashAttentionBase._last_impl_id = impl_id
-
+        if self._manager is None:
             return self._forward_impl(
                 query_layer=query_layer,
                 key_layer=key_layer,
@@ -1221,113 +1178,37 @@ def forward(
                 fp8_output=fp8_output,
             )
 
-        # Fallback mode: try candidates in priority order
-        candidates = []
-        try:
-            candidates = self._manager.resolve_candidates(layer_key)
-        except Exception as resolve_error:
-            logger.error(f"Failed to resolve fallback candidates: {resolve_error}")
-            # If we can't get candidates, just try the primary implementation
-            return self._forward_impl(
-                query_layer=query_layer,
-                key_layer=key_layer,
-                value_layer=value_layer,
-                attention_mask=attention_mask,
-                qkv_layout=qkv_layout,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_kv=cu_seqlens_kv,
-                max_seqlen_q=max_seqlen_q,
-                max_seqlen_kv=max_seqlen_kv,
-                attn_mask_type=attn_mask_type,
-                window_size=window_size,
-                alibi_slopes=alibi_slopes,
-                cp_group=cp_group,
-                cp_global_ranks=cp_global_ranks,
-                cp_stream=cp_stream,
-                cp_comm_type=cp_comm_type,
-                fp8=fp8,
-                fp8_meta=fp8_meta,
-                quantizers=quantizers,
-                inference_params=inference_params,
-                flash_attention_backend=flash_attention_backend,
-                fp8_output=fp8_output,
-            )
-
-        # Find current implementation's impl_id
-        snap = self._manager.registry.snapshot()
-        current_impl_id = None
-        current_class = self.__class__
-
-        for impl in snap.impls_by_op.get(layer_key, []):
-            try:
-                # Check if this impl creates our current class
-                impl_class = impl.fn()
-                if impl_class == current_class:
-                    current_impl_id = impl.impl_id
-                    break
-            except:
-                continue
-
-        # Try primary implementation first and capture any error
-        primary_error = None
-        try:
-            result = self._forward_impl(
-                query_layer=query_layer,
-                key_layer=key_layer,
-                value_layer=value_layer,
-                attention_mask=attention_mask,
-                qkv_layout=qkv_layout,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_kv=cu_seqlens_kv,
-                max_seqlen_q=max_seqlen_q,
-                max_seqlen_kv=max_seqlen_kv,
-                attn_mask_type=attn_mask_type,
-                window_size=window_size,
-                alibi_slopes=alibi_slopes,
-                cp_group=cp_group,
-                cp_global_ranks=cp_global_ranks,
-                cp_stream=cp_stream,
-                cp_comm_type=cp_comm_type,
-                fp8=fp8,
-                fp8_meta=fp8_meta,
-                quantizers=quantizers,
-                inference_params=inference_params,
-                flash_attention_backend=flash_attention_backend,
-                fp8_output=fp8_output,
-            )
-            # Primary implementation succeeded
-            return result
-        except Exception as e:
-            primary_error = e
-            # Log the primary failure
-            error_summary = f"{type(e).__name__}: {str(e)}"
-            logger.warning_once(
-                f"Implementation '{current_impl_id}' failed for op '{layer_key}' "
-                f" - {error_summary}"
-            )
-            # Log full traceback if verbose mode is enabled
-            if os.getenv("TE_FL_VERBOSE_ERROR", "0") == "1":
-                error_traceback = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                logger.warning(f"Detailed traceback for '{current_impl_id}':\n{error_traceback}")
-
-        last_error = primary_error
-
-        for idx, impl in enumerate(candidates):
-            # Skip the current implementation (already tried above)
-            if impl.impl_id == current_impl_id:
-                continue
-
-            try:
-                # All attempts here are fallbacks (since we skipped current impl)
-                # Get fallback class and create instance
-                fallback_class = impl.fn()
-                fallback_instance = fallback_class(**self._init_params)
-                # Set manager for nested fallback support
+        def call_impl_fn(impl_class):
+            if impl_class == self.__class__:
+                return self._forward_impl(
+                    query_layer=query_layer,
+                    key_layer=key_layer,
+                    value_layer=value_layer,
+                    attention_mask=attention_mask,
+                    qkv_layout=qkv_layout,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_kv=cu_seqlens_kv,
+                    max_seqlen_q=max_seqlen_q,
+                    max_seqlen_kv=max_seqlen_kv,
+                    attn_mask_type=attn_mask_type,
+                    window_size=window_size,
+                    alibi_slopes=alibi_slopes,
+                    cp_group=cp_group,
+                    cp_global_ranks=cp_global_ranks,
+                    cp_stream=cp_stream,
+                    cp_comm_type=cp_comm_type,
+                    fp8=fp8,
+                    fp8_meta=fp8_meta,
+                    quantizers=quantizers,
+                    inference_params=inference_params,
+                    flash_attention_backend=flash_attention_backend,
+                    fp8_output=fp8_output,
+                )
+            else:
+                fallback_instance = impl_class(**self._init_params)
                 fallback_instance._manager = self._manager
                 fallback_instance._init_params = self._init_params
-
-                # Call the implementation directly (not forward, to avoid recursion)
-                result = fallback_instance._forward_impl(
+                return fallback_instance._forward_impl(
                     query_layer=query_layer,
                     key_layer=key_layer,
                     value_layer=value_layer,
@@ -1352,52 +1233,11 @@ def forward(
                     fp8_output=fp8_output,
                 )
 
-                # Log on fallback success
-                logger.info_once(
-                    f"Op '{layer_key}' fallback to '{impl.impl_id}' "
-                    f"(kind={impl.kind.value}, vendor={impl.vendor})"
-                )
-
-                # Update tracking on success
-                FlashAttentionBase._last_impl_id = impl.impl_id
-                return result
-
-            except Exception as e:
-                last_error = e
-                # Determine if there are more candidates to try
-                has_more_candidates = any(
-                    c.impl_id != current_impl_id
-                    for c in candidates[idx+1:]
-                )
-
-                # Format error summary
-                error_summary = f"{type(e).__name__}: {str(e)}"
-
-                if has_more_candidates:
-                    logger.warning_once(
-                        f"Implementation '{impl.impl_id}' failed for op '{layer_key}' - {error_summary}"
-                    )
-                else:
-                    # Last candidate failed
-                    logger.error_once(
-                        f"Last implementation '{impl.impl_id}' failed for op '{layer_key}' - {error_summary}"
-                    )
-
-                # Log full traceback if verbose mode is enabled
-                if os.getenv("TE_FL_VERBOSE_ERROR", "0") == "1":
-                    error_traceback = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                    log_func = logger.error if not has_more_candidates else logger.warning
-                    log_func(f"Detailed traceback for '{impl.impl_id}':\n{error_traceback}")
-
-        # All implementations failed
-        logger.error(
-            f"All implementations failed for op '{layer_key}'. "
-            f"Original: '{current_impl_id}'"
+        return self._manager.call_with_custom_impl(
+            op_name="get_flash_attention_class",
+            current_impl_class=self.__class__,
+            call_impl_fn=call_impl_fn,
         )
-        raise RuntimeError(
-            f"All implementation(s) failed for op='{layer_key}'. "
-            f"Last error: {last_error}"
-        ) from last_error
 
     @property
     def backend_name(self) -> str:
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
index a503147be8..e127d91595 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
@@ -1008,6 +1008,9 @@ def cp_p2p_bwd_flash_attn(
     dq, dk, dv = [torch.empty_like(x) for x in [q_part, k_part, v_part]]
     if use_flash_attn_3 or (fa_utils.v2_3_plus and not fa_utils.v2_7_0_plus):
         fa_backward_kwargs["window_size"] = (-1, -1)
+        # Fix: flash-attn 2.3.x ~ 2.6.x also needs rng_state for dropout
+        if not use_flash_attn_3 and rng_states is not None:
+            fa_backward_kwargs["rng_state"] = rng_states[cp_size - step - 1]
     elif fa_utils.v2_7_0_plus:
         fa_backward_kwargs["window_size_left"] = -1
         fa_backward_kwargs["window_size_right"] = -1

From 54390c706fe087f7854b4d377307c43455636b48 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Thu, 22 Jan 2026 18:50:17 +0800
Subject: [PATCH 28/59] Fix enum mismatch in plugins (#25)

- Fix enum mismatch, between ```transformer_engine/plugin/core/ops.py```
and ```transformer_engine/common/include/transformer_engine/xxx.h```
---
 transformer_engine/plugin/core/ops.py | 47 ++++++++++++++-------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 1a11a46674..988829b98c 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -13,29 +13,34 @@
 
 class DType(IntEnum):
     kByte = 0
+    kInt16 = 1
     kInt32 = 2
+    kInt64 = 3
     kFloat32 = 4
     kFloat16 = 5
     kBFloat16 = 6
     kFloat8E4M3 = 7
     kFloat8E5M2 = 8
+    kFloat8E8M0 = 9
     kFloat4E2M1 = 10
+    kNumTypes = 11
 
 class Float8BlockScaleTensorFormat(IntEnum):
-    COMPACT = 0
-    GEMM_READY = 1
+    GEMM_READY = 0
+    COMPACT = 1
 
 class NVTE_Activation_Type(IntEnum):
-    NVTE_GELU = 0
-    NVTE_GEGLU = 1
-    NVTE_SILU = 2
-    NVTE_SWIGLU = 3
-    NVTE_RELU = 4
-    NVTE_REGLU = 5
-    NVTE_QGELU = 6
-    NVTE_QGEGLU = 7
-    NVTE_SRELU = 8
-    NVTE_SREGLU = 9
+    GELU = 0
+    GEGLU = 1
+    SILU = 2
+    SWIGLU = 3
+    RELU = 4
+    REGLU = 5
+    QGELU = 6
+    QGEGLU = 7
+    SRELU = 8
+    SREGLU = 9
+    CLAMPED_SWIGLU = 10
 
 class NVTE_Softmax_Type(IntEnum):
     NVTE_VANILLA_SOFTMAX = 0
@@ -78,21 +83,19 @@ class NVTE_Mask_Type(IntEnum):
     NVTE_PADDING_CAUSAL_MASK = 3
     NVTE_CAUSAL_BOTTOM_RIGHT_MASK = 4
     NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK = 5
-    NVTE_ARBITRARY_MASK = 6
 
 class NVTE_Fused_Attn_Backend(IntEnum):
-    NVTE_No_Backend = 0
-    NVTE_F16_max512_seqlen = 1
-    NVTE_F16_arbitrary_seqlen = 2
-    NVTE_FP8 = 3
-    NVTE_FA3 = 4
+    NVTE_No_Backend = -1
+    NVTE_F16_max512_seqlen = 0
+    NVTE_F16_arbitrary_seqlen = 1
+    NVTE_FP8 = 2
 
 class NVTE_QKV_Format(IntEnum):
-    NVTE_BSHD = 0
-    NVTE_SBHD = 1
+    NVTE_SBHD = 0
+    NVTE_BSHD = 1
     NVTE_THD = 2
-    NVTE_SBHD_2BSHD = 3
-    NVTE_BSHD_2SBHD = 4
+    NVTE_BSHD_2SBHD = 3
+    NVTE_SBHD_2BSHD = 4
     NVTE_THD_2BSHD = 5
     NVTE_THD_2SBHD = 6
 

From 48c84801854c811bb07aa0ec92cbe144335546f6 Mon Sep 17 00:00:00 2001
From: ssuurrffaaccee <455013643@qq.com>
Date: Sun, 25 Jan 2026 15:08:48 +0800
Subject: [PATCH 29/59] add Vendor KUNLUNXIN (#27)

# Description

add Vendor KUNLUNXIN
---
 .../backends/vendor/kunlunxin/__init__.py     |   7 +
 .../vendor/kunlunxin/flash_attention.py       | 384 ++++++++++++++++++
 .../backends/vendor/kunlunxin/kunlunxin.py    |  23 ++
 .../backends/vendor/kunlunxin/register_ops.py |  48 +++
 transformer_engine/plugin/core/builtin_ops.py |   7 +
 5 files changed, 469 insertions(+)
 create mode 100644 transformer_engine/plugin/core/backends/vendor/kunlunxin/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py

diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/__init__.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/__init__.py
new file mode 100644
index 0000000000..aa2198ee35
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .kunlunxin import KunLunXinBackend
+
+__all__ = ["KunLunXinBackend"]
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
new file mode 100644
index 0000000000..7603553e42
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
@@ -0,0 +1,384 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from contextlib import nullcontext
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+
+from transformer_engine.plugin.core.ops import FlashAttentionBase
+
+
+class FlashAttentionTorch(FlashAttentionBase):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
+
+    @property
+    def backend_name(self) -> str:
+        return "torch_sdpa"
+
+    def _convert_layout_to_bhsd(
+        self,
+        tensor: torch.Tensor,
+        layout: str,
+    ) -> torch.Tensor:
+        """Convert tensor from various layouts to [batch, heads, seq, dim] format."""
+        layout = layout.lower()
+
+        # Handle combined layouts like "sbhd_sbhd_sbhd" - extract the first part
+        if "_" in layout:
+            layout = layout.split("_")[0]
+
+        if layout in ("sbhd", "sbh3d", "sb3hd"):
+            return tensor.permute(1, 2, 0, 3)
+        elif layout in ("bshd", "bsh3d", "bs3hd"):
+            return tensor.permute(0, 2, 1, 3)
+        elif layout in ("bhsd",):
+            return tensor
+        elif layout in ("thd",):
+            # thd is packed format, should not reach here for 4D tensors
+            raise ValueError(f"thd layout requires 3D tensor, got {tensor.dim()}D")
+        else:
+            raise ValueError(f"Unsupported qkv_layout: {layout}")
+
+    def _convert_bhsd_to_layout(
+        self,
+        tensor: torch.Tensor,
+        layout: str,
+    ) -> torch.Tensor:
+        """Convert tensor from [batch, heads, seq, dim] back to original layout."""
+        layout = layout.lower()
+
+        # Handle combined layouts like "sbhd_sbhd_sbhd" - extract the first part
+        if "_" in layout:
+            layout = layout.split("_")[0]
+
+        if layout in ("sbhd", "sbh3d", "sb3hd"):
+            return tensor.permute(2, 0, 1, 3)
+        elif layout in ("bshd", "bsh3d", "bs3hd"):
+            return tensor.permute(0, 2, 1, 3)
+        elif layout in ("bhsd",):
+            return tensor
+        elif layout in ("thd",):
+            raise ValueError(f"thd layout requires 3D tensor, got {tensor.dim()}D")
+        else:
+            raise ValueError(f"Unsupported qkv_layout: {layout}")
+
+    def _create_sliding_window_mask(
+        self,
+        seq_len_q: int,
+        seq_len_kv: int,
+        window_size: Tuple[int, int],
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        """Create a sliding window attention mask."""
+        left_window, right_window = window_size
+
+        if left_window == -1 and right_window == -1:
+            return torch.zeros(seq_len_q, seq_len_kv, dtype=dtype, device=device)
+
+        q_idx = torch.arange(seq_len_q, device=device).unsqueeze(1)
+        kv_idx = torch.arange(seq_len_kv, device=device).unsqueeze(0)
+
+        mask_bool = torch.zeros(seq_len_q, seq_len_kv, dtype=torch.bool, device=device)
+
+        if left_window >= 0:
+            mask_bool = mask_bool | (kv_idx < q_idx - left_window)
+
+        if right_window >= 0:
+            mask_bool = mask_bool | (kv_idx > q_idx + right_window)
+
+        mask = torch.zeros(seq_len_q, seq_len_kv, dtype=dtype, device=device)
+        mask.masked_fill_(mask_bool, float('-inf'))
+
+        return mask
+
+    def _unpack_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Convert packed tensor to padded tensor format."""
+        batch_size = cu_seqlens.shape[0] - 1
+        device = tensor.device
+        original_shape = tensor.shape
+
+        if tensor.dim() == 4:
+            if tensor.shape[1] == 1:
+                tensor = tensor.squeeze(1)
+            else:
+                raise ValueError(
+                    f"Unexpected 4D tensor shape {original_shape}. "
+                    f"Expected [total_tokens, 1, num_heads, head_dim]"
+                )
+
+        if tensor.dim() != 3:
+            raise ValueError(
+                f"Expected tensor to be 3D or 4D after processing, got shape {original_shape}"
+            )
+
+        total_tokens, num_heads, head_dim = tensor.shape
+
+        expected_total = cu_seqlens[-1].item()
+        if total_tokens != expected_total:
+            raise ValueError(
+                f"Tensor has {total_tokens} tokens but cu_seqlens indicates {expected_total} tokens"
+            )
+
+        padded_tensor = torch.zeros(
+            batch_size, num_heads, max_seqlen, head_dim,
+            dtype=tensor.dtype, device=device
+        )
+
+        padding_mask = torch.ones(batch_size, max_seqlen, dtype=torch.bool, device=device)
+
+        for i in range(batch_size):
+            start = cu_seqlens[i].item()
+            end = cu_seqlens[i + 1].item()
+            seq_len = end - start
+
+            seq_data = tensor[start:end].permute(1, 0, 2)
+            padded_tensor[i, :, :seq_len, :] = seq_data
+            padding_mask[i, :seq_len] = False
+
+        return padded_tensor, padding_mask
+
+    def _pack_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        """Convert padded tensor back to packed tensor format."""
+        batch_size = tensor.shape[0]
+        num_heads = tensor.shape[1]
+        head_dim = tensor.shape[3]
+        total_tokens = cu_seqlens[-1].item()
+        device = tensor.device
+
+        packed_tensor = torch.zeros(
+            total_tokens, num_heads, head_dim,
+            dtype=tensor.dtype, device=device
+        )
+
+        for i in range(batch_size):
+            start = cu_seqlens[i].item()
+            end = cu_seqlens[i + 1].item()
+            seq_len = end - start
+
+            seq_data = tensor[i, :, :seq_len, :].permute(1, 0, 2)
+            packed_tensor[start:end, :, :] = seq_data
+
+        return packed_tensor
+
+    def _forward_impl(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        """Flash Attention implementation using PyTorch's scaled_dot_product_attention."""
+        if fp8:
+            raise NotImplementedError("FP8 is not supported in PyTorch SDPA backend")
+        if cp_group is not None:
+            raise NotImplementedError("Context parallelism is not supported in PyTorch SDPA backend")
+        if alibi_slopes is not None:
+            raise NotImplementedError("ALiBi slopes are not supported in PyTorch SDPA backend")
+
+        query_original_shape = query_layer.shape
+
+        # Check if input is in standard 4D format - same as flagos backend
+        # If tensor is 4D, treat it as standard format and just do layout conversion
+        # Only use unpack logic for true packed format (3D tensors with thd layout)
+        is_standard_4d = query_layer.dim() == 4
+
+        if is_standard_4d:
+            # Standard 4D tensor format - just convert layout like flagos does
+            query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+            key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
+            value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
+            use_packed_format = False
+            padding_mask_q = None
+            padding_mask_kv = None
+        else:
+            # True packed format (thd layout, 3D tensor) - use unpack logic
+            use_packed_format = cu_seqlens_q is not None or cu_seqlens_kv is not None
+            padding_mask_q = None
+            padding_mask_kv = None
+
+            if use_packed_format:
+                if cu_seqlens_q is not None:
+                    query, padding_mask_q = self._unpack_tensor(query_layer, cu_seqlens_q, max_seqlen_q)
+                else:
+                    query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+
+                if cu_seqlens_kv is not None:
+                    key, padding_mask_kv = self._unpack_tensor(key_layer, cu_seqlens_kv, max_seqlen_kv)
+                    value, _ = self._unpack_tensor(value_layer, cu_seqlens_kv, max_seqlen_kv)
+                else:
+                    key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
+                    value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
+            else:
+                query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
+                key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
+                value = self._convert_layout_to_bhsd(value_layer, qkv_layout)
+
+        batch_size, num_heads_q, seq_len_q, head_dim = query.shape
+        num_heads_kv = key.shape[1]
+        seq_len_kv = key.shape[2]
+
+        if num_heads_q != num_heads_kv:
+            num_groups = num_heads_q // num_heads_kv
+            if num_heads_q % num_heads_kv != 0:
+                raise ValueError(
+                    f"num_heads_q ({num_heads_q}) must be divisible by num_heads_kv ({num_heads_kv})"
+                )
+            key = key.repeat_interleave(num_groups, dim=1)
+            value = value.repeat_interleave(num_groups, dim=1)
+
+        attn_mask = None
+        is_causal = False
+
+        if use_packed_format and padding_mask_kv is not None:
+            attn_mask = torch.zeros(
+                batch_size, seq_len_q, seq_len_kv,
+                dtype=query.dtype, device=query.device
+            )
+            padding_broadcast = padding_mask_kv.unsqueeze(1)
+            attn_mask.masked_fill_(padding_broadcast, float('-inf'))
+
+        if attn_mask_type == "causal":
+            is_causal = True
+            attn_mask = None
+            # if window_size is None and not use_packed_format:
+            #     is_causal = True
+            # else:
+            #     causal_mask = torch.zeros(
+            #         seq_len_q, seq_len_kv,
+            #         dtype=query.dtype, device=query.device
+            #     )
+            #     causal_mask.masked_fill_(
+            #         torch.triu(torch.ones(seq_len_q, seq_len_kv, device=query.device, dtype=torch.bool), diagonal=1),
+            #         float('-inf')
+            #     )
+
+            #     if attn_mask is not None:
+            #         if attn_mask.dim() == 2:
+            #             attn_mask = attn_mask + causal_mask
+            #         else:
+            #             attn_mask = attn_mask + causal_mask.unsqueeze(0)
+            #     else:
+            #         attn_mask = causal_mask
+
+        if window_size is not None and not is_causal:
+            window_mask = self._create_sliding_window_mask(
+                seq_len_q=seq_len_q,
+                seq_len_kv=seq_len_kv,
+                window_size=window_size,
+                device=query.device,
+                dtype=query.dtype,
+            )
+
+            if attn_mask is not None:
+                attn_mask = attn_mask + window_mask.unsqueeze(0)
+            else:
+                attn_mask = window_mask
+
+        if attention_mask is not None and attn_mask_type != "causal":
+            if isinstance(attention_mask, tuple):
+                explicit_mask = attention_mask[0]
+            else:
+                explicit_mask = attention_mask
+
+            if explicit_mask.dtype == torch.bool:
+                float_mask = torch.zeros_like(explicit_mask, dtype=query.dtype)
+                float_mask.masked_fill_(~explicit_mask, float('-inf'))
+                explicit_mask = float_mask
+
+            if explicit_mask.dim() == 2:
+                explicit_mask = explicit_mask.unsqueeze(0).unsqueeze(0)
+            elif explicit_mask.dim() == 3:
+                explicit_mask = explicit_mask.unsqueeze(1)
+
+            if attn_mask is not None:
+                if attn_mask.dim() == 2:
+                    attn_mask = attn_mask.unsqueeze(0).unsqueeze(0)
+                elif attn_mask.dim() == 3:
+                    attn_mask = attn_mask.unsqueeze(1)
+                attn_mask = attn_mask + explicit_mask
+            else:
+                attn_mask = explicit_mask
+        elif attn_mask is not None:
+            if attn_mask.dim() == 2:
+                attn_mask = attn_mask.unsqueeze(0).unsqueeze(0)
+            elif attn_mask.dim() == 3:
+                attn_mask = attn_mask.unsqueeze(1)
+
+        with self.attention_dropout_ctx():
+            dropout_p = self.attention_dropout if self.training else 0.0
+
+            output = F.scaled_dot_product_attention(
+                query=query,
+                key=key,
+                value=value,
+                attn_mask=attn_mask,
+                dropout_p=dropout_p,
+                is_causal=is_causal,
+                scale=self.softmax_scale,
+            )
+
+        if use_packed_format and padding_mask_q is not None:
+            mask_expanded = padding_mask_q.unsqueeze(1).unsqueeze(3)
+            output = output.masked_fill(mask_expanded, 0.0)
+
+        if use_packed_format and cu_seqlens_q is not None:
+            output = self._pack_tensor(output, cu_seqlens_q)
+
+            if len(query_original_shape) == 4:
+                total_tokens = output.shape[0]
+                hidden_size = output.shape[1] * output.shape[2]
+                output = output.contiguous().view(total_tokens, 1, hidden_size)
+        else:
+            output = self._convert_bhsd_to_layout(output, qkv_layout)
+            # Flatten the last two dimensions (heads, dim) -> (heads * dim)
+            # to match the output format of other backends
+            output = output.contiguous().view(*output.shape[:-2], -1)
+
+        return output
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
new file mode 100644
index 0000000000..55954cf423
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from transformer_engine.plugin.core.ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
+
+
+class KunLunXinBackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return True
+
+    def is_available(self) -> bool:
+        return True
+
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionTorch
+        return FlashAttentionTorch
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
new file mode 100644
index 0000000000..10fa74bd31
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+KunLunXin backend operator registrations.
+
+This module registers all KunLunXin PyTorch implementations.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from transformer_engine.plugin.core.types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all KunLunXin PyTorch operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    from .kunlunxin import KunLunXinBackend
+
+    # Create a backend instance to access the methods
+    backend = KunLunXinBackend()
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        # FlashAttention class getter
+        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.kunlunxin", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="KUNLUNXIN", priority=100),
+
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/builtin_ops.py b/transformer_engine/plugin/core/builtin_ops.py
index 7270173f4b..c2c10ece2e 100644
--- a/transformer_engine/plugin/core/builtin_ops.py
+++ b/transformer_engine/plugin/core/builtin_ops.py
@@ -64,3 +64,10 @@ def register_builtins(registry: OpRegistry) -> None:
         # Metax may not be available, this is expected
         pass
 
+    # Register KUNLUNXIN (VENDOR) implementations
+    try:
+        from .backends.vendor.kunlunxin.register_ops import register_builtins as register_kunlunxin
+        register_kunlunxin(registry)
+    except Exception as e:
+        # KunLunXin may not be available, this is expected
+        pass
\ No newline at end of file

From de00a8acfa7c2b38b5afa8f6ef7e2565bac95183 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Mon, 26 Jan 2026 17:03:49 +0800
Subject: [PATCH 30/59] Fix the incorrect registration on Kunlunxin (#29)

Fix kunlunxin register errors
---
 .../backends/vendor/kunlunxin/kunlunxin.py    | 30 +++++++++++++++++--
 .../backends/vendor/kunlunxin/register_ops.py |  3 ++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
index 55954cf423..5d7da9e165 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
@@ -3,6 +3,7 @@
 # See LICENSE for license information.
 
 import os
+import subprocess
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -10,13 +11,38 @@
 from transformer_engine.plugin.core.ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
 
 
+def _check_kunlunxin_available() -> bool:
+    """Check if xpu-smi command can be executed successfully."""
+    try:
+        result = subprocess.run(
+            ["xpu-smi"],
+            capture_output=True,
+            timeout=5,
+            text=True
+        )
+        
+        if result.returncode == 0:
+            return True
+        else:
+            return False
+            
+    except subprocess.TimeoutExpired:
+        return False
+    except FileNotFoundError:
+        return False
+    except OSError as e:
+        return False
+    except Exception as e:
+        return False
+
+
 class KunLunXinBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
-        return True
+        return _check_kunlunxin_available()
 
     def is_available(self) -> bool:
-        return True
+        return _check_kunlunxin_available()
 
     def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionTorch
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
index 10fa74bd31..1585d0cf9d 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
@@ -35,6 +35,9 @@ def register_builtins(registry) -> None:
 
     # Create a backend instance to access the methods
     backend = KunLunXinBackend()
+    
+    if not backend.is_available():
+        return
 
     # Bind is_available to all methods
     is_avail = backend.is_available

From 35e18095963e98874ef578bad953e5f5ad082969 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Mon, 26 Jan 2026 19:05:06 +0800
Subject: [PATCH 31/59] Polish available check for kunlunxin (#30)

- Polish available check for kunlunxin
---
 .../backends/vendor/kunlunxin/kunlunxin.py    | 52 ++++++++++++-------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
index 5d7da9e165..6066a53892 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
@@ -10,29 +10,41 @@
 
 from transformer_engine.plugin.core.ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
 
+_kunlunxin_available = False
+
+def _ensure_kunlunxin_available():
+    global _kunlunxin_available
+    if not _kunlunxin_available:
+        try:
+            result = subprocess.run(
+                ["xpu-smi"],
+                capture_output=True,
+                timeout=10,
+                text=True
+            )
+            
+            if result.returncode == 0:
+                _kunlunxin_available = True
+            else:
+                _kunlunxin_available = False
+                
+        except subprocess.TimeoutExpired:
+            _kunlunxin_available = False
+        except FileNotFoundError:
+            _kunlunxin_available = False
+        except OSError as e:
+            _kunlunxin_available = False
+        except Exception as e:
+            _kunlunxin_available = False
+    
+    return _kunlunxin_available
+
 
 def _check_kunlunxin_available() -> bool:
     """Check if xpu-smi command can be executed successfully."""
-    try:
-        result = subprocess.run(
-            ["xpu-smi"],
-            capture_output=True,
-            timeout=5,
-            text=True
-        )
-        
-        if result.returncode == 0:
-            return True
-        else:
-            return False
-            
-    except subprocess.TimeoutExpired:
-        return False
-    except FileNotFoundError:
-        return False
-    except OSError as e:
-        return False
-    except Exception as e:
+    if _ensure_kunlunxin_available():
+        return True
+    else:
         return False
 
 

From 8690ab4c2ce3d1d046cc5ae60ab6a3308cb3f36b Mon Sep 17 00:00:00 2001
From: dinghaodhd <986165956@qq.com>
Date: Wed, 28 Jan 2026 21:24:43 +0800
Subject: [PATCH 32/59] Add new register op get_attention_backend for METAX
 (#31)

# Description

Add new register op get_attention_backend for METAX

Fixes # (issue)

## Type of change

- [ ] New feature (non-breaking change which adds functionality)

## Changes

Please list the changes introduced in this PR:

- Add register for get_attention_backend in register_ops.py
- Add implement of get_attention_backend in metax.py

# Checklist:

- [x] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [x] The functionality is complete
- [x] I have commented my code, particularly in hard-to-understand areas
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
---
 .../plugin/core/backends/vendor/metax/metax.py  | 17 +++++++++++++++++
 .../core/backends/vendor/metax/register_ops.py  |  2 ++
 2 files changed, 19 insertions(+)

diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index 0baea24a2e..8efbbc9490 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -158,6 +158,23 @@ def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionMETAX
         return FlashAttentionMETAX
 
+    def get_attention_backend(self, attention_params=None):
+        # Import the metax get_attention_backend function
+        try:
+            from transformer_engine_metax.pytorch.attention.dot_product_attention import utils
+            return utils.get_attention_backend(attention_params)
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"Failed to import metax FlashAttention: {e}. "
+                "Please ensure flash-attn is installed and transformer_engine_metax is available."
+            )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to get_attention_backend: {e}. "
+                f"Attention_params: {self.attention_params}"
+            )
+
     def quantize(
         self,
         tensor: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
index 10ccc83c99..a404bbbdc7 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
@@ -197,6 +197,8 @@ def register_builtins(registry) -> None:
 
         # FlashAttention class getter
         OpImpl(op_name="get_flash_attention_class", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="METAX", priority=100),
+                # Attention backend selection
+        OpImpl(op_name="get_attention_backend", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="METAX", priority=100),
     ]
 
     registry.register_many(impls)

From b0a5934ba74e8254294d41a5ec152705824dae6b Mon Sep 17 00:00:00 2001
From: DannyP0 <14259448+DannyP0@users.noreply.github.com>
Date: Thu, 5 Feb 2026 19:05:51 +0800
Subject: [PATCH 33/59] [iluvatar]add vendor/iluvatar backend (#35)

# Description

[iluvatar]add vendor/iluvatar backend
---
 .../core/backends/vendor/iluvatar/__init__.py |    7 +
 .../core/backends/vendor/iluvatar/iluvatar.py | 1109 +++++++++++++++++
 .../backends/vendor/iluvatar/register_ops.py  |  205 +++
 transformer_engine/plugin/core/builtin_ops.py |    8 +
 4 files changed, 1329 insertions(+)
 create mode 100644 transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py

diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py
new file mode 100644
index 0000000000..ebf1092308
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .iluvatar import IluvatarBackend
+
+__all__ = ["IluvatarBackend"]
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
new file mode 100644
index 0000000000..5013fa7c23
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -0,0 +1,1109 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import math
+import torch
+
+from ....ops import TEFLBackendBase, FP8TensorMeta
+
+
+def _load_iluvatar_libs():
+    import ctypes
+    import os
+    import subprocess
+    from pathlib import Path
+    import importlib.util
+    import sysconfig
+    import platform
+    import glob as glob_module
+
+    def get_ext():
+        system = platform.system()
+        return ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
+
+    ext = get_ext()
+
+    def try_load_lib(name, search_patterns):
+        for env_var in [f"{name.upper()}_HOME", f"{name.upper()}_PATH"]:
+            path = os.environ.get(env_var)
+            if path:
+                libs = glob_module.glob(f"{path}/**/lib{name}{ext}*", recursive=True)
+                if libs:
+                    libs.sort(reverse=True, key=os.path.basename)
+                    try:
+                        return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+                    except:
+                        pass
+
+        cuda_home = os.environ.get("IX_HOME") or os.environ.get("IX_PATH") or "/usr/local/corex"
+        for pattern in search_patterns:
+            libs = glob_module.glob(f"{cuda_home}/**/{pattern}", recursive=True)
+            if libs:
+                libs.sort(reverse=True, key=os.path.basename)
+                try:
+                    return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+                except:
+                    pass
+
+        try:
+            result = subprocess.check_output(f"ldconfig -p | grep 'lib{name}{ext}'", shell=True)
+            for line in result.decode().split('\n'):
+                if f"lib{name}" in line and "=>" in line:
+                    so_path = line.split(">")[1].strip()
+                    if so_path:
+                        return ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL)
+        except:
+            pass
+
+        try:
+            return ctypes.CDLL(f"lib{name}{ext}", mode=ctypes.RTLD_GLOBAL)
+        except:
+            return None
+
+    try:
+        try_load_lib("cudnn", [f"libcudnn{ext}*"])
+        try_load_lib("nvrtc", [f"libnvrtc{ext}*"])
+        try_load_lib("curand", [f"libcurand{ext}*"])
+
+        te_path = Path(importlib.util.find_spec("transformer_engine_iluvatar").origin).parent.parent
+        for search_dir in [te_path, te_path / "transformer_engine_iluvatar/libs"]:
+            if search_dir.exists():
+                matches = list(search_dir.glob(f"libixte_common{ext}*"))
+                if matches:
+                    ctypes.CDLL(str(matches[0]), mode=ctypes.RTLD_GLOBAL)
+                    return True
+        return False
+    except Exception as e:
+        print(f"[ILUVATAR] Failed to load ILUVATAR libs: {e}")
+        return False
+
+_iluvatar_libs_loaded = False
+
+def _ensure_iluvatar_libs():
+    global _iluvatar_libs_loaded
+    if not _iluvatar_libs_loaded:
+        _iluvatar_libs_loaded = _load_iluvatar_libs()
+    return _iluvatar_libs_loaded
+
+def _check_iluvatar_available() -> bool:
+    if not torch.cuda.is_available():
+        return False
+    import os
+    try:
+        if not _ensure_iluvatar_libs():
+           return False
+        import transformer_engine_iluvatar
+        return True
+    except (ImportError, OSError) as e:
+        print(f"[ILUVATAR] Import failed: {e}")
+        return False
+
+def _get_tex():
+    import transformer_engine_iluvatar.pytorch.ixte_torch
+    return transformer_engine_iluvatar.pytorch.ixte_torch
+
+def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
+    if torch_dtype is None:
+        return None
+
+    NativeDType = tex_module.DType
+    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_iluvatar.pytorch.ixte_torch':
+        return torch_dtype
+
+    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
+        from transformer_engine.plugin.core.ops import DType as PyDType
+        if isinstance(torch_dtype, PyDType):
+            dtype_name = torch_dtype.name
+            if hasattr(NativeDType, dtype_name):
+                return getattr(NativeDType, dtype_name)
+
+    dtype_map = {
+        torch.uint8: NativeDType.kByte,
+        torch.float8_e4m3fn: NativeDType.kFloat8E4M3,
+        torch.float8_e5m2: NativeDType.kFloat8E5M2,
+        torch.int32: NativeDType.kInt32,
+        torch.float32: NativeDType.kFloat32,
+        torch.half: NativeDType.kFloat16,
+        torch.bfloat16: NativeDType.kBFloat16,
+    }
+
+    return dtype_map.get(torch_dtype, torch_dtype)
+
+def _convert_dtype_params(func):
+    import functools
+    import inspect
+    import os
+
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        dtype_params = ['otype', 'output_dtype', 'bias_type']
+
+        from transformer_engine.plugin.core.ops import DType as PyDType
+
+        def needs_conversion(val):
+            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
+
+        for param_name in dtype_params:
+            if param_name in kwargs:
+                value = kwargs[param_name]
+                if needs_conversion(value):
+                    converted = self._to_te_dtype(value)
+                    kwargs[param_name] = converted
+
+        sig = inspect.signature(func)
+        param_names = list(sig.parameters.keys())[1:]
+
+        args_list = list(args)
+        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
+            if param_name in dtype_params and needs_conversion(arg_value):
+                converted = self._to_te_dtype(arg_value)
+                args_list[i] = converted
+
+        return func(self, *args_list, **kwargs)
+
+    return wrapper
+
+class IluvatarBackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return _check_iluvatar_available()
+
+    def __init__(self):
+        self._tex = None
+
+    def _get_tex(self):
+        if self._tex is None:
+            self._tex = _get_tex()
+        return self._tex
+
+    def _to_te_dtype(self, torch_dtype):
+        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
+
+    def is_available(self) -> bool:
+        return _check_iluvatar_available()
+    
+    def get_flash_attention_class(self):
+        raise NotImplementedError("get_flash_attention_class - not implemented in iluvatar backend")
+
+    def get_attention_backend(self, attention_params=None):
+        raise NotImplementedError("get_attention_backend - not implemented in iluvatar backend")
+    
+    def quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        output: Optional[torch.Tensor] = None,
+        noop: Optional[torch.Tensor] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.quantize(tensor, quantizer, output, noop)
+
+    @_convert_dtype_params
+    def dequantize(
+        self,
+        input: torch.Tensor,
+        otype: torch.dtype,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dequantize(input, otype)
+
+    def bgrad_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.bgrad_quantize(input, quantizer)
+
+    @_convert_dtype_params
+    def generic_gemm(
+        self,
+        A: torch.Tensor,
+        transA: bool,
+        B: torch.Tensor,
+        transB: bool,
+        D: torch.Tensor,
+        quantizer: Any,
+        output_dtype: torch.dtype,
+        bias: Optional[torch.Tensor],
+        bias_type: Any,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[Any] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> Any:
+        # Check shape
+        tex = self._get_tex()
+
+        if bias_type is None:
+            bias_type = self._to_te_dtype(torch.bfloat16)
+
+        return tex.generic_gemm(
+            A, transA, B, transB, D, quantizer, output_dtype,
+            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
+            accumulate, use_split_accumulator, comm_overlap, comm_type,
+            extra_output, bulk_overlap, alpha, beta
+        )
+
+    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm(*args, **kwargs)
+
+    def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.gelu(input, quantizer)
+
+    def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.geglu(input, quantizer)
+
+    def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgelu(input, quantizer)
+
+    def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgeglu(input, quantizer)
+
+    def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.relu(input, quantizer)
+
+    def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.reglu(input, quantizer)
+
+    def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.srelu(input, quantizer)
+
+    def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.sreglu(input, quantizer)
+
+    def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.silu(input, quantizer)
+
+    def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.swiglu(input, quantizer)
+
+    def clamped_swiglu(
+            self,
+            input: torch.Tensor,
+            quantizer: Any,
+            limit: float = 7.0,
+            alpha: float = 1.702,
+        ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
+    def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgelu(grad, fwd_input, quantizer)
+
+    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgeglu(grad, fwd_input, quantizer)
+
+    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgelu(grad, fwd_input, quantizer)
+
+    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgeglu(grad, fwd_input, quantizer)
+
+    def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.drelu(grad, fwd_input, quantizer)
+
+    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dreglu(grad, fwd_input, quantizer)
+
+    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsrelu(grad, fwd_input, quantizer)
+
+    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsreglu(grad, fwd_input, quantizer)
+
+    def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsilu(grad, fwd_input, quantizer)
+
+    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dswiglu(grad, fwd_input, quantizer)
+
+    def clamped_dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_drelu(grad, fwd_input, quantizer)
+
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dqgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsrelu(grad, fwd_input, quantizer)
+
+    @_convert_dtype_params
+    def layernorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, mu, rsigma = tex.layernorm_fwd(
+            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+        return y, mu, rsigma
+
+    def layernorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dgamma, dbeta
+
+    @_convert_dtype_params
+    def rmsnorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        ln_out: Optional[torch.Tensor],
+        quantizer: Any,
+        otype: torch.dtype,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = input.shape
+        if input.ndim > 2:
+            input = input.view(-1, input.shape[-1])
+
+        y, y_quant, rsigma = tex.rmsnorm_fwd(
+            input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+        if len(orig_shape) > 2:
+            y = y.view(*orig_shape)
+            if y_quant is not None:
+                y_quant = y_quant.view(*orig_shape)
+        return y, y_quant, rsigma
+
+    def rmsnorm_bwd(
+        self,
+        dy: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int = 0,
+        zero_centered_gamma: bool = False,
+        eps: float = 1e-5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+
+        orig_shape = dy.shape
+        if dy.ndim > 2:
+            dy = dy.view(-1, dy.shape[-1])
+            x = x.view(-1, x.shape[-1])
+
+        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+        if len(orig_shape) > 2:
+            dx = dx.view(*orig_shape)
+        return dx, dw
+
+    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.rmsnorm_bwd_add(*args, **kwargs)
+
+    def multi_tensor_quantize(
+        self,
+        tensor_list: List[torch.Tensor],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
+    def split_quantize(
+        self,
+        tensor: torch.Tensor,
+        split_sections: List[int],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.split_quantize(tensor, split_sections, quantizer_list)
+
+    def moe_permute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex._moe_permute_fwd(*args, **kwargs)
+    
+    def moe_permute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex._moe_permute_bwd(*args, **kwargs)
+
+    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex._moe_unpermute_fwd(*args, **kwargs)
+
+    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex._moe_unpermute_bwd(*args, **kwargs)
+
+    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+
+    def scaled_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale)
+
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
+
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)  
+
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
+
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad: torch.Tensor,
+        softmax_output: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
+
+    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        tex = self._get_tex()
+
+        args_list = list(args)
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+
+            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
+                return py_enum
+
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+
+            if hasattr(py_enum, 'value'):
+                enum_value = int(py_enum.value)
+                for member_name in dir(native_enum_class):
+                    if not member_name.startswith('_'):
+                        try:
+                            member = getattr(native_enum_class, member_name)
+                            if hasattr(member, 'value') and int(member.value) == enum_value:
+                                return member
+                        except:
+                            pass
+
+            if hasattr(py_enum, 'value'):
+                return int(py_enum.value)
+
+            return py_enum
+
+        if len(args) > 1:
+            args_list[1] = self._to_te_dtype(args[1])
+        if len(args) > 2:
+            args_list[2] = self._to_te_dtype(args[2])
+        if len(args) > 3:
+            args_list[3] = convert_enum(args[3], tex.NVTE_QKV_Layout)
+        if len(args) > 4:
+            args_list[4] = convert_enum(args[4], tex.NVTE_Bias_Type)
+        if len(args) > 5:
+            args_list[5] = convert_enum(args[5], tex.NVTE_Mask_Type)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_Softmax_Type)
+
+        return tex.get_fused_attn_backend(*args_list, **kwargs)
+
+    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
+                return py_enum
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+            return py_enum
+
+        args_list = list(args)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_QKV_Layout)
+        if len(args) > 7:
+            args_list[7] = convert_enum(args[7], tex.NVTE_Bias_Type)
+        if len(args) > 8:
+            args_list[8] = convert_enum(args[8], tex.NVTE_Mask_Type)
+        if len(args) > 9:
+            args_list[9] = convert_enum(args[9], tex.NVTE_Softmax_Type)
+
+        return tex.fused_attn_fwd(*args_list, **kwargs)
+
+    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+
+        def convert_enum(py_enum, native_enum_class):
+            if py_enum is None:
+                return None
+            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
+                return py_enum
+            if hasattr(py_enum, 'name'):
+                enum_name = py_enum.name
+                if hasattr(native_enum_class, enum_name):
+                    return getattr(native_enum_class, enum_name)
+            return py_enum
+
+        args_list = list(args)
+        if len(args) > 5:
+            args_list[5] = convert_enum(args[5], tex.NVTE_QKV_Layout)
+        if len(args) > 6:
+            args_list[6] = convert_enum(args[6], tex.NVTE_Bias_Type)
+        if len(args) > 7:
+            args_list[7] = convert_enum(args[7], tex.NVTE_Mask_Type)
+        if len(args) > 8:
+            args_list[8] = convert_enum(args[8], tex.NVTE_Softmax_Type)
+        if len(args) > 19:
+            args_list[19] = self._to_te_dtype(args[19])
+
+        if 'dqkv_dtype' in kwargs:
+            kwargs['dqkv_dtype'] = self._to_te_dtype(kwargs['dqkv_dtype'])
+
+        return tex.fused_attn_bwd(*args_list, **kwargs)
+
+    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_fwd(*args, **kwargs)
+
+    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fa_prepare_bwd(*args, **kwargs)
+
+    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.copy_to_kv_cache(*args, **kwargs)
+
+    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_thd_to_bshd(*args, **kwargs)
+
+    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.convert_bshd_to_thd(*args, **kwargs)
+
+    def fused_rope_forward(self, *args, **kwargs) -> Any:
+        assert args[2] is None, "[Iluvatar] fused_rope_forward does not support start_position now."
+        assert args[3].name == "NVTE_SBHD", f"[Iluvatar] fused_rope_forward expect NVTE_SBHD, but got {args[3].name}."
+        tex = self._get_tex()
+        return tex.fused_rope_forward(args[0], args[1], False, False, 1.0)
+
+    def fused_rope_backward(self, *args, **kwargs) -> Any:
+        assert args[2].name == "NVTE_SBHD", f"[Iluvatar] fused_rope_backward expect NVTE_SBHD, but got {args[2].name}."
+        tex = self._get_tex()
+        return tex.fused_rope_backward(args[0], args[1], False, False, 1.0)
+
+    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_forward(*args, **kwargs)
+
+    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_qkv_rope_backward(*args, **kwargs)
+
+    def fused_topk_with_score_function_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        num_groups: int,
+        group_topk: int,
+        scaling_factor: float,
+        score_function: Any,
+        expert_bias: Optional[torch.Tensor],
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_fwd(
+            logits, topk, use_pre_softmax, num_groups, group_topk,
+            scaling_factor, score_function, expert_bias
+        )
+
+    def fused_topk_with_score_function_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        routing_map: torch.Tensor,
+        intermediate_output: torch.Tensor,
+        grad_probs: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        scaling_factor: float,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_bwd(
+            num_tokens, num_experts, routing_map, intermediate_output,
+            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+        )
+
+    def fused_score_for_moe_aux_loss_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
+
+    def fused_score_for_moe_aux_loss_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        intermediate_output: torch.Tensor,
+        grad_scores: torch.Tensor,
+        topk: int,
+        score_function: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_bwd(
+            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+        )
+
+    def fused_moe_aux_loss_fwd(
+        self,
+        probs: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        total_num_tokens: int,
+        num_experts: int,
+        num_rows: int,
+        num_cols: int,
+        topk: int,
+        coeff: float,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_fwd(
+            probs, tokens_per_expert, total_num_tokens, num_experts,
+            num_rows, num_cols, topk, coeff
+        )
+
+    def fused_moe_aux_loss_bwd(
+        self,
+        Const_buf: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        num_rows: int,
+        num_cols: int,
+        grad_aux_loss: torch.Tensor,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
+
+    def dropout_fwd(
+        self,
+        input: torch.Tensor,
+        dropout_probability: float,
+        out: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.dropout_fwd(input, dropout_probability, out)
+
+    def dropout_bwd(
+        self,
+        grad_output: torch.Tensor,
+        mask: torch.Tensor,
+        dropout_probability: float,
+        grad_input: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
+
+    def fp8_transpose(
+        self,
+        input: torch.Tensor,
+        dtype: Any,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_transpose(input, dtype, out=out)
+
+    def swap_first_dims(
+        self,
+        tensor: torch.Tensor,
+        *,
+        out: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.swap_first_dims(tensor, out=out)
+
+    def compute_amax(
+        self,
+        input: torch.Tensor,
+        amax: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        tex.compute_amax(input, amax)
+
+    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
+
+    def fp8_block_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
+
+    def fp8_block_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: Any,
+    ) -> None:
+        tex = self._get_tex()
+        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
+
+    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_padding(*args, **kwargs)
+
+    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(*args, **kwargs)
+
+    def get_cublasLt_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cublasLt_version()
+
+    def get_cudnn_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cudnn_version()
+
+    def get_num_cublas_streams(self) -> int:
+        tex = self._get_tex()
+        return tex.get_num_cublas_streams()
+
+    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_half_tensor(*args, **kwargs)
+
+    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_second_half_lse_correction(*args, **kwargs)
+
+    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_read_second_half_lse(*args, **kwargs)
+
+    def thd_out_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_out_correction(*args, **kwargs)
+
+    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_grad_correction(*args, **kwargs)
+
+    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        tex = self._get_tex()
+        return tex.thd_get_partitioned_indices(*args, **kwargs)
+
+    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.init_nvshmem_backend(*args, **kwargs)
+
+    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.create_nvshmem_tensor(*args, **kwargs)
+
+    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_send_on_current_stream(*args, **kwargs)
+
+    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_wait_on_current_stream(*args, **kwargs)
+
+    def nvshmem_finalize(self) -> None:
+        tex = self._get_tex()
+        tex.nvshmem_finalize()
+
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
+    def multi_tensor_unscale_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+        per_tensor: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
+
+    def multi_tensor_adam(
+        self,
+        chunk_size: int = None,
+        noop_flag: torch.Tensor = None,
+        tensor_lists: List[List[torch.Tensor]] = None,
+        lr: float = None,
+        beta1: float = None,
+        beta2: float = None,
+        eps: float = None,
+        step: int = None,
+        mode: int = None,
+        bias_correction: int = None,
+        weight_decay: float = None,
+    ):
+        tex = self._get_tex()
+        if chunk_size is None:
+            return tex.multi_tensor_adam
+        tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
+            eps, step, mode, bias_correction, weight_decay
+        )
+
+    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
+
+    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_fp8(*args, **kwargs)
+
+    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable(*args, **kwargs)
+
+    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
+
+    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_sgd(*args, **kwargs)
+
+    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+
+    def bulk_overlap_ag_with_external_gemm(
+        self,
+        allgather_communicator: Any,
+        send_stream: Any,
+        recv_stream: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        tex = self._get_tex()
+        return tex.FP8TensorMeta()
+
+    def create_comm_overlap_helper(
+        self,
+        world_group: Optional[Any] = None,
+        intra_node_group: Optional[Any] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        if world_group is None:
+            return tex.CommOverlapHelper()
+        return tex.CommOverlapHelper(world_group, intra_node_group)
+
+    def create_comm_overlap(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        num_splits: int = 3,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 2,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 16,
+        set_sm_margin: bool = True,
+        atomic_gemm: bool = False,
+        rs_overlap_first_gemm: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlap(
+            buffer_shape, buffer_dtype, helper, tp_size,
+            num_splits, num_max_streams, comm_cga_size,
+            gemm_priority, comm_priority, num_comm_sm,
+            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+        )
+
+    def create_comm_overlap_p2p(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        comm_type: Any,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 1,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 1,
+        set_sm_margin: bool = False,
+        atomic_gemm: bool = False,
+        use_ce: bool = True,
+        aggregate: bool = False,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.CommOverlapP2P(
+            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
+            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
+            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+        )
+
+    
+    
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
new file mode 100644
index 0000000000..b136be2a51
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+Iluvatar vendor backend operator registrations.
+
+This module registers all VENDOR (Iluvatar) implementations from transformer_engine_torch.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from ....types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all Iluvatar (VENDOR) operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    # Import Iluvatar backend to get all the wrapped tex functions
+    from .iluvatar import IluvatarBackend
+
+    # Create a backend instance to access the methods
+    backend = IluvatarBackend()
+
+    # Check if Iluvatar is available before registering
+    if not backend.is_available():
+        return
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        # Normalization
+        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="layernorm_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="layernorm_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="Iluvatar", priority=100),
+
+        # GEMM
+        OpImpl(op_name="generic_gemm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="Iluvatar", priority=100),
+
+        # Quantization
+        OpImpl(op_name="quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dequantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="bgrad_quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="split_quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="Iluvatar", priority=100),
+
+        # Activations - Forward
+        OpImpl(op_name="gelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="geglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="qgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="qgeglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="relu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="reglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="srelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="sreglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="silu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="swiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="clamped_swiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="Iluvatar", priority=100),
+
+        # Activations - Backward
+        OpImpl(op_name="dgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dgeglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dqgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dqgeglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="drelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dreglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dsrelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dsreglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dsilu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dswiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="Iluvatar", priority=100),
+
+        # Activations - Bias + Backward
+        OpImpl(op_name="dbias_dgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dbias_dsilu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dbias_drelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="Iluvatar", priority=100),
+
+        # Softmax
+        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
+
+        # MOE operations
+        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="Iluvatar", priority=100),
+
+        # Fused attention
+        OpImpl(op_name="get_fused_attn_backend", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_attn_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_attn_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="Iluvatar", priority=100),
+
+        # KV cache
+        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="Iluvatar", priority=100),
+
+        # Tensor format conversions
+        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="Iluvatar", priority=100),
+
+        # RoPE (Rotary Position Embedding)
+        OpImpl(op_name="fused_rope_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_rope_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="Iluvatar", priority=100),
+
+        # TopK and MOE aux loss
+        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="Iluvatar", priority=100),
+
+        # Dropout
+        OpImpl(op_name="dropout_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="dropout_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="Iluvatar", priority=100),
+
+        # FP8 operations
+        OpImpl(op_name="fp8_transpose", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="swap_first_dims", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="compute_amax", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="Iluvatar", priority=100),
+
+        # Padding operations
+        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="Iluvatar", priority=100),
+
+        # Library version getters
+        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="get_cudnn_version", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="Iluvatar", priority=100),
+
+        # THD (Tensor, Hidden, Dimension) operations
+        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="thd_out_correction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="thd_grad_correction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="Iluvatar", priority=100),
+
+        # NVSHMEM operations
+        OpImpl(op_name="init_nvshmem_backend", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="create_nvshmem_tensor", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="nvshmem_finalize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor="Iluvatar", priority=100),
+
+        # Multi-tensor operations
+        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="Iluvatar", priority=100),
+
+        # Communication overlap operations
+        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="create_comm_overlap", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="Iluvatar", priority=100),
+
+        # FlashAttention class getter
+        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="Iluvatar", priority=100),
+
+        # Attention backend selection
+        OpImpl(op_name="get_attention_backend", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="Iluvatar", priority=100),
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/builtin_ops.py b/transformer_engine/plugin/core/builtin_ops.py
index c2c10ece2e..0937a3649e 100644
--- a/transformer_engine/plugin/core/builtin_ops.py
+++ b/transformer_engine/plugin/core/builtin_ops.py
@@ -70,4 +70,12 @@ def register_builtins(registry: OpRegistry) -> None:
         register_kunlunxin(registry)
     except Exception as e:
         # KunLunXin may not be available, this is expected
+        pass
+    
+    # Register Iluvatar (VENDOR) implementations
+    try:
+        from .backends.vendor.iluvatar.register_ops import register_builtins as register_iluvatar
+        register_iluvatar(registry)
+    except Exception as e:
+        # Iluvatar may not be available, this is expected
         pass
\ No newline at end of file

From 12b2077827e0cffa4296b79de33c5e0cac4432bd Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Tue, 10 Feb 2026 17:33:25 +0800
Subject: [PATCH 34/59] Fix: Resolve parameter mismatch between TE_FL and NVTE
 functions (#34)

# Description
Align TE_FL backend interface signatures with the upstream NVTE (NVIDIA
TransformerEngine) C++ pybind API, to
resolve parameter mismatches that cause runtime failures.
---
 .../plugin/core/backends/flagos/flagos.py     |  144 +-
 .../core/backends/flagos/impl/fused_adam.py   |    2 +-
 .../core/backends/flagos/impl/multi_tensor.py |    2 +-
 .../core/backends/flagos/impl/rmsnorm.py      |    2 +-
 .../backends/reference/impl/normalization.py  |   29 +-
 .../core/backends/reference/impl/optimizer.py |    2 +-
 .../core/backends/reference/impl/rmsnorm.py   |    1 -
 .../core/backends/reference/reference.py      |  536 +++---
 .../core/backends/reference/register_ops.py   |   80 +-
 .../plugin/core/backends/vendor/cuda/cuda.py  | 1437 +++++++++-------
 .../core/backends/vendor/hygon/hygon.py       | 1341 +++++++++------
 .../core/backends/vendor/iluvatar/iluvatar.py | 1489 ++++++++++-------
 .../backends/vendor/kunlunxin/kunlunxin.py    |    4 +-
 .../core/backends/vendor/metax/metax.py       | 1429 ++++++++++------
 transformer_engine/plugin/core/ops.py         | 1328 ++++++++-------
 .../plugin/tests/test_normalization.py        |    7 +-
 .../plugin/tests/test_operations.py           |   11 +-
 .../plugin/tests/test_optimizer.py            |  156 +-
 .../pytorch/module/layernorm_linear.py        |    2 -
 .../pytorch/ops/basic/rmsnorm.py              |    1 -
 .../pytorch/optimizers/__init__.py            |    2 +-
 .../pytorch/optimizers/fused_adam.py          |    8 +-
 22 files changed, 4796 insertions(+), 3217 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
index ecdc73b33a..03f7c2ed7e 100644
--- a/transformer_engine/plugin/core/backends/flagos/flagos.py
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -7,7 +7,7 @@
 
 import torch
 
-from ...ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
+from ...ops import *
 
 from .impl import (
     rmsnorm_fwd_fl, rmsnorm_bwd_fl,
@@ -20,7 +20,6 @@
 def _check_flagos_available() -> bool:
     return True
 
-
 class FlagOSBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -29,10 +28,6 @@ def check_available() -> bool:
     def is_available(self) -> bool:
         return _check_flagos_available()
 
-    def get_flash_attention_class(self):
-        from .attention.dot_product_attention.backends import FlashAttentionFL
-        return FlashAttentionFL
-
     def get_attention_backend(self, attention_params=None):
         from packaging.version import Version as PkgVersion
         from ...logger_manager import get_logger
@@ -65,17 +60,18 @@ def get_attention_backend(self, attention_params=None):
             available_backends,
         )
 
+##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def generic_gemm(
         self,
-        A: torch.Tensor,
+        A: Any,
         transA: bool,
-        B: torch.Tensor,
+        B: Any,
         transB: bool,
-        D: torch.Tensor,
+        D: Any,
         quantizer: Any,
-        output_dtype: torch.dtype,
+        output_dtype: Optional[DType],
         bias: Optional[torch.Tensor],
-        bias_type: Any,
+        bias_type: DType,
         gelu: bool,
         gelu_in: Optional[torch.Tensor],
         grad: bool,
@@ -84,53 +80,53 @@ def generic_gemm(
         accumulate: bool,
         use_split_accumulator: bool,
         comm_overlap: Optional[Any] = None,
-        comm_type: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
         extra_output: Optional[torch.Tensor] = None,
         bulk_overlap: bool = False,
         alpha: float = 1.0,
         beta: Optional[float] = None,
-    ) -> Any:
+    ) -> List[Any]:
         return generic_gemm_fl(
             A, transA, B, transB, D, quantizer, output_dtype,
-            bias, bias_type, gelu, gelu_in, grad,
-            workspace, workspace_size, accumulate, use_split_accumulator,
-            comm_overlap=comm_overlap, comm_type=comm_type,
-            extra_output=extra_output, bulk_overlap=bulk_overlap,
-            alpha=alpha, beta=beta
+            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
+            accumulate, use_split_accumulator, comm_overlap, comm_type,
+            extra_output, bulk_overlap, alpha, beta
         )
 
+    # Other granular functions
     def rmsnorm_fwd(
         self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
+        input: Any,
+        weight: Any,
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> List[Any]:
         return rmsnorm_fwd_fl(
             input=input, weight=weight, eps=eps, ln_out=ln_out,
             quantizer=quantizer, odtype=otype,
             sm_margin=sm_margin, zero_centered_gamma=zero_centered_gamma,
         )
-
     def rmsnorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-        eps: float = 1e-5,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         return rmsnorm_bwd_fl(
-            dy=dy, x=x, rsigma=rsigma, gamma=gamma,
-            sm_margin=sm_margin, zero_centered_gamma=zero_centered_gamma, eps=eps,
+            dy=dz, x=x, rsigma=rsigma, gamma=gamma,
+            sm_margin=sm_margin, zero_centered_gamma=zero_centered_gamma
         )
+    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        return NVTE_Fused_Attn_Backend.NVTE_No_Backend
 
+    # multi-tensor functions
     def multi_tensor_scale(
         self,
         chunk_size: int,
@@ -139,73 +135,61 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         return multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale)
-
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
-        result, _ = multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor)
-        return result
-
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor)
     def multi_tensor_adam(
         self,
-        chunk_size: int = None,
-        noop_flag: torch.Tensor = None,
-        tensor_lists: List[List[torch.Tensor]] = None,
-        lr: float = None,
-        beta1: float = None,
-        beta2: float = None,
-        eps: float = None,
-        step: int = None,
-        mode: int = None,
-        bias_correction: int = None,
-        weight_decay: float = None,
-    ):
-        if chunk_size is None:
-            return multi_tensor_adam_fl
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         return multi_tensor_adam_fl(
-            chunk_size=chunk_size, noop_flag=noop_flag, tensor_lists=tensor_lists,
-            lr=lr, beta1=beta1, beta2=beta2, eps=eps,
-            step=step, mode=mode, bias_correction=bias_correction, weight_decay=weight_decay,
+            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
         )
-
     def multi_tensor_adam_param_remainder(
         self,
-        chunk_size: int = None,
-        noop_flag: torch.Tensor = None,
-        tensor_lists: List[List[torch.Tensor]] = None,
-        lr: float = None,
-        beta1: float = None,
-        beta2: float = None,
-        eps: float = None,
-        step: int = None,
-        mode: int = None,
-        bias_correction: int = None,
-        weight_decay: float = None,
-    ):
-        if chunk_size is None:
-            return multi_tensor_adam_param_remainder_fl
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         return multi_tensor_adam_param_remainder_fl(
-            chunk_size=chunk_size, noop_flag=noop_flag, tensor_lists=tensor_lists,
-            lr=lr, beta1=beta1, beta2=beta2, eps=eps,
-            step=step, mode=mode, bias_correction=bias_correction, weight_decay=weight_decay,
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
         )
 
+    # Misc
     def get_cublasLt_version(self) -> int:
         return 110000
-
     def get_cudnn_version(self) -> int:
         return 90000
-
     def get_num_cublas_streams(self) -> int:
         return 0
 
-    def get_fused_attn_backend(self, *args, **kwargs) -> int:
-        return NVTE_Fused_Attn_Backend.NVTE_No_Backend
-
-    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
-        return FP8TensorMeta()
-
+############## class func #################################
+    def get_flash_attention_class(self):
+        from .attention.dot_product_attention.backends import FlashAttentionFL
+        return FlashAttentionFL
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
index 93ba067e93..89107b04c2 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -187,4 +187,4 @@ def multi_tensor_adam_param_remainder_fl(
 
         # Write back
         flag_gems.copy_(p, param_bf16)
-        flag_gems.copy_(p_remainder, remainder_int16)
+        flag_gems.copy_(p_remainder, remainder_int16)
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
index 4421487ff1..d7361fd7ed 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
@@ -23,4 +23,4 @@ def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *ar
 def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
 
     for src, dst in zip(tensor_lists[0], tensor_lists[1]):
-        flag_gems.copy_(dst, src * scale)
+        flag_gems.copy_(dst, src * scale)
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
index ffa382147f..12fda567ed 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/rmsnorm.py
@@ -42,7 +42,7 @@ def rmsnorm_bwd_fl(
     gamma,
     sm_margin,
     zero_centered_gamma,
-    eps,
+    eps=1e-5,
 ):
     # When zero_centered_gamma is True, forward uses (1 + gamma) as weight
     # So backward needs to use (1 + gamma) for computing dx
diff --git a/transformer_engine/plugin/core/backends/reference/impl/normalization.py b/transformer_engine/plugin/core/backends/reference/impl/normalization.py
index 6ab7a7648c..48f89b44d8 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/normalization.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/normalization.py
@@ -5,12 +5,37 @@
 from typing import Any, Optional, Tuple
 import torch
 import torch.nn.functional as F
+from ....ops import DType
 
 __all__ = [
     "layernorm_fwd_torch",
     "layernorm_bwd_torch",
 ]
 
+# Mapping from DType enum to torch.dtype
+_DTYPE_TO_TORCH_DTYPE = {
+    DType.kByte: torch.uint8,
+    DType.kInt16: torch.int16,
+    DType.kInt32: torch.int32,
+    DType.kInt64: torch.int64,
+    DType.kFloat32: torch.float32,
+    DType.kFloat16: torch.float16,
+    DType.kBFloat16: torch.bfloat16,
+    DType.kFloat8E4M3: torch.float8_e4m3fn,
+    DType.kFloat8E5M2: torch.float8_e5m2,
+}
+
+def _to_torch_dtype(dtype):
+    """Convert DType enum to torch.dtype."""
+    if dtype is None:
+        return None
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if isinstance(dtype, (int, DType)):
+        dtype_enum = DType(dtype)
+        if dtype_enum in _DTYPE_TO_TORCH_DTYPE:
+            return _DTYPE_TO_TORCH_DTYPE[dtype_enum]
+    raise ValueError(f"Unsupported dtype: {dtype}")
 
 def layernorm_fwd_torch(
     input: torch.Tensor,
@@ -19,10 +44,11 @@ def layernorm_fwd_torch(
     eps: float,
     ln_out: Optional[torch.Tensor],
     quantizer: Any,
-    odtype: torch.dtype,
+    odtype: DType,
     sm_margin: int,
     zero_centered_gamma: bool,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    odtype = _to_torch_dtype(odtype)
     mean = input.mean(dim=-1, keepdim=True)
     var = input.var(dim=-1, keepdim=True, unbiased=False)
     rsigma = torch.rsqrt(var + eps)
@@ -45,7 +71,6 @@ def layernorm_fwd_torch(
 
     return output, mean, rsigma
 
-
 def layernorm_bwd_torch(
     dy: torch.Tensor,
     x: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
index 0ae0809dcc..f3140a5695 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
@@ -310,4 +310,4 @@ def multi_tensor_compute_scale_and_scale_inv_torch(
 
         # Update scale and scale_inv
         scale.copy_(computed_scale)
-        scale_inv.copy_(1.0 / computed_scale)
+        scale_inv.copy_(1.0 / computed_scale)
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py b/transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py
index 7ae420e7f3..0aebdae2fe 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/rmsnorm.py
@@ -43,7 +43,6 @@ def rmsnorm_bwd_torch(
     gamma,
     sm_margin,
     zero_centered_gamma,
-    eps,
 ):
     inv_rms = rsigma.unsqueeze(-1)
 
diff --git a/transformer_engine/plugin/core/backends/reference/reference.py b/transformer_engine/plugin/core/backends/reference/reference.py
index 3f29cf89be..80c7b327f0 100644
--- a/transformer_engine/plugin/core/backends/reference/reference.py
+++ b/transformer_engine/plugin/core/backends/reference/reference.py
@@ -3,11 +3,9 @@
 # See LICENSE for license information.
 
 import os
-from typing import Any, Dict, List, Optional, Tuple, Union
-
+from typing import Any, List, Optional, Tuple
 import torch
-
-from ...ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
+from ...ops import *
 
 from .impl import (
     general_gemm_torch,
@@ -33,6 +31,7 @@
     multi_tensor_sgd_torch,
 )
 
+
 class ReferenceBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -41,11 +40,7 @@ def check_available() -> bool:
     def is_available(self) -> bool:
         return True
 
-    def get_flash_attention_class(self):
-        from .flash_attention import FlashAttentionTorch
-        return FlashAttentionTorch
-
-    def get_attention_backend(self, attention_params=None):
+    def get_attention_backend(self, _attention_params=None):
         from packaging.version import Version as PkgVersion
         from ...logger_manager import get_logger
         logger = get_logger()
@@ -79,15 +74,15 @@ def get_attention_backend(self, attention_params=None):
 
     def generic_gemm(
         self,
-        A: torch.Tensor,
+        A: Any,
         transA: bool,
-        B: torch.Tensor,
+        B: Any,
         transB: bool,
-        D: torch.Tensor,
+        D: Any,
         quantizer: Any,
-        output_dtype: torch.dtype,
+        output_dtype: Optional[DType],
         bias: Optional[torch.Tensor],
-        bias_type: Any,
+        bias_type: DType,
         gelu: bool,
         gelu_in: Optional[torch.Tensor],
         grad: bool,
@@ -96,49 +91,20 @@ def generic_gemm(
         accumulate: bool,
         use_split_accumulator: bool,
         comm_overlap: Optional[Any] = None,
-        comm_type: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
         extra_output: Optional[torch.Tensor] = None,
         bulk_overlap: bool = False,
         alpha: float = 1.0,
         beta: Optional[float] = None,
-    ) -> Any:
+    ) -> List[Any]:
         return general_gemm_torch(
-            A=A,
-            transA=transA,
-            B=B,
-            transB=transB,
-            D=D,
-            quantizer=quantizer,
-            output_dtype=output_dtype,
-            bias=bias,
-            bias_type=bias_type,
-            gelu=gelu,
-            gelu_in=gelu_in,
-            grad=grad,
-            workspace=workspace,
-            workspace_size=workspace_size,
-            accumulate=accumulate,
-            use_split_accumulator=use_split_accumulator,
-            comm_overlap=comm_overlap,
-            comm_type=comm_type,
-            extra_output=extra_output,
-            bulk_overlap=bulk_overlap,
-            alpha=alpha,
-            beta=beta,
+            A, transA, B, transB, D, quantizer, output_dtype,
+            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
+            accumulate, use_split_accumulator, comm_overlap, comm_type,
+            extra_output, bulk_overlap, alpha, beta
         )
 
-    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("te_general_grouped_gemm - not implemented in reference backend")
-
-    def quantize(self, tensor: torch.Tensor, quantizer: Any, output: Optional[torch.Tensor] = None, noop: Optional[torch.Tensor] = None) -> Any:
-        raise NotImplementedError("quantize - not implemented in reference backend")
-
-    def dequantize(self, input: torch.Tensor, otype: torch.dtype) -> torch.Tensor:
-        raise NotImplementedError("dequantize - not implemented in reference backend")
-
-    def bgrad_quantize(self, input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
-        raise NotImplementedError("bgrad_quantize - not implemented in reference backend")
-
+    # GELU and variants
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         return gelu_torch(input, quantizer)
 
@@ -151,6 +117,7 @@ def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         return qgeglu_torch(input, quantizer)
 
+    # ReLU and variants
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         return relu_torch(input, quantizer)
 
@@ -163,15 +130,23 @@ def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         return sreglu_torch(input, quantizer)
 
+    # SwiGLU and variants
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         return silu_torch(input, quantizer)
 
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         return swiglu_torch(input, quantizer)
 
-    def clamped_swiglu(self, input: torch.Tensor, quantizer: Any, limit: float = 7.0, alpha: float = 1.702) -> Any:
+    def clamped_swiglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
         return clamped_swiglu_torch(input, quantizer, limit, alpha)
 
+    # Backward of GELU and variants
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         return dgelu_torch(grad, fwd_input, quantizer)
 
@@ -184,6 +159,7 @@ def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         return dqgeglu_torch(grad, fwd_input, quantizer)
 
+    # Backward of ReLU and variants
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         return drelu_torch(grad, fwd_input, quantizer)
 
@@ -196,42 +172,77 @@ def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         return dsreglu_torch(grad, fwd_input, quantizer)
 
+    # Backward of SiLU and variants
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         return dsilu_torch(grad, fwd_input, quantizer)
 
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         return dswiglu_torch(grad, fwd_input, quantizer)
 
-    def clamped_dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any, limit: float = 7.0, alpha: float = 1.702) -> Any:
+    def clamped_dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
         return clamped_dswiglu_torch(grad, fwd_input, quantizer, limit, alpha)
 
-    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    # DBias + DAct fusions
+    def dbias_dgelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
         return dbias_dgelu_torch(grad, fwd_input, quantizer)
 
-    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsilu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
         return dbias_dsilu_torch(grad, fwd_input, quantizer)
 
-    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_drelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Tuple[torch.Tensor, Any]:
         return dbias_drelu_torch(grad, fwd_input, quantizer)
 
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dqgelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
         return dbias_dqgelu_torch(grad, fwd_input, quantizer)
 
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsrelu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
         return dbias_dsrelu_torch(grad, fwd_input, quantizer)
 
+    # LayerNorm
     def layernorm_fwd(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         bias: Optional[torch.Tensor],
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> List[Any]:
         return layernorm_fwd_torch(
             input=input,
             weight=weight,
@@ -246,16 +257,16 @@ def layernorm_fwd(
 
     def layernorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         mu: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         return layernorm_bwd_torch(
-            dy=dy,
+            dy=dz,
             x=x,
             mu=mu,
             rsigma=rsigma,
@@ -264,17 +275,18 @@ def layernorm_bwd(
             zero_centered_gamma=zero_centered_gamma,
         )
 
+    # RMSNorm
     def rmsnorm_fwd(
         self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
+        input: Any,
+        weight: Any,
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> List[Any]:
         return rmsnorm_fwd_torch(
             input=input,
             weight=weight,
@@ -288,153 +300,126 @@ def rmsnorm_fwd(
 
     def rmsnorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-        eps: float = 1e-5,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         return rmsnorm_bwd_torch(
-            dy=dy,
+            dy=dz,
             x=x,
             rsigma=rsigma,
             gamma=gamma,
             sm_margin=sm_margin,
             zero_centered_gamma=zero_centered_gamma,
-            eps=eps,
         )
 
-    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("rmsnorm_bwd_add - not implemented in reference backend")
-
-    def multi_tensor_quantize(self, tensor_list: List[torch.Tensor], quantizer_list: List[Any]) -> List[Any]:
-        raise NotImplementedError("multi_tensor_quantize - not implemented in reference backend")
-
-    def split_quantize(self, tensor: torch.Tensor, split_sections: List[int], quantizer_list: List[Any]) -> List[Any]:
-        raise NotImplementedError("split_quantize - not implemented in reference backend")
-
-    def moe_permute_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("moe_permute_fwd - not implemented in reference backend")
-
-    def moe_permute_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("moe_permute_bwd - not implemented in reference backend")
-
-    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("moe_unpermute_fwd - not implemented in reference backend")
-
-    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("moe_unpermute_bwd - not implemented in reference backend")
-
-    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+    # Softmax functions
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
         return scaled_softmax_forward_torch(input, scale)
 
-    def scaled_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
-        return scaled_softmax_backward_torch(output_grad, softmax_output, scale)
-
-    def scaled_masked_softmax_forward(self, input: torch.Tensor, mask: torch.Tensor, scale: float) -> torch.Tensor:
-        return scaled_masked_softmax_forward_torch(input, mask, scale)
+    def scaled_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_softmax_backward_torch(output_grad_, softmax_results_, scale_factor)
 
-    def scaled_masked_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
-        return scaled_masked_softmax_backward_torch(output_grad, softmax_output, scale)
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_masked_softmax_forward_torch(input, mask, scale_factor)
 
-    def scaled_upper_triang_masked_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
-        return scaled_upper_triang_masked_softmax_forward_torch(input, scale)
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_masked_softmax_backward_torch(output_grad_, softmax_results_, scale_factor)
 
-    def scaled_upper_triang_masked_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
-        return scaled_upper_triang_masked_softmax_backward_torch(output_grad, softmax_output, scale)
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_upper_triang_masked_softmax_forward_torch(input, scale_factor)
 
-    def scaled_aligned_causal_masked_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
-        return scaled_aligned_causal_masked_softmax_forward_torch(input, scale)
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grads_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_upper_triang_masked_softmax_backward_torch(output_grads_, softmax_results_, scale_factor)
 
-    def scaled_aligned_causal_masked_softmax_backward(self, output_grad: torch.Tensor, softmax_output: torch.Tensor, scale: float) -> torch.Tensor:
-        return scaled_aligned_causal_masked_softmax_backward_torch(output_grad, softmax_output, scale)
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_aligned_causal_masked_softmax_forward_torch(input, scale_factor)
 
-    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_aligned_causal_masked_softmax_backward_torch(output_grad_, softmax_results_, scale_factor)
+
+    # Fused attention backend
+    def get_fused_attn_backend(
+        self,
+        _is_training: bool,
+        _q_dtype: DType,
+        _kv_dtype: DType,
+        _qkv_layout: NVTE_QKV_Layout,
+        _bias_type: NVTE_Bias_Type,
+        _attn_mask_type: NVTE_Mask_Type,
+        _softmax_type: NVTE_Softmax_Type,
+        _p_dropout: float,
+        _num_attn_heads: int,
+        _num_gqa_groups: int,
+        _max_seqlen_q: int,
+        _max_seqlen_kv: int,
+        _head_dim_qk: int,
+        _head_dim_v: int,
+        _window_size_left: int,
+        _window_size_right: int,
+        _return_max_logit: bool,
+    ) -> NVTE_Fused_Attn_Backend:
         return NVTE_Fused_Attn_Backend.NVTE_No_Backend
 
-    def fused_attn_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_attn_fwd - not implemented in reference backend")
-
-    def fused_attn_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_attn_bwd - not implemented in reference backend")
-
-    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fa_prepare_fwd - not implemented in reference backend")
-
-    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fa_prepare_bwd - not implemented in reference backend")
-
-    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("copy_to_kv_cache - not implemented in reference backend")
-
-    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("convert_thd_to_bshd - not implemented in reference backend")
-
-    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("convert_bshd_to_thd - not implemented in reference backend")
-
-    def fused_rope_forward(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_rope_forward - not implemented in reference backend")
-
-    def fused_rope_backward(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_rope_backward - not implemented in reference backend")
-
-    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_qkv_rope_forward - not implemented in reference backend")
-
-    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_qkv_rope_backward - not implemented in reference backend")
-
-    def fused_topk_with_score_function_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_topk_with_score_function_fwd - not implemented in reference backend")
-
-    def fused_topk_with_score_function_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_topk_with_score_function_bwd - not implemented in reference backend")
-
-    def fused_score_for_moe_aux_loss_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_score_for_moe_aux_loss_fwd - not implemented in reference backend")
-
-    def fused_score_for_moe_aux_loss_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_score_for_moe_aux_loss_bwd - not implemented in reference backend")
-
-    def fused_moe_aux_loss_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_moe_aux_loss_fwd - not implemented in reference backend")
-
-    def fused_moe_aux_loss_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_moe_aux_loss_bwd - not implemented in reference backend")
-
-    def dropout_fwd(self, input: torch.Tensor, dropout_probability: float, out: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Dropout
+    def dropout_fwd(
+        self,
+        input: torch.Tensor,
+        dropout_probability: float,
+        out: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         return dropout_fwd_torch(input, dropout_probability, out)
 
-    def dropout_bwd(self, grad_output: torch.Tensor, mask: torch.Tensor, dropout_probability: float, grad_input: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def dropout_bwd(
+        self,
+        grad_output: torch.Tensor,
+        mask: torch.Tensor,
+        dropout_probability: float,
+        grad_input: Optional[torch.Tensor],
+    ) -> torch.Tensor:
         return dropout_bwd_torch(grad_output, mask, dropout_probability, grad_input)
 
-    def fp8_transpose(self, input: torch.Tensor, dtype: Any, *, out: torch.Tensor) -> None:
-        raise NotImplementedError("fp8_transpose - not implemented in reference backend")
-
-    def swap_first_dims(self, tensor: torch.Tensor, *, out: torch.Tensor) -> None:
-        raise NotImplementedError("swap_first_dims - not implemented in reference backend")
-
-    def compute_amax(self, input: torch.Tensor, amax: torch.Tensor) -> None:
-        raise NotImplementedError("compute_amax - not implemented in reference backend")
-
-    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
-        raise NotImplementedError("fused_amax_and_scale_update_after_reduction - not implemented in reference backend")
-
-    def fp8_block_scaling_compute_partial_amax(self, *args, **kwargs) -> None:
-        raise NotImplementedError("fp8_block_scaling_compute_partial_amax - not implemented in reference backend")
-
-    def fp8_block_scaling_partial_cast(self, *args, **kwargs) -> None:
-        raise NotImplementedError("fp8_block_scaling_partial_cast - not implemented in reference backend")
-
-    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_multi_row_padding - not implemented in reference backend")
-
-    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_multi_row_unpadding - not implemented in reference backend")
-
+    # Misc
     def get_cublasLt_version(self) -> int:
         return 0
 
@@ -444,100 +429,101 @@ def get_cudnn_version(self) -> int:
     def get_num_cublas_streams(self) -> int:
         return 0
 
-    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("thd_read_half_tensor - not implemented in reference backend")
-
-    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("thd_second_half_lse_correction - not implemented in reference backend")
-
-    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("thd_read_second_half_lse - not implemented in reference backend")
-
-    def thd_out_correction(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("thd_out_correction - not implemented in reference backend")
-
-    def thd_grad_correction(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("thd_grad_correction - not implemented in reference backend")
-
-    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("thd_get_partitioned_indices - not implemented in reference backend")
-
-    def init_nvshmem_backend(self, *args, **kwargs) -> None:
-        raise NotImplementedError("init_nvshmem_backend - not implemented in reference backend")
-
-    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
-        raise NotImplementedError("create_nvshmem_tensor - not implemented in reference backend")
-
-    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
-        raise NotImplementedError("nvshmem_send_on_current_stream - not implemented in reference backend")
-
-    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
-        raise NotImplementedError("nvshmem_wait_on_current_stream - not implemented in reference backend")
-
-    def nvshmem_finalize(self) -> None:
-        raise NotImplementedError("nvshmem_finalize - not implemented in reference backend")
-
-    def multi_tensor_scale(self, chunk_size: int, noop_flag: torch.Tensor, tensor_lists: List[List[torch.Tensor]], scale: float) -> None:
+    # Multi-tensor functions
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
         return multi_tensor_scale_torch(chunk_size, noop_flag, tensor_lists, scale)
 
-    def multi_tensor_l2norm(self, chunk_size: int, noop_flag: torch.Tensor, tensor_lists: List[List[torch.Tensor]], per_tensor: bool = False) -> Union[torch.Tensor, List[torch.Tensor]]:
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         return multi_tensor_l2norm_torch(chunk_size, noop_flag, tensor_lists, per_tensor)
 
-    def multi_tensor_unscale_l2norm(self, chunk_size: int, noop_flag: torch.Tensor, tensor_lists: List[List[torch.Tensor]], scale: torch.Tensor, per_tensor: bool = False) -> Union[torch.Tensor, List[torch.Tensor]]:
-        """Compute L2 norm after unscaling.
-
-        Note: scale parameter is actually inv_scale (1/loss_scale).
-        Unscaling means multiplying by inv_scale (= dividing by loss_scale).
-        """
+    def multi_tensor_unscale_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        inv_scale: torch.Tensor,
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         if noop_flag.item() != 0:
-            if per_tensor:
-                return [torch.tensor(0.0, device=t.device) for t in tensor_lists[0]]
-            else:
-                return torch.tensor(0.0, device=tensor_lists[0][0].device)
+            device = tensor_lists[0][0].device if tensor_lists and tensor_lists[0] else 'cpu'
+            return torch.tensor(0.0, device=device), torch.tensor(0.0, device=device)
 
-        # Multiply by inv_scale (scale parameter is actually inverse scale)
+        # Multiply by inv_scale
         unscaled_tensors = []
         for tensor in tensor_lists[0]:
-            unscaled_tensors.append(tensor * scale.item())
+            unscaled_tensors.append(tensor * inv_scale.item())
 
         return multi_tensor_l2norm_torch(chunk_size, noop_flag, [unscaled_tensors], per_tensor)
 
-    def multi_tensor_adam(self, *args, **kwargs):
-        if not args and not kwargs:
-            return multi_tensor_adam_torch
-        return multi_tensor_adam_torch(*args, **kwargs)
-
-    def multi_tensor_adam_param_remainder(self, *args, **kwargs):
-        if not args and not kwargs:
-            return multi_tensor_adam_param_remainder_torch
-        return multi_tensor_adam_param_remainder_torch(*args, **kwargs)
-
-    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
-        raise NotImplementedError("multi_tensor_adam_fp8 - not implemented in reference backend")
-
-    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
-        raise NotImplementedError("multi_tensor_adam_capturable - not implemented in reference backend")
-
-    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
-        raise NotImplementedError("multi_tensor_adam_capturable_master - not implemented in reference backend")
-
-    def multi_tensor_sgd(self, *args, **kwargs) -> None:
-        return multi_tensor_sgd_torch(*args, **kwargs)
-
-    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
-        raise NotImplementedError("multi_tensor_compute_scale_and_scale_inv - not implemented in reference backend")
-
-    def bulk_overlap_ag_with_external_gemm(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("bulk_overlap_ag_with_external_gemm - not implemented in reference backend")
-
-    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
-        return FP8TensorMeta()
+    def multi_tensor_adam(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
+        return multi_tensor_adam_torch(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon, step, mode, bias_correction, weight_decay
+        )
 
-    def create_comm_overlap_helper(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("create_comm_overlap_helper - not implemented in reference backend")
+    def multi_tensor_adam_param_remainder(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
+        return multi_tensor_adam_param_remainder_torch(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon, step, mode, bias_correction, weight_decay
+        )
 
-    def create_comm_overlap(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("create_comm_overlap - not implemented in reference backend")
+    def multi_tensor_sgd(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        wd: float,
+        momentum: float,
+        dampening: float,
+        lr: float,
+        nesterov: bool,
+        first_run: bool,
+        wd_after_momentum: bool,
+        scale: float,
+    ) -> None:
+        return multi_tensor_sgd_torch(
+            chunk_size, noop_flag, tensor_lists,
+            wd, momentum, dampening, lr, nesterov, first_run, wd_after_momentum, scale
+        )
 
-    def create_comm_overlap_p2p(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("create_comm_overlap_p2p - not implemented in reference backend")
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionTorch
+        return FlashAttentionTorch
diff --git a/transformer_engine/plugin/core/backends/reference/register_ops.py b/transformer_engine/plugin/core/backends/reference/register_ops.py
index 3d311a6c75..9ecbf10974 100644
--- a/transformer_engine/plugin/core/backends/reference/register_ops.py
+++ b/transformer_engine/plugin/core/backends/reference/register_ops.py
@@ -43,20 +43,11 @@ def register_builtins(registry) -> None:
         # Normalization
         OpImpl(op_name="rmsnorm_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor=None, priority=50),
         OpImpl(op_name="rmsnorm_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="rmsnorm_bwd_add", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor=None, priority=50),
         OpImpl(op_name="layernorm_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor=None, priority=50),
         OpImpl(op_name="layernorm_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor=None, priority=50),
 
         # GEMM
         OpImpl(op_name="generic_gemm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="te_general_grouped_gemm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor=None, priority=50),
-
-        # Quantization
-        OpImpl(op_name="quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.quantize, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dequantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dequantize, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="bgrad_quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="split_quantize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.split_quantize, is_avail), vendor=None, priority=50),
 
         # Activations - Forward
         OpImpl(op_name="gelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.gelu, is_avail), vendor=None, priority=50),
@@ -101,94 +92,25 @@ def register_builtins(registry) -> None:
         OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor=None, priority=50),
         OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor=None, priority=50),
 
-        # MOE operations
-        OpImpl(op_name="moe_permute_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="moe_permute_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="moe_unpermute_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="moe_unpermute_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor=None, priority=50),
-
-        # Fused attention
+        # Fused attention backend getter
         OpImpl(op_name="get_fused_attn_backend", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_attn_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_attn_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fa_prepare_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fa_prepare_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor=None, priority=50),
-
-        # KV cache
-        OpImpl(op_name="copy_to_kv_cache", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor=None, priority=50),
-
-        # Tensor format conversions
-        OpImpl(op_name="convert_thd_to_bshd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="convert_bshd_to_thd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor=None, priority=50),
-
-        # RoPE (Rotary Position Embedding)
-        OpImpl(op_name="fused_rope_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_rope_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_qkv_rope_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_qkv_rope_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor=None, priority=50),
-
-        # TopK and MOE aux loss
-        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor=None, priority=50),
 
         # Dropout
         OpImpl(op_name="dropout_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor=None, priority=50),
         OpImpl(op_name="dropout_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor=None, priority=50),
 
-        # FP8 operations
-        OpImpl(op_name="fp8_transpose", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="swap_first_dims", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="compute_amax", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.compute_amax, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor=None, priority=50),
-
-        # Padding operations
-        OpImpl(op_name="fused_multi_row_padding", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="fused_multi_row_unpadding", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor=None, priority=50),
-
         # Library version getters
         OpImpl(op_name="get_cublasLt_version", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor=None, priority=50),
         OpImpl(op_name="get_cudnn_version", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor=None, priority=50),
         OpImpl(op_name="get_num_cublas_streams", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor=None, priority=50),
 
-        # THD (Tensor, Hidden, Dimension) operations
-        OpImpl(op_name="thd_read_half_tensor", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="thd_second_half_lse_correction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="thd_read_second_half_lse", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="thd_out_correction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="thd_grad_correction", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="thd_get_partitioned_indices", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor=None, priority=50),
-
-        # NVSHMEM operations
-        OpImpl(op_name="init_nvshmem_backend", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="create_nvshmem_tensor", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="nvshmem_finalize", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor=None, priority=50),
-
         # Multi-tensor optimizer operations
         OpImpl(op_name="multi_tensor_scale", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor=None, priority=50),
         OpImpl(op_name="multi_tensor_l2norm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor=None, priority=50),
         OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor=None, priority=50),
         OpImpl(op_name="multi_tensor_adam", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor=None, priority=50),
         OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor=None, priority=50),
         OpImpl(op_name="multi_tensor_sgd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor=None, priority=50),
-
-        # Communication overlap operations
-        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="create_fp8_tensor_meta", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="create_comm_overlap_helper", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="create_comm_overlap", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="create_comm_overlap_p2p", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor=None, priority=50),
 
         # FlashAttention class getter
         OpImpl(op_name="get_flash_attention_class", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor=None, priority=50),
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 98ef965811..8be7dd5052 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -1,12 +1,11 @@
 # Copyright (c) 2025, BAAI. All rights reserved.
 #
 # See LICENSE for license information.
-
+import os
+import sys
 from typing import Any, Dict, List, Optional, Tuple, Union
-
 import torch
-
-from ....ops import TEFLBackendBase, FP8TensorMeta
+from ....ops import *
 
 def _load_cuda_libs():
     import ctypes
@@ -115,70 +114,6 @@ def _get_tex():
     import transformer_engine_torch_nv
     return transformer_engine_torch_nv
 
-def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
-    if torch_dtype is None:
-        return None
-
-    NativeDType = tex_module.DType
-    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_torch_nv':
-        return torch_dtype
-
-    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
-        from transformer_engine.plugin.core.ops import DType as PyDType
-        if isinstance(torch_dtype, PyDType):
-            dtype_name = torch_dtype.name
-            if hasattr(NativeDType, dtype_name):
-                return getattr(NativeDType, dtype_name)
-
-    dtype_map = {
-        torch.float32: NativeDType.kFloat32,
-        torch.float16: NativeDType.kFloat16,
-        torch.bfloat16: NativeDType.kBFloat16,
-        torch.int32: NativeDType.kInt32,
-        torch.uint8: NativeDType.kByte,
-    }
-
-    if hasattr(torch, 'float8_e4m3fn'):
-        dtype_map[torch.float8_e4m3fn] = NativeDType.kFloat8E4M3
-    if hasattr(torch, 'float8_e5m2'):
-        dtype_map[torch.float8_e5m2] = NativeDType.kFloat8E5M2
-
-    return dtype_map.get(torch_dtype, torch_dtype)
-
-def _convert_dtype_params(func):
-    import functools
-    import inspect
-    import os
-
-    @functools.wraps(func)
-    def wrapper(self, *args, **kwargs):
-        dtype_params = ['otype', 'output_dtype', 'bias_type']
-
-        from transformer_engine.plugin.core.ops import DType as PyDType
-
-        def needs_conversion(val):
-            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
-
-        for param_name in dtype_params:
-            if param_name in kwargs:
-                value = kwargs[param_name]
-                if needs_conversion(value):
-                    converted = self._to_te_dtype(value)
-                    kwargs[param_name] = converted
-
-        sig = inspect.signature(func)
-        param_names = list(sig.parameters.keys())[1:]
-
-        args_list = list(args)
-        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
-            if param_name in dtype_params and needs_conversion(arg_value):
-                converted = self._to_te_dtype(arg_value)
-                args_list[i] = converted
-
-        return func(self, *args_list, **kwargs)
-
-    return wrapper
-
 class CUDABackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -192,16 +127,9 @@ def _get_tex(self):
             self._tex = _get_tex()
         return self._tex
 
-    def _to_te_dtype(self, torch_dtype):
-        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
-
     def is_available(self) -> bool:
         return _check_cuda_available()
 
-    def get_flash_attention_class(self):
-        from .flash_attention import FlashAttentionCUDA
-        return FlashAttentionCUDA
-
     def get_attention_backend(self, attention_params=None):
         """
         CUDA backend uses the default attention backend selection logic.
@@ -214,6 +142,7 @@ def get_attention_backend(self, attention_params=None):
         from transformer_engine.pytorch.attention.dot_product_attention import utils as dpa_utils
         return dpa_utils._original_get_attention_backend(attention_params)
 
+##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -224,35 +153,34 @@ def quantize(
         tex = self._get_tex()
         return tex.quantize(tensor, quantizer, output, noop)
 
-    @_convert_dtype_params
     def dequantize(
         self,
-        input: torch.Tensor,
-        otype: torch.dtype,
-    ) -> torch.Tensor:
+        input: Any,
+        otype: DType,
+    ) -> Any:
         tex = self._get_tex()
+        otype = tex.DType(int(otype)) if otype is not None else None
         return tex.dequantize(input, otype)
 
     def bgrad_quantize(
         self,
         input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.bgrad_quantize(input, quantizer)
 
-    @_convert_dtype_params
     def generic_gemm(
         self,
-        A: torch.Tensor,
+        A: Any,
         transA: bool,
-        B: torch.Tensor,
+        B: Any,
         transB: bool,
-        D: torch.Tensor,
+        D: Any,
         quantizer: Any,
-        output_dtype: torch.dtype,
+        output_dtype: Optional[DType],
         bias: Optional[torch.Tensor],
-        bias_type: Any,
+        bias_type: DType,
         gelu: bool,
         gelu_in: Optional[torch.Tensor],
         grad: bool,
@@ -261,61 +189,53 @@ def generic_gemm(
         accumulate: bool,
         use_split_accumulator: bool,
         comm_overlap: Optional[Any] = None,
-        comm_type: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
         extra_output: Optional[torch.Tensor] = None,
         bulk_overlap: bool = False,
         alpha: float = 1.0,
         beta: Optional[float] = None,
-    ) -> Any:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        if bias_type is None:
-            bias_type = self._to_te_dtype(torch.bfloat16)
-
+        
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
+        output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
             A, transA, B, transB, D, quantizer, output_dtype,
             bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
             accumulate, use_split_accumulator, comm_overlap, comm_type,
             extra_output, bulk_overlap, alpha, beta
         )
-
-    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.te_general_grouped_gemm(*args, **kwargs)
-
+    # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
-
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
-
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
+    # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
-
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
-
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
-
+    # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
-
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
@@ -328,42 +248,39 @@ def clamped_swiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
-
+    # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
-
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
-
+    # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
-
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
-
+    # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
-
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -374,131 +291,207 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
-
-    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    # DBias + DAct fusions #
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
-
-    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-
-    @_convert_dtype_params
+    # Permutation functions                                                                                                                                                                                                                                                                                             
+    def moe_permute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        indices: torch.Tensor,
+        num_out_tokens: int,
+        workspace: List[torch.Tensor],
+        max_expanded_token_num: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+    def moe_permute_bwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_bwd(
+        self,
+        input_bwd: torch.Tensor,
+        input_fwd: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+    # Softmax functions
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+    def scaled_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grads_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(
+            output_grads_, softmax_results_, scale_factor
+        )
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(
+            output_grad_, softmax_results_, scale_factor
+        )
+    # Other granular functions
     def layernorm_fwd(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         bias: Optional[torch.Tensor],
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, mu, rsigma = tex.layernorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-        return y, mu, rsigma
-
     def layernorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         mu: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dgamma, dbeta
-
-    @_convert_dtype_params
+        return tex.layernorm_bwd(
+            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
+        )
     def rmsnorm_fwd(
         self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
+        input: Any,
+        weight: Any,
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, y_quant, rsigma = tex.rmsnorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-            if y_quant is not None:
-                y_quant = y_quant.view(*orig_shape)
-        return y, y_quant, rsigma
-
     def rmsnorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-        eps: float = 1e-5,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dw
-
-    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+    def rmsnorm_bwd_add(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        add: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.rmsnorm_bwd_add(*args, **kwargs)
+        return tex.rmsnorm_bwd_add(dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma)
 
     def multi_tensor_quantize(
         self,
@@ -507,7 +500,6 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
-
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -516,246 +508,457 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
-
-    def moe_permute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_permute_fwd(*args, **kwargs)
-
-    def moe_permute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_permute_bwd(*args, **kwargs)
-
-    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_unpermute_fwd(*args, **kwargs)
-
-    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_unpermute_bwd(*args, **kwargs)
-
-    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
-        tex = self._get_tex()
-        return tex.scaled_softmax_forward(input, scale)
-
-    def scaled_softmax_backward(
+    def te_general_grouped_gemm(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
-        tex = self._get_tex()
-        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
-
-    def scaled_masked_softmax_forward(
+        A: List[Any],
+        transa: bool,
+        B: List[Any],
+        transb: bool,
+        D: Optional[List[torch.Tensor]],
+        D_type: DType,
+        m_splits: List[int],
+        bias: List[torch.Tensor],
+        bias_type: DType,
+        single_output: bool,
+        pre_gelu_out: List[torch.Tensor],
+        grad: bool,
+        workspace: List[torch.Tensor],
+        workspaceSizes: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        math_sm_count: int,
+    ) -> Optional[List[torch.Tensor]]:
+        tex = self._get_tex()
+        D_type = tex.DType(int(D_type)) if D_type is not None else None
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        return tex.te_general_grouped_gemm(
+            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
+            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
+            accumulate, use_split_accumulator, math_sm_count
+        )
+    def fp8_transpose(
         self,
         input: torch.Tensor,
-        mask: torch.Tensor,
-        scale: float,
+        dtype: DType,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_forward(input, mask, scale)
-
-    def scaled_masked_softmax_backward(
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.fp8_transpose(input, dtype, out)
+    def swap_first_dims(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
+        tensor: torch.Tensor,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
+        return tex.swap_first_dims(tensor, out)
+    def get_fused_attn_backend(
+        self,
+        is_training: bool,
+        q_dtype: DType,
+        kv_dtype: DType,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        p_dropout: float,
+        num_attn_heads: int,
+        num_gqa_groups: int,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        head_dim_qk: int,
+        head_dim_v: int,
+        window_size_left: int,
+        window_size_right: int,
+        return_max_logit: bool,
+    ) -> NVTE_Fused_Attn_Backend:
+        tex = self._get_tex()
+
+        q_dtype = tex.DType(int(q_dtype)) if q_dtype is not None else None
+        kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        result = tex.get_fused_attn_backend(
+            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
+            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
+            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
+            head_dim_v, window_size_left, window_size_right, return_max_logit
+        )
+        return NVTE_Fused_Attn_Backend(result)
 
-    def scaled_upper_triang_masked_softmax_forward(
+    def compute_amax(
         self,
         input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
-
-    def scaled_upper_triang_masked_softmax_backward(
+        return tex.compute_amax(input, amax)
+    def fused_amax_and_scale_update_after_reduction(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax_reduction_buffer: torch.Tensor,
+        amax_histories: List[torch.Tensor],
+        scales: List[torch.Tensor],
+        amax_compute_algo: str,
+        fp8_dtype: DType,
+        margin: float,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)
-
-    def scaled_aligned_causal_masked_softmax_forward(
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.fused_amax_and_scale_update_after_reduction(
+            amax_reduction_buffer, amax_histories, scales,
+            amax_compute_algo, fp8_dtype, margin
+        )
+    def fp8_block_scaling_compute_partial_amax(
         self,
-        input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
-
-    def scaled_aligned_causal_masked_softmax_backward(
+        return tex.fp8_block_scaling_compute_partial_amax(
+            tensor, amax, h, w, start_offset, block_len
+        )
+    def fp8_block_scaling_partial_cast(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
-
-    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        out_dtype = tex.DType(int(out_dtype)) if out_dtype is not None else None
+        return tex.fp8_block_scaling_partial_cast(
+            inp, out, scale, h, w, start_offset, block_len, out_dtype
+        )
+    def fused_multi_row_padding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        padded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
-
-        args_list = list(args)
-
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-
-            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
-                return py_enum
-
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-
-            if hasattr(py_enum, 'value'):
-                enum_value = int(py_enum.value)
-                for member_name in dir(native_enum_class):
-                    if not member_name.startswith('_'):
-                        try:
-                            member = getattr(native_enum_class, member_name)
-                            if hasattr(member, 'value') and int(member.value) == enum_value:
-                                return member
-                        except:
-                            pass
-
-            if hasattr(py_enum, 'value'):
-                return int(py_enum.value)
-
-            return py_enum
-
-        if len(args) > 1:
-            args_list[1] = self._to_te_dtype(args[1])
-        if len(args) > 2:
-            args_list[2] = self._to_te_dtype(args[2])
-        if len(args) > 3:
-            args_list[3] = convert_enum(args[3], tex.NVTE_QKV_Layout)
-        if len(args) > 4:
-            args_list[4] = convert_enum(args[4], tex.NVTE_Bias_Type)
-        if len(args) > 5:
-            args_list[5] = convert_enum(args[5], tex.NVTE_Mask_Type)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_Softmax_Type)
-
-        return tex.get_fused_attn_backend(*args_list, **kwargs)
-
-    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        return tex.fused_multi_row_padding(
+            input, output, input_row_list, padded_input_row_list
+        )
+    def fused_multi_row_unpadding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        unpadded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(
+            input, output, input_row_list, unpadded_input_row_list
+        )
 
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
-                return py_enum
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-            return py_enum
-
-        args_list = list(args)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_QKV_Layout)
-        if len(args) > 7:
-            args_list[7] = convert_enum(args[7], tex.NVTE_Bias_Type)
-        if len(args) > 8:
-            args_list[8] = convert_enum(args[8], tex.NVTE_Mask_Type)
-        if len(args) > 9:
-            args_list[9] = convert_enum(args[9], tex.NVTE_Softmax_Type)
-
-        return tex.fused_attn_fwd(*args_list, **kwargs)
-
-    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+    # attention kernels
+    def fa_prepare_fwd(
+        self,
+        qkvi: torch.Tensor,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
-                return py_enum
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-            return py_enum
-
-        args_list = list(args)
-        if len(args) > 5:
-            args_list[5] = convert_enum(args[5], tex.NVTE_QKV_Layout)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_Bias_Type)
-        if len(args) > 7:
-            args_list[7] = convert_enum(args[7], tex.NVTE_Mask_Type)
-        if len(args) > 8:
-            args_list[8] = convert_enum(args[8], tex.NVTE_Softmax_Type)
-        if len(args) > 19:
-            args_list[19] = self._to_te_dtype(args[19])
-
-        if 'dqkv_dtype' in kwargs:
-            kwargs['dqkv_dtype'] = self._to_te_dtype(kwargs['dqkv_dtype'])
-
-        return tex.fused_attn_bwd(*args_list, **kwargs)
-
-    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        return tex.fa_prepare_fwd(qkvi)
+    def fa_prepare_bwd(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fa_prepare_fwd(*args, **kwargs)
-
-    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        return tex.fa_prepare_bwd(q, k, v)
+    def fused_attn_fwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        is_training: bool,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        fake_dtype: torch.dtype,
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        page_table_k: Optional[torch.Tensor],
+        page_table_v: Optional[torch.Tensor],
+        s_quantizer: Any,
+        o_quantizer: Any,
+        Bias: Optional[torch.Tensor],
+        SoftmaxOffset: Optional[torch.Tensor],
+        rng_gen: Optional[torch.Generator],
+        rng_elts_per_thread: int,
+        return_max_logit: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.fa_prepare_bwd(*args, **kwargs)
 
-    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        return tex.fused_attn_fwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            is_training,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            fake_dtype,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            page_table_k,
+            page_table_v,
+            s_quantizer,
+            o_quantizer,
+            Bias,
+            SoftmaxOffset,
+            rng_gen,
+            rng_elts_per_thread,
+            return_max_logit
+        )
+    def fused_attn_bwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        deterministic: bool,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        O: Any,
+        dO: Any,
+        fake_dtype: torch.dtype,
+        dqkv_type: DType,
+        Aux_CTX_Tensors: List[torch.Tensor],
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        s_quantizer: Any,
+        dp_quantizer: Any,
+        dqkv_quantizer: Any,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.copy_to_kv_cache(*args, **kwargs)
 
-    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
+
+        return tex.fused_attn_bwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            deterministic,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            O,
+            dO,
+            fake_dtype,
+            dqkv_type,
+            Aux_CTX_Tensors,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            s_quantizer,
+            dp_quantizer,
+            dqkv_quantizer
+        )
+    def copy_to_kv_cache(
+        self,
+        new_k: torch.Tensor,
+        new_v: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        page_table: torch.Tensor,
+        cu_new_lens: torch.Tensor,
+        cu_cached_lens: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        b: int,
+        max_ctx_len: int,
+        max_seq_len: int,
+        max_pages_per_seq: int,
+        is_non_paged: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.convert_thd_to_bshd(*args, **kwargs)
-
-    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.copy_to_kv_cache(
+            new_k,
+            new_v,
+            k_cache,
+            v_cache,
+            page_table,
+            cu_new_lens,
+            cu_cached_lens,
+            qkv_format,
+            b,
+            max_ctx_len,
+            max_seq_len,
+            max_pages_per_seq,
+            is_non_paged
+        )
+    def convert_thd_to_bshd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        b: int,
+        max_seq_len: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.convert_bshd_to_thd(*args, **kwargs)
-
-    def fused_rope_forward(self, *args, **kwargs) -> Any:
+        return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+    def convert_bshd_to_thd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        t: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_forward(*args, **kwargs)
+        return tex.convert_bshd_to_thd(tensor, cu_seqlens, t)
 
-    def fused_rope_backward(self, *args, **kwargs) -> Any:
+    # fused apply rope
+    def fused_rope_forward(
+        self,
+        input: torch.Tensor,
+        freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_backward(*args, **kwargs)
-
-    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_forward(
+            input, freqs, start_positions, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_rope_backward(
+        self,
+        output_grads: torch.Tensor,
+        freqs: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_forward(*args, **kwargs)
-
-    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_backward(
+            output_grads, freqs, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_qkv_rope_forward(
+        self,
+        qkv_input: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_backward(*args, **kwargs)
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_forward(
+            qkv_input, q_freqs, k_freqs, start_positions,
+            qkv_split_arg_list, qkv_format, interleaved,
+            cp_size, cp_rank
+        )
+    def fused_qkv_rope_backward(
+        self,
+        q_grad_out: torch.Tensor,
+        k_grad_out: torch.Tensor,
+        v_grad_out: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_backward(
+            q_grad_out, k_grad_out, v_grad_out,
+            q_freqs, k_freqs, qkv_split_arg_list,
+            qkv_format, interleaved, cp_size, cp_rank
+        )
 
+    # fused router
     def fused_topk_with_score_function_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        num_groups: int,
-        group_topk: int,
-        scaling_factor: float,
-        score_function: Any,
+        num_groups: Optional[int],
+        group_topk: Optional[int],
+        scaling_factor: Optional[float],
+        score_function: str,
         expert_bias: Optional[torch.Tensor],
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_fwd(
-            logits, topk, use_pre_softmax, num_groups, group_topk,
-            scaling_factor, score_function, expert_bias
+            logits,
+            topk,
+            use_pre_softmax,
+            num_groups,
+            group_topk,
+            scaling_factor,
+            score_function,
+            expert_bias,
         )
-
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -765,24 +968,33 @@ def fused_topk_with_score_function_bwd(
         grad_probs: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        scaling_factor: float,
-        score_function: Any,
-    ) -> Any:
+        scaling_factor: Optional[float],
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_bwd(
-            num_tokens, num_experts, routing_map, intermediate_output,
-            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+            num_tokens,
+            num_experts,
+            routing_map,
+            intermediate_output,
+            grad_probs,
+            topk,
+            use_pre_softmax,
+            scaling_factor,
+            score_function,
         )
-
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
-
+        return tex.fused_score_for_moe_aux_loss_fwd(
+            logits,
+            topk,
+            score_function,
+        )
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -790,13 +1002,17 @@ def fused_score_for_moe_aux_loss_bwd(
         intermediate_output: torch.Tensor,
         grad_scores: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_score_for_moe_aux_loss_bwd(
-            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+            num_tokens,
+            num_experts,
+            intermediate_output,
+            grad_scores,
+            topk,
+            score_function,
         )
-
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -807,13 +1023,18 @@ def fused_moe_aux_loss_fwd(
         num_cols: int,
         topk: int,
         coeff: float,
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_moe_aux_loss_fwd(
-            probs, tokens_per_expert, total_num_tokens, num_experts,
-            num_rows, num_cols, topk, coeff
+            probs,
+            tokens_per_expert,
+            total_num_tokens,
+            num_experts,
+            num_rows,
+            num_cols,
+            topk,
+            coeff,
         )
-
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -821,152 +1042,146 @@ def fused_moe_aux_loss_bwd(
         num_rows: int,
         num_cols: int,
         grad_aux_loss: torch.Tensor,
-    ) -> Any:
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(
-            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
-        )
+        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
 
+    # Dropout
     def dropout_fwd(
         self,
         input: torch.Tensor,
         dropout_probability: float,
-        out: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
-
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
         mask: torch.Tensor,
         dropout_probability: float,
-        grad_input: Optional[torch.Tensor] = None,
+        grad_input: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
 
-    def fp8_transpose(
-        self,
-        input: torch.Tensor,
-        dtype: Any,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_transpose(input, dtype, out=out)
-
-    def swap_first_dims(
-        self,
-        tensor: torch.Tensor,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.swap_first_dims(tensor, out=out)
-
-    def compute_amax(
-        self,
-        input: torch.Tensor,
-        amax: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.compute_amax(input, amax)
-
-    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
-        tex = self._get_tex()
-        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
-
-    def fp8_block_scaling_compute_partial_amax(
-        self,
-        tensor: torch.Tensor,
-        amax: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
-
-    def fp8_block_scaling_partial_cast(
-        self,
-        inp: torch.Tensor,
-        out: torch.Tensor,
-        scale: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-        out_dtype: Any,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
-
-    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_padding(*args, **kwargs)
-
-    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(*args, **kwargs)
-
+    # Misc
     def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cublasLt_version()
-
     def get_cudnn_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cudnn_version()
-
     def get_num_cublas_streams(self) -> int:
         tex = self._get_tex()
         return tex.get_num_cublas_streams()
 
-    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+    # Support THD format for Context Parallel
+    def thd_read_half_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        half_idx: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_half_tensor(*args, **kwargs)
-
-    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+    def thd_second_half_lse_correction(
+        self,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(*args, **kwargs)
-
-    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        return tex.thd_second_half_lse_correction(
+            lse, lse_per_step, cu_seqlens, lse_packed
+        )
+    def thd_read_second_half_lse(
+        self,
+        lse: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+        second_half_lse_seqlen: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(*args, **kwargs)
-
-    def thd_out_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_read_second_half_lse(
+            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
+        )
+    def thd_out_correction(
+        self,
+        out: torch.Tensor,
+        out_per_step: torch.Tensor,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        only_second_half: bool,
+        lse_packed: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_out_correction(*args, **kwargs)
-
-    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_out_correction(
+            out, out_per_step, lse, lse_per_step,
+            cu_seqlens, only_second_half, lse_packed
+        )
+    def thd_grad_correction(
+        self,
+        grad: torch.Tensor,
+        grad_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        first_half: str,
+        second_half: str,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(*args, **kwargs)
-
-    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        return tex.thd_grad_correction(
+            grad, grad_per_step, cu_seqlens,
+            first_half, second_half
+        )
+    def thd_get_partitioned_indices(
+        self,
+        cu_seqlens: torch.Tensor,
+        total_tokens: int,
+        world_size: int,
+        rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(*args, **kwargs)
+        return tex.thd_get_partitioned_indices(
+            cu_seqlens, total_tokens, world_size, rank
+        )
 
-    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+    # nvshmem functions
+    def init_nvshmem_backend(
+        self,
+        process_group: Any,
+    ) -> None:
         tex = self._get_tex()
-        tex.init_nvshmem_backend(*args, **kwargs)
-
-    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        return tex.init_nvshmem_backend(process_group)
+    def create_nvshmem_tensor(
+        self,
+        shape: List[int],
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.create_nvshmem_tensor(*args, **kwargs)
-
-    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        return tex.create_nvshmem_tensor(shape, dtype)
+    def nvshmem_send_on_current_stream(
+        self,
+        src: torch.Tensor,
+        dst: torch.Tensor,
+        peer: int,
+        signal: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.nvshmem_send_on_current_stream(*args, **kwargs)
-
-    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+    def nvshmem_wait_on_current_stream(
+        self,
+        signal: torch.Tensor,
+        wait_kind: str,
+    ) -> None:
         tex = self._get_tex()
-        tex.nvshmem_wait_on_current_stream(*args, **kwargs)
-
+        return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
     def nvshmem_finalize(self) -> None:
         tex = self._get_tex()
-        tex.nvshmem_finalize()
+        return tex.nvshmem_finalize()
 
+    # multi-tensor functions
     def multi_tensor_scale(
         self,
         chunk_size: int,
@@ -975,98 +1190,195 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
-
+        return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
-
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        scale: torch.Tensor,
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        inv_scale: torch.Tensor,
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
-
+        return tex.multi_tensor_unscale_l2norm(
+            chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
+        )
     def multi_tensor_adam(
         self,
-        chunk_size: int = None,
-        noop_flag: torch.Tensor = None,
-        tensor_lists: List[List[torch.Tensor]] = None,
-        lr: float = None,
-        beta1: float = None,
-        beta2: float = None,
-        eps: float = None,
-        step: int = None,
-        mode: int = None,
-        bias_correction: int = None,
-        weight_decay: float = None,
-    ):
-        tex = self._get_tex()
-        if chunk_size is None:
-            return tex.multi_tensor_adam
-        tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
-            eps, step, mode, bias_correction, weight_decay
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
         )
-
-    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+    def multi_tensor_adam_param_remainder(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
-
-    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_param_remainder(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
+        )
+    def multi_tensor_adam_fp8(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        fp8_dtype: DType,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_fp8(*args, **kwargs)
-
-    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.multi_tensor_adam_fp8(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            fp8_dtype
+        )
+    def multi_tensor_adam_capturable(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable(*args, **kwargs)
-
-    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_adam_capturable_master(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
-
-    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable_master(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_sgd(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        wd: float,
+        momentum: float,
+        dampening: float,
+        lr: float,
+        nesterov: bool,
+        first_run: bool,
+        wd_after_momentum: bool,
+        scale: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_sgd(*args, **kwargs)
-
-    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_sgd(
+            chunk_size, noop_flag, tensor_lists,
+            wd, momentum, dampening,
+            lr, nesterov, first_run,
+            wd_after_momentum, scale
+        )
+    def multi_tensor_compute_scale_and_scale_inv(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        max_fp8: float,
+        force_pow_2_scales: bool,
+        epsilon: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+        return tex.multi_tensor_compute_scale_and_scale_inv(
+            chunk_size, noop_flag, tensor_lists,
+            max_fp8, force_pow_2_scales, epsilon
+        )
 
+    # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
-        allgather_communicator: Any,
+        allgather_communicator: CommOverlap,
         send_stream: Any,
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
         return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
 
+############## class func #################################
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionCUDA
+        return FlashAttentionCUDA
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
-
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
         intra_node_group: Optional[Any] = None,
-    ) -> Any:
+    ) -> "CommOverlapHelper":
         tex = self._get_tex()
-        if world_group is None:
-            return tex.CommOverlapHelper()
         return tex.CommOverlapHelper(world_group, intra_node_group)
-
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1082,7 +1394,7 @@ def create_comm_overlap(
         set_sm_margin: bool = True,
         atomic_gemm: bool = False,
         rs_overlap_first_gemm: bool = False,
-    ) -> Any:
+    ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
             buffer_shape, buffer_dtype, helper, tp_size,
@@ -1090,7 +1402,6 @@ def create_comm_overlap(
             gemm_priority, comm_priority, num_comm_sm,
             set_sm_margin, atomic_gemm, rs_overlap_first_gemm
         )
-
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1107,7 +1418,7 @@ def create_comm_overlap_p2p(
         atomic_gemm: bool = False,
         use_ce: bool = True,
         aggregate: bool = False,
-    ) -> Any:
+    ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
             buffer_shape, buffer_dtype, helper, tp_size, comm_type,
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index 92e8868ed9..c87aef8430 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -5,10 +5,8 @@
 import os
 import sys
 from typing import Any, Dict, List, Optional, Tuple, Union
-
 import torch
-
-from ....ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
+from ....ops import *
 
 def _load_hygon_libs():
     import ctypes
@@ -78,69 +76,6 @@ def _get_tex():
     import transformer_engine_torch_hygon
     return transformer_engine_torch_hygon
 
-def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
-    if torch_dtype is None:
-        return None
-
-    NativeDType = tex_module.DType
-    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_torch_hygon':
-        return torch_dtype
-
-    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
-        from transformer_engine.plugin.core.ops import DType as PyDType
-        if isinstance(torch_dtype, PyDType):
-            dtype_name = torch_dtype.name
-            if hasattr(NativeDType, dtype_name):
-                return getattr(NativeDType, dtype_name)
-
-    dtype_map = {
-        torch.float32: NativeDType.kFloat32,
-        torch.float16: NativeDType.kFloat16,
-        torch.bfloat16: NativeDType.kBFloat16,
-        torch.int32: NativeDType.kInt32,
-        torch.uint8: NativeDType.kByte,
-    }
-
-    if hasattr(torch, 'float8_e4m3fn'):
-        dtype_map[torch.float8_e4m3fn] = NativeDType.kFloat8E4M3
-    if hasattr(torch, 'float8_e5m2'):
-        dtype_map[torch.float8_e5m2] = NativeDType.kFloat8E5M2
-
-    return dtype_map.get(torch_dtype, torch_dtype)
-
-def _convert_dtype_params(func):
-    import functools
-    import inspect
-
-    @functools.wraps(func)
-    def wrapper(self, *args, **kwargs):
-        dtype_params = ['otype', 'output_dtype', 'bias_type']
-
-        from transformer_engine.plugin.core.ops import DType as PyDType
-
-        def needs_conversion(val):
-            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
-
-        for param_name in dtype_params:
-            if param_name in kwargs:
-                value = kwargs[param_name]
-                if needs_conversion(value):
-                    converted = self._to_te_dtype(value)
-                    kwargs[param_name] = converted
-
-        sig = inspect.signature(func)
-        param_names = list(sig.parameters.keys())[1:]
-
-        args_list = list(args)
-        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
-            if param_name in dtype_params and needs_conversion(arg_value):
-                converted = self._to_te_dtype(arg_value)
-                args_list[i] = converted
-
-        return func(self, *args_list, **kwargs)
-
-    return wrapper
-
 class HygonBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -154,16 +89,9 @@ def _get_tex(self):
             self._tex = _get_tex()
         return self._tex
 
-    def _to_te_dtype(self, torch_dtype):
-        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
-
     def is_available(self) -> bool:
         return _check_hygon_available()
 
-    def get_flash_attention_class(self):
-        from .flash_attention import FlashAttentionHYGON
-        return FlashAttentionHYGON
-
     def get_attention_backend(self, attention_params=None):
         from packaging.version import Version as PkgVersion
         from ....logger_manager import get_logger
@@ -196,6 +124,7 @@ def get_attention_backend(self, attention_params=None):
             available_backends,
         )
 
+##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -206,35 +135,34 @@ def quantize(
         tex = self._get_tex()
         return tex.quantize(tensor, quantizer, output, noop)
 
-    @_convert_dtype_params
     def dequantize(
         self,
-        input: torch.Tensor,
-        otype: torch.dtype,
-    ) -> torch.Tensor:
+        input: Any,
+        otype: DType,
+    ) -> Any:
         tex = self._get_tex()
+        otype = tex.DType(int(otype)) if otype is not None else None
         return tex.dequantize(input, otype)
 
     def bgrad_quantize(
         self,
         input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.bgrad_quantize(input, quantizer)
 
-    @_convert_dtype_params
     def generic_gemm(
         self,
-        A: torch.Tensor,
+        A: Any,
         transA: bool,
-        B: torch.Tensor,
+        B: Any,
         transB: bool,
-        D: torch.Tensor,
+        D: Any,
         quantizer: Any,
-        output_dtype: torch.dtype,
+        output_dtype: Optional[DType],
         bias: Optional[torch.Tensor],
-        bias_type: Any,
+        bias_type: DType,
         gelu: bool,
         gelu_in: Optional[torch.Tensor],
         grad: bool,
@@ -243,68 +171,56 @@ def generic_gemm(
         accumulate: bool,
         use_split_accumulator: bool,
         comm_overlap: Optional[Any] = None,
-        comm_type: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
         extra_output: Optional[torch.Tensor] = None,
         bulk_overlap: bool = False,
         alpha: float = 1.0,
         beta: Optional[float] = None,
-    ) -> Any:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        if bias_type is None:
-            bias_type = self._to_te_dtype(torch.bfloat16)
-
+        
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
+        output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
             A, transA, B, transB, D, quantizer, output_dtype,
             bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
             accumulate, use_split_accumulator, comm_overlap, comm_type,
             extra_output, bulk_overlap, alpha, beta
         )
-
-    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.te_general_grouped_gemm(*args, **kwargs)
-
+    # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
-
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
-
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
-
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
-
+    # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
-
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
-
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
-
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
-
+    # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
-
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
-
     def clamped_swiglu(
         self,
         input: torch.Tensor,
@@ -314,47 +230,39 @@ def clamped_swiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
-
+    # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
-
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
-
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
-    
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
-
+    # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
-
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
-
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
-
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
-
+    # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
-
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
-
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -365,131 +273,207 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
-
-    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    # DBias + DAct fusions #
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
-
-    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-
-    @_convert_dtype_params
+    # Permutation functions                                                                                                                                                                                                                                                                                             
+    def moe_permute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        indices: torch.Tensor,
+        num_out_tokens: int,
+        workspace: List[torch.Tensor],
+        max_expanded_token_num: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+    def moe_permute_bwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_bwd(
+        self,
+        input_bwd: torch.Tensor,
+        input_fwd: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+    # Softmax functions
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+    def scaled_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grads_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(
+            output_grads_, softmax_results_, scale_factor
+        )
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(
+            output_grad_, softmax_results_, scale_factor
+        )
+    # Other granular functions
     def layernorm_fwd(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         bias: Optional[torch.Tensor],
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, mu, rsigma = tex.layernorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-        return y, mu, rsigma
-
     def layernorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         mu: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dgamma, dbeta
-
-    @_convert_dtype_params
+        return tex.layernorm_bwd(
+            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
+        )
     def rmsnorm_fwd(
         self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
+        input: Any,
+        weight: Any,
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, y_quant, rsigma = tex.rmsnorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-            if y_quant is not None:
-                y_quant = y_quant.view(*orig_shape)
-        return y, y_quant, rsigma
-
     def rmsnorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-        eps: float = 1e-5,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dw
-
-    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+    def rmsnorm_bwd_add(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        add: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.rmsnorm_bwd_add(*args, **kwargs)
+        return tex.rmsnorm_bwd_add(dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma)
 
     def multi_tensor_quantize(
         self,
@@ -498,7 +482,6 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
-
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -507,150 +490,457 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
-
-    def moe_permute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_permute_fwd(*args, **kwargs)
-
-    def moe_permute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_permute_bwd(*args, **kwargs)
-
-    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_unpermute_fwd(*args, **kwargs)
-
-    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_unpermute_bwd(*args, **kwargs)
-
-    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
+    def te_general_grouped_gemm(
+        self,
+        A: List[Any],
+        transa: bool,
+        B: List[Any],
+        transb: bool,
+        D: Optional[List[torch.Tensor]],
+        D_type: DType,
+        m_splits: List[int],
+        bias: List[torch.Tensor],
+        bias_type: DType,
+        single_output: bool,
+        pre_gelu_out: List[torch.Tensor],
+        grad: bool,
+        workspace: List[torch.Tensor],
+        workspaceSizes: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        math_sm_count: int,
+    ) -> Optional[List[torch.Tensor]]:
+        tex = self._get_tex()
+        D_type = tex.DType(int(D_type)) if D_type is not None else None
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        return tex.te_general_grouped_gemm(
+            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
+            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
+            accumulate, use_split_accumulator, math_sm_count
+        )
+    def fp8_transpose(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        out: Optional[torch.Tensor],
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_softmax_forward(input, scale)
-
-    def scaled_softmax_backward(
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.fp8_transpose(input, dtype, out)
+    def swap_first_dims(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
+        tensor: torch.Tensor,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
+        return tex.swap_first_dims(tensor, out)
+    def get_fused_attn_backend(
+        self,
+        is_training: bool,
+        q_dtype: DType,
+        kv_dtype: DType,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        p_dropout: float,
+        num_attn_heads: int,
+        num_gqa_groups: int,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        head_dim_qk: int,
+        head_dim_v: int,
+        window_size_left: int,
+        window_size_right: int,
+        return_max_logit: bool,
+    ) -> NVTE_Fused_Attn_Backend:
+        tex = self._get_tex()
+
+        q_dtype = tex.DType(int(q_dtype)) if q_dtype is not None else None
+        kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        result = tex.get_fused_attn_backend(
+            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
+            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
+            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
+            head_dim_v, window_size_left, window_size_right, return_max_logit
+        )
+        return NVTE_Fused_Attn_Backend(result)
 
-    def scaled_masked_softmax_forward(
+    def compute_amax(
         self,
         input: torch.Tensor,
-        mask: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_forward(input, mask, scale)
-
-    def scaled_masked_softmax_backward(
+        return tex.compute_amax(input, amax)
+    def fused_amax_and_scale_update_after_reduction(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax_reduction_buffer: torch.Tensor,
+        amax_histories: List[torch.Tensor],
+        scales: List[torch.Tensor],
+        amax_compute_algo: str,
+        fp8_dtype: DType,
+        margin: float,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
-
-    def scaled_upper_triang_masked_softmax_forward(
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.fused_amax_and_scale_update_after_reduction(
+            amax_reduction_buffer, amax_histories, scales,
+            amax_compute_algo, fp8_dtype, margin
+        )
+    def fp8_block_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.fp8_block_scaling_compute_partial_amax(
+            tensor, amax, h, w, start_offset, block_len
+        )
+    def fp8_block_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
+        tex = self._get_tex()
+        out_dtype = tex.DType(int(out_dtype)) if out_dtype is not None else None
+        return tex.fp8_block_scaling_partial_cast(
+            inp, out, scale, h, w, start_offset, block_len, out_dtype
+        )
+    def fused_multi_row_padding(
         self,
         input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        output: torch.Tensor,
+        input_row_list: List[int],
+        padded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
-
-    def scaled_upper_triang_masked_softmax_backward(
+        return tex.fused_multi_row_padding(
+            input, output, input_row_list, padded_input_row_list
+        )
+    def fused_multi_row_unpadding(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        unpadded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)
+        return tex.fused_multi_row_unpadding(
+            input, output, input_row_list, unpadded_input_row_list
+        )
 
-    def scaled_aligned_causal_masked_softmax_forward(
+    # attention kernels
+    def fa_prepare_fwd(
         self,
-        input: torch.Tensor,
-        scale: float,
+        qkvi: torch.Tensor,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
-
-    def scaled_aligned_causal_masked_softmax_backward(
+        return tex.fa_prepare_fwd(qkvi)
+    def fa_prepare_bwd(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
-
-    def get_fused_attn_backend(self, *args, **kwargs) -> int:
-        raise NotImplementedError("get_fused_attn_backend - not implemented in hygon backend")
-
-    def fused_attn_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_attn_fwd - not implemented in hygon backend")
-
-    def fused_attn_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError("fused_attn_bwd - not implemented in hygon backend")
-
-    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        return tex.fa_prepare_bwd(q, k, v)
+    def fused_attn_fwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        is_training: bool,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        fake_dtype: torch.dtype,
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        page_table_k: Optional[torch.Tensor],
+        page_table_v: Optional[torch.Tensor],
+        s_quantizer: Any,
+        o_quantizer: Any,
+        Bias: Optional[torch.Tensor],
+        SoftmaxOffset: Optional[torch.Tensor],
+        rng_gen: Optional[torch.Generator],
+        rng_elts_per_thread: int,
+        return_max_logit: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.fa_prepare_fwd(*args, **kwargs)
 
-    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        return tex.fused_attn_fwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            is_training,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            fake_dtype,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            page_table_k,
+            page_table_v,
+            s_quantizer,
+            o_quantizer,
+            Bias,
+            SoftmaxOffset,
+            rng_gen,
+            rng_elts_per_thread,
+            return_max_logit
+        )
+    def fused_attn_bwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        deterministic: bool,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        O: Any,
+        dO: Any,
+        fake_dtype: torch.dtype,
+        dqkv_type: DType,
+        Aux_CTX_Tensors: List[torch.Tensor],
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        s_quantizer: Any,
+        dp_quantizer: Any,
+        dqkv_quantizer: Any,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.fa_prepare_bwd(*args, **kwargs)
 
-    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
+
+        return tex.fused_attn_bwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            deterministic,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            O,
+            dO,
+            fake_dtype,
+            dqkv_type,
+            Aux_CTX_Tensors,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            s_quantizer,
+            dp_quantizer,
+            dqkv_quantizer
+        )
+    def copy_to_kv_cache(
+        self,
+        new_k: torch.Tensor,
+        new_v: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        page_table: torch.Tensor,
+        cu_new_lens: torch.Tensor,
+        cu_cached_lens: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        b: int,
+        max_ctx_len: int,
+        max_seq_len: int,
+        max_pages_per_seq: int,
+        is_non_paged: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.copy_to_kv_cache(*args, **kwargs)
-
-    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.copy_to_kv_cache(
+            new_k,
+            new_v,
+            k_cache,
+            v_cache,
+            page_table,
+            cu_new_lens,
+            cu_cached_lens,
+            qkv_format,
+            b,
+            max_ctx_len,
+            max_seq_len,
+            max_pages_per_seq,
+            is_non_paged
+        )
+    def convert_thd_to_bshd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        b: int,
+        max_seq_len: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.convert_thd_to_bshd(*args, **kwargs)
-
-    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+    def convert_bshd_to_thd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        t: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.convert_bshd_to_thd(*args, **kwargs)
+        return tex.convert_bshd_to_thd(tensor, cu_seqlens, t)
 
-    def fused_rope_forward(self, *args, **kwargs) -> Any:
+    # fused apply rope
+    def fused_rope_forward(
+        self,
+        input: torch.Tensor,
+        freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_forward(*args, **kwargs)
-
-    def fused_rope_backward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_forward(
+            input, freqs, start_positions, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_rope_backward(
+        self,
+        output_grads: torch.Tensor,
+        freqs: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_backward(*args, **kwargs)
-
-    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_backward(
+            output_grads, freqs, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_qkv_rope_forward(
+        self,
+        qkv_input: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_forward(*args, **kwargs)
-
-    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_forward(
+            qkv_input, q_freqs, k_freqs, start_positions,
+            qkv_split_arg_list, qkv_format, interleaved,
+            cp_size, cp_rank
+        )
+    def fused_qkv_rope_backward(
+        self,
+        q_grad_out: torch.Tensor,
+        k_grad_out: torch.Tensor,
+        v_grad_out: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_backward(*args, **kwargs)
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_backward(
+            q_grad_out, k_grad_out, v_grad_out,
+            q_freqs, k_freqs, qkv_split_arg_list,
+            qkv_format, interleaved, cp_size, cp_rank
+        )
 
+    # fused router
     def fused_topk_with_score_function_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        num_groups: int,
-        group_topk: int,
-        scaling_factor: float,
-        score_function: Any,
+        num_groups: Optional[int],
+        group_topk: Optional[int],
+        scaling_factor: Optional[float],
+        score_function: str,
         expert_bias: Optional[torch.Tensor],
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_fwd(
-            logits, topk, use_pre_softmax, num_groups, group_topk,
-            scaling_factor, score_function, expert_bias
+            logits,
+            topk,
+            use_pre_softmax,
+            num_groups,
+            group_topk,
+            scaling_factor,
+            score_function,
+            expert_bias,
         )
-
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -660,24 +950,33 @@ def fused_topk_with_score_function_bwd(
         grad_probs: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        scaling_factor: float,
-        score_function: Any,
-    ) -> Any:
+        scaling_factor: Optional[float],
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_bwd(
-            num_tokens, num_experts, routing_map, intermediate_output,
-            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+            num_tokens,
+            num_experts,
+            routing_map,
+            intermediate_output,
+            grad_probs,
+            topk,
+            use_pre_softmax,
+            scaling_factor,
+            score_function,
         )
-
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
-
+        return tex.fused_score_for_moe_aux_loss_fwd(
+            logits,
+            topk,
+            score_function,
+        )
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -685,13 +984,17 @@ def fused_score_for_moe_aux_loss_bwd(
         intermediate_output: torch.Tensor,
         grad_scores: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_score_for_moe_aux_loss_bwd(
-            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+            num_tokens,
+            num_experts,
+            intermediate_output,
+            grad_scores,
+            topk,
+            score_function,
         )
-
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -702,13 +1005,18 @@ def fused_moe_aux_loss_fwd(
         num_cols: int,
         topk: int,
         coeff: float,
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_moe_aux_loss_fwd(
-            probs, tokens_per_expert, total_num_tokens, num_experts,
-            num_rows, num_cols, topk, coeff
+            probs,
+            tokens_per_expert,
+            total_num_tokens,
+            num_experts,
+            num_rows,
+            num_cols,
+            topk,
+            coeff,
         )
-
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -716,147 +1024,146 @@ def fused_moe_aux_loss_bwd(
         num_rows: int,
         num_cols: int,
         grad_aux_loss: torch.Tensor,
-    ) -> Any:
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(
-            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
-        )
+        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
 
+    # Dropout
     def dropout_fwd(
         self,
         input: torch.Tensor,
         dropout_probability: float,
-        out: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
-
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
         mask: torch.Tensor,
         dropout_probability: float,
-        grad_input: Optional[torch.Tensor] = None,
+        grad_input: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
 
-    def fp8_transpose(
-        self,
-        input: torch.Tensor,
-        dtype: Any,
-        *,
-        out: torch.Tensor,
-    ) -> None:
+    # Misc
+    def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
-        tex.fp8_transpose(input, dtype, out=out)
+        return tex.get_cublasLt_version()
+    def get_cudnn_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cudnn_version()
+    def get_num_cublas_streams(self) -> int:
+        tex = self._get_tex()
+        return tex.get_num_cublas_streams()
 
-    def swap_first_dims(
+    # Support THD format for Context Parallel
+    def thd_read_half_tensor(
         self,
         tensor: torch.Tensor,
-        *,
-        out: torch.Tensor,
-    ) -> None:
+        cu_seqlens: torch.Tensor,
+        half_idx: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        tex.swap_first_dims(tensor, out=out)
-
-    def compute_amax(
+        return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+    def thd_second_half_lse_correction(
         self,
-        input: torch.Tensor,
-        amax: torch.Tensor,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        tex.compute_amax(input, amax)
-
-    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
-        tex = self._get_tex()
-        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
-
-    def fp8_block_scaling_compute_partial_amax(
+        return tex.thd_second_half_lse_correction(
+            lse, lse_per_step, cu_seqlens, lse_packed
+        )
+    def thd_read_second_half_lse(
         self,
-        tensor: torch.Tensor,
-        amax: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-    ) -> None:
+        lse: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+        second_half_lse_seqlen: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
-
-    def fp8_block_scaling_partial_cast(
+        return tex.thd_read_second_half_lse(
+            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
+        )
+    def thd_out_correction(
         self,
-        inp: torch.Tensor,
         out: torch.Tensor,
-        scale: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-        out_dtype: Any,
+        out_per_step: torch.Tensor,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        only_second_half: bool,
+        lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
-
-    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_padding(*args, **kwargs)
-
-    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(*args, **kwargs)
-
-    def get_cublasLt_version(self) -> int:
-        tex = self._get_tex()
-        return tex.get_cublasLt_version()
-
-    def get_cudnn_version(self) -> int:
-        tex = self._get_tex()
-        return tex.get_cudnn_version()
-
-    def get_num_cublas_streams(self) -> int:
-        tex = self._get_tex()
-        return tex.get_num_cublas_streams()
-
-    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+        return tex.thd_out_correction(
+            out, out_per_step, lse, lse_per_step,
+            cu_seqlens, only_second_half, lse_packed
+        )
+    def thd_grad_correction(
+        self,
+        grad: torch.Tensor,
+        grad_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        first_half: str,
+        second_half: str,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_read_half_tensor(*args, **kwargs)
-
-    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_grad_correction(
+            grad, grad_per_step, cu_seqlens,
+            first_half, second_half
+        )
+    def thd_get_partitioned_indices(
+        self,
+        cu_seqlens: torch.Tensor,
+        total_tokens: int,
+        world_size: int,
+        rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(*args, **kwargs)
+        return tex.thd_get_partitioned_indices(
+            cu_seqlens, total_tokens, world_size, rank
+        )
 
-    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+    # nvshmem functions
+    def init_nvshmem_backend(
+        self,
+        process_group: Any,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(*args, **kwargs)
-
-    def thd_out_correction(self, *args, **kwargs) -> Any:
+        return tex.init_nvshmem_backend(process_group)
+    def create_nvshmem_tensor(
+        self,
+        shape: List[int],
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_out_correction(*args, **kwargs)
-
-    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        return tex.create_nvshmem_tensor(shape, dtype)
+    def nvshmem_send_on_current_stream(
+        self,
+        src: torch.Tensor,
+        dst: torch.Tensor,
+        peer: int,
+        signal: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(*args, **kwargs)
-
-    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+    def nvshmem_wait_on_current_stream(
+        self,
+        signal: torch.Tensor,
+        wait_kind: str,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(*args, **kwargs)
-
-    def init_nvshmem_backend(self, *args, **kwargs) -> None:
-        raise NotImplementedError("init_nvshmem_backend - not implemented in hygon backend")
-
-    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
-        raise NotImplementedError("create_nvshmem_tensor - not implemented in hygon backend")
-
-    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
-        raise NotImplementedError("nvshmem_send_on_current_stream - not implemented in hygon backend")
-
-    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
-        raise NotImplementedError("nvshmem_wait_on_current_stream - not implemented in hygon backend")
-
+        return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
     def nvshmem_finalize(self) -> None:
-        raise NotImplementedError("nvshmem_finalize - not implemented in hygon backend")
+        tex = self._get_tex()
+        return tex.nvshmem_finalize()
 
+    # multi-tensor functions
     def multi_tensor_scale(
         self,
         chunk_size: int,
@@ -865,98 +1172,195 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
-
+        return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
-
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        scale: torch.Tensor,
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        inv_scale: torch.Tensor,
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
-
+        return tex.multi_tensor_unscale_l2norm(
+            chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
+        )
     def multi_tensor_adam(
         self,
-        chunk_size: int = None,
-        noop_flag: torch.Tensor = None,
-        tensor_lists: List[List[torch.Tensor]] = None,
-        lr: float = None,
-        beta1: float = None,
-        beta2: float = None,
-        eps: float = None,
-        step: int = None,
-        mode: int = None,
-        bias_correction: int = None,
-        weight_decay: float = None,
-    ):
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         tex = self._get_tex()
-        if chunk_size is None:
-            return tex.multi_tensor_adam
-        tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
-            eps, step, mode, bias_correction, weight_decay
+        return tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
         )
-
-    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+    def multi_tensor_adam_param_remainder(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
-
-    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_param_remainder(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
+        )
+    def multi_tensor_adam_fp8(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        fp8_dtype: DType,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_fp8(*args, **kwargs)
-
-    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.multi_tensor_adam_fp8(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            fp8_dtype
+        )
+    def multi_tensor_adam_capturable(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable(*args, **kwargs)
-
-    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_adam_capturable_master(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
-
-    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable_master(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_sgd(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        wd: float,
+        momentum: float,
+        dampening: float,
+        lr: float,
+        nesterov: bool,
+        first_run: bool,
+        wd_after_momentum: bool,
+        scale: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_sgd(*args, **kwargs)
-
-    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_sgd(
+            chunk_size, noop_flag, tensor_lists,
+            wd, momentum, dampening,
+            lr, nesterov, first_run,
+            wd_after_momentum, scale
+        )
+    def multi_tensor_compute_scale_and_scale_inv(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        max_fp8: float,
+        force_pow_2_scales: bool,
+        epsilon: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+        return tex.multi_tensor_compute_scale_and_scale_inv(
+            chunk_size, noop_flag, tensor_lists,
+            max_fp8, force_pow_2_scales, epsilon
+        )
 
+    # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
-        allgather_communicator: Any,
+        allgather_communicator: CommOverlap,
         send_stream: Any,
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
         return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
 
+############## class func #################################
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionHYGON
+        return FlashAttentionHYGON
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
-
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
         intra_node_group: Optional[Any] = None,
-    ) -> Any:
+    ) -> "CommOverlapHelper":
         tex = self._get_tex()
-        if world_group is None:
-            return tex.CommOverlapHelper()
         return tex.CommOverlapHelper(world_group, intra_node_group)
-
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -972,7 +1376,7 @@ def create_comm_overlap(
         set_sm_margin: bool = True,
         atomic_gemm: bool = False,
         rs_overlap_first_gemm: bool = False,
-    ) -> Any:
+    ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
             buffer_shape, buffer_dtype, helper, tp_size,
@@ -980,7 +1384,6 @@ def create_comm_overlap(
             gemm_priority, comm_priority, num_comm_sm,
             set_sm_margin, atomic_gemm, rs_overlap_first_gemm
         )
-
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -997,7 +1400,7 @@ def create_comm_overlap_p2p(
         atomic_gemm: bool = False,
         use_ce: bool = True,
         aggregate: bool = False,
-    ) -> Any:
+    ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
             buffer_shape, buffer_dtype, helper, tp_size, comm_type,
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
index 5013fa7c23..294e79fcb9 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -7,8 +7,7 @@
 import math
 import torch
 
-from ....ops import TEFLBackendBase, FP8TensorMeta
-
+from ....ops import *
 
 def _load_iluvatar_libs():
     import ctypes
@@ -105,67 +104,6 @@ def _get_tex():
     import transformer_engine_iluvatar.pytorch.ixte_torch
     return transformer_engine_iluvatar.pytorch.ixte_torch
 
-def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
-    if torch_dtype is None:
-        return None
-
-    NativeDType = tex_module.DType
-    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_iluvatar.pytorch.ixte_torch':
-        return torch_dtype
-
-    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
-        from transformer_engine.plugin.core.ops import DType as PyDType
-        if isinstance(torch_dtype, PyDType):
-            dtype_name = torch_dtype.name
-            if hasattr(NativeDType, dtype_name):
-                return getattr(NativeDType, dtype_name)
-
-    dtype_map = {
-        torch.uint8: NativeDType.kByte,
-        torch.float8_e4m3fn: NativeDType.kFloat8E4M3,
-        torch.float8_e5m2: NativeDType.kFloat8E5M2,
-        torch.int32: NativeDType.kInt32,
-        torch.float32: NativeDType.kFloat32,
-        torch.half: NativeDType.kFloat16,
-        torch.bfloat16: NativeDType.kBFloat16,
-    }
-
-    return dtype_map.get(torch_dtype, torch_dtype)
-
-def _convert_dtype_params(func):
-    import functools
-    import inspect
-    import os
-
-    @functools.wraps(func)
-    def wrapper(self, *args, **kwargs):
-        dtype_params = ['otype', 'output_dtype', 'bias_type']
-
-        from transformer_engine.plugin.core.ops import DType as PyDType
-
-        def needs_conversion(val):
-            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
-
-        for param_name in dtype_params:
-            if param_name in kwargs:
-                value = kwargs[param_name]
-                if needs_conversion(value):
-                    converted = self._to_te_dtype(value)
-                    kwargs[param_name] = converted
-
-        sig = inspect.signature(func)
-        param_names = list(sig.parameters.keys())[1:]
-
-        args_list = list(args)
-        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
-            if param_name in dtype_params and needs_conversion(arg_value):
-                converted = self._to_te_dtype(arg_value)
-                args_list[i] = converted
-
-        return func(self, *args_list, **kwargs)
-
-    return wrapper
-
 class IluvatarBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -179,18 +117,42 @@ def _get_tex(self):
             self._tex = _get_tex()
         return self._tex
 
-    def _to_te_dtype(self, torch_dtype):
-        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
-
     def is_available(self) -> bool:
         return _check_iluvatar_available()
-    
-    def get_flash_attention_class(self):
-        raise NotImplementedError("get_flash_attention_class - not implemented in iluvatar backend")
 
     def get_attention_backend(self, attention_params=None):
-        raise NotImplementedError("get_attention_backend - not implemented in iluvatar backend")
-    
+        from packaging.version import Version as PkgVersion
+        from ....logger_manager import get_logger
+        logger = get_logger()
+
+        # Read environment variables to determine which backends to enable
+        use_flash_attention = int(os.getenv("NVTE_FLASH_ATTN", "1"))
+        use_fused_attention = int(os.getenv("NVTE_FUSED_ATTN", "1"))
+        use_unfused_attention = int(os.getenv("NVTE_UNFUSED_ATTN", "1"))
+
+        # Log disabled backends
+        if not use_flash_attention:
+            logger.info_once("Disabling FlashAttention due to NVTE_FLASH_ATTN=0")
+        if not use_fused_attention:
+            logger.info_once("Disabling FusedAttention due to NVTE_FUSED_ATTN=0")
+        if not use_unfused_attention:
+            logger.info_once("Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0")
+
+        flash_attention_backend = PkgVersion("2.6.0") if use_flash_attention else None
+        fused_attention_backend = NVTE_Fused_Attn_Backend.NVTE_No_Backend
+
+        available_backends = [use_flash_attention, use_fused_attention, use_unfused_attention]
+
+        return (
+            use_flash_attention,
+            flash_attention_backend,
+            use_fused_attention,
+            fused_attention_backend,
+            use_unfused_attention,
+            available_backends,
+        )
+
+##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -201,35 +163,34 @@ def quantize(
         tex = self._get_tex()
         return tex.quantize(tensor, quantizer, output, noop)
 
-    @_convert_dtype_params
     def dequantize(
         self,
-        input: torch.Tensor,
-        otype: torch.dtype,
-    ) -> torch.Tensor:
+        input: Any,
+        otype: DType,
+    ) -> Any:
         tex = self._get_tex()
+        otype = tex.DType(int(otype)) if otype is not None else None
         return tex.dequantize(input, otype)
 
     def bgrad_quantize(
         self,
         input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.bgrad_quantize(input, quantizer)
 
-    @_convert_dtype_params
     def generic_gemm(
         self,
-        A: torch.Tensor,
+        A: Any,
         transA: bool,
-        B: torch.Tensor,
+        B: Any,
         transB: bool,
-        D: torch.Tensor,
+        D: Any,
         quantizer: Any,
-        output_dtype: torch.dtype,
+        output_dtype: Optional[DType],
         bias: Optional[torch.Tensor],
-        bias_type: Any,
+        bias_type: DType,
         gelu: bool,
         gelu_in: Optional[torch.Tensor],
         grad: bool,
@@ -238,119 +199,98 @@ def generic_gemm(
         accumulate: bool,
         use_split_accumulator: bool,
         comm_overlap: Optional[Any] = None,
-        comm_type: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
         extra_output: Optional[torch.Tensor] = None,
         bulk_overlap: bool = False,
         alpha: float = 1.0,
         beta: Optional[float] = None,
-    ) -> Any:
-        # Check shape
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        if bias_type is None:
-            bias_type = self._to_te_dtype(torch.bfloat16)
-
+        
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
+        output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
             A, transA, B, transB, D, quantizer, output_dtype,
             bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
             accumulate, use_split_accumulator, comm_overlap, comm_type,
             extra_output, bulk_overlap, alpha, beta
         )
-
-    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.te_general_grouped_gemm(*args, **kwargs)
-
+    # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
-
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
-
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
-
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
-
+    # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
-
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
-
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
-
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
-
+    # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
-
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
-
     def clamped_swiglu(
-            self,
-            input: torch.Tensor,
-            quantizer: Any,
-            limit: float = 7.0,
-            alpha: float = 1.702,
-        ) -> Any:
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
-
+    # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
-
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
-
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
-
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
-
+    # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
-
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
-
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
-
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
-
+    # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
-
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
-
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -361,131 +301,207 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
-
-    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    # DBias + DAct fusions #
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
-
-    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-
-    @_convert_dtype_params
+    # Permutation functions                                                                                                                                                                                                                                                                                             
+    def moe_permute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        indices: torch.Tensor,
+        num_out_tokens: int,
+        workspace: List[torch.Tensor],
+        max_expanded_token_num: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+    def moe_permute_bwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_bwd(
+        self,
+        input_bwd: torch.Tensor,
+        input_fwd: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+    # Softmax functions
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+    def scaled_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grads_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(
+            output_grads_, softmax_results_, scale_factor
+        )
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(
+            output_grad_, softmax_results_, scale_factor
+        )
+    # Other granular functions
     def layernorm_fwd(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         bias: Optional[torch.Tensor],
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, mu, rsigma = tex.layernorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-        return y, mu, rsigma
-
     def layernorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         mu: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dgamma, dbeta
-
-    @_convert_dtype_params
+        return tex.layernorm_bwd(
+            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
+        )
     def rmsnorm_fwd(
         self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
+        input: Any,
+        weight: Any,
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, y_quant, rsigma = tex.rmsnorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-            if y_quant is not None:
-                y_quant = y_quant.view(*orig_shape)
-        return y, y_quant, rsigma
-
     def rmsnorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-        eps: float = 1e-5,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dw
-
-    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+    def rmsnorm_bwd_add(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        add: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.rmsnorm_bwd_add(*args, **kwargs)
+        return tex.rmsnorm_bwd_add(dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma)
 
     def multi_tensor_quantize(
         self,
@@ -494,7 +510,6 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
-
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -503,249 +518,457 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
-
-    def moe_permute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex._moe_permute_fwd(*args, **kwargs)
-    
-    def moe_permute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex._moe_permute_bwd(*args, **kwargs)
-
-    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex._moe_unpermute_fwd(*args, **kwargs)
-
-    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex._moe_unpermute_bwd(*args, **kwargs)
-
-    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
-        tex = self._get_tex()
-        return tex.scaled_softmax_forward(input, scale)
-
-    def scaled_softmax_backward(
+    def te_general_grouped_gemm(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
-        tex = self._get_tex()
-        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
-
-    def scaled_masked_softmax_forward(
+        A: List[Any],
+        transa: bool,
+        B: List[Any],
+        transb: bool,
+        D: Optional[List[torch.Tensor]],
+        D_type: DType,
+        m_splits: List[int],
+        bias: List[torch.Tensor],
+        bias_type: DType,
+        single_output: bool,
+        pre_gelu_out: List[torch.Tensor],
+        grad: bool,
+        workspace: List[torch.Tensor],
+        workspaceSizes: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        math_sm_count: int,
+    ) -> Optional[List[torch.Tensor]]:
+        tex = self._get_tex()
+        D_type = tex.DType(int(D_type)) if D_type is not None else None
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        return tex.te_general_grouped_gemm(
+            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
+            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
+            accumulate, use_split_accumulator, math_sm_count
+        )
+    def fp8_transpose(
         self,
         input: torch.Tensor,
-        mask: torch.Tensor,
-        scale: float,
+        dtype: DType,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_forward(input, mask, scale)
-
-    def scaled_masked_softmax_backward(
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.fp8_transpose(input, dtype, out)
+    def swap_first_dims(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
+        tensor: torch.Tensor,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
+        return tex.swap_first_dims(tensor, out)
+    def get_fused_attn_backend(
+        self,
+        is_training: bool,
+        q_dtype: DType,
+        kv_dtype: DType,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        p_dropout: float,
+        num_attn_heads: int,
+        num_gqa_groups: int,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        head_dim_qk: int,
+        head_dim_v: int,
+        window_size_left: int,
+        window_size_right: int,
+        return_max_logit: bool,
+    ) -> NVTE_Fused_Attn_Backend:
+        tex = self._get_tex()
+
+        q_dtype = tex.DType(int(q_dtype)) if q_dtype is not None else None
+        kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        result = tex.get_fused_attn_backend(
+            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
+            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
+            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
+            head_dim_v, window_size_left, window_size_right, return_max_logit
+        )
+        return NVTE_Fused_Attn_Backend(result)
 
-    def scaled_upper_triang_masked_softmax_forward(
+    def compute_amax(
         self,
         input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
-
-    def scaled_upper_triang_masked_softmax_backward(
+        return tex.compute_amax(input, amax)
+    def fused_amax_and_scale_update_after_reduction(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax_reduction_buffer: torch.Tensor,
+        amax_histories: List[torch.Tensor],
+        scales: List[torch.Tensor],
+        amax_compute_algo: str,
+        fp8_dtype: DType,
+        margin: float,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)  
-
-    def scaled_aligned_causal_masked_softmax_forward(
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.fused_amax_and_scale_update_after_reduction(
+            amax_reduction_buffer, amax_histories, scales,
+            amax_compute_algo, fp8_dtype, margin
+        )
+    def fp8_block_scaling_compute_partial_amax(
         self,
-        input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
-
-    def scaled_aligned_causal_masked_softmax_backward(
+        return tex.fp8_block_scaling_compute_partial_amax(
+            tensor, amax, h, w, start_offset, block_len
+        )
+    def fp8_block_scaling_partial_cast(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
-
-    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        out_dtype = tex.DType(int(out_dtype)) if out_dtype is not None else None
+        return tex.fp8_block_scaling_partial_cast(
+            inp, out, scale, h, w, start_offset, block_len, out_dtype
+        )
+    def fused_multi_row_padding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        padded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
-
-        args_list = list(args)
-
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-
-            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
-                return py_enum
-
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-
-            if hasattr(py_enum, 'value'):
-                enum_value = int(py_enum.value)
-                for member_name in dir(native_enum_class):
-                    if not member_name.startswith('_'):
-                        try:
-                            member = getattr(native_enum_class, member_name)
-                            if hasattr(member, 'value') and int(member.value) == enum_value:
-                                return member
-                        except:
-                            pass
-
-            if hasattr(py_enum, 'value'):
-                return int(py_enum.value)
-
-            return py_enum
-
-        if len(args) > 1:
-            args_list[1] = self._to_te_dtype(args[1])
-        if len(args) > 2:
-            args_list[2] = self._to_te_dtype(args[2])
-        if len(args) > 3:
-            args_list[3] = convert_enum(args[3], tex.NVTE_QKV_Layout)
-        if len(args) > 4:
-            args_list[4] = convert_enum(args[4], tex.NVTE_Bias_Type)
-        if len(args) > 5:
-            args_list[5] = convert_enum(args[5], tex.NVTE_Mask_Type)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_Softmax_Type)
-
-        return tex.get_fused_attn_backend(*args_list, **kwargs)
-
-    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        return tex.fused_multi_row_padding(
+            input, output, input_row_list, padded_input_row_list
+        )
+    def fused_multi_row_unpadding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        unpadded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(
+            input, output, input_row_list, unpadded_input_row_list
+        )
 
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
-                return py_enum
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-            return py_enum
-
-        args_list = list(args)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_QKV_Layout)
-        if len(args) > 7:
-            args_list[7] = convert_enum(args[7], tex.NVTE_Bias_Type)
-        if len(args) > 8:
-            args_list[8] = convert_enum(args[8], tex.NVTE_Mask_Type)
-        if len(args) > 9:
-            args_list[9] = convert_enum(args[9], tex.NVTE_Softmax_Type)
-
-        return tex.fused_attn_fwd(*args_list, **kwargs)
-
-    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+    # attention kernels
+    def fa_prepare_fwd(
+        self,
+        qkvi: torch.Tensor,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-            if type(py_enum).__module__ == 'transformer_engine_torch_nv':
-                return py_enum
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-            return py_enum
-
-        args_list = list(args)
-        if len(args) > 5:
-            args_list[5] = convert_enum(args[5], tex.NVTE_QKV_Layout)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_Bias_Type)
-        if len(args) > 7:
-            args_list[7] = convert_enum(args[7], tex.NVTE_Mask_Type)
-        if len(args) > 8:
-            args_list[8] = convert_enum(args[8], tex.NVTE_Softmax_Type)
-        if len(args) > 19:
-            args_list[19] = self._to_te_dtype(args[19])
-
-        if 'dqkv_dtype' in kwargs:
-            kwargs['dqkv_dtype'] = self._to_te_dtype(kwargs['dqkv_dtype'])
-
-        return tex.fused_attn_bwd(*args_list, **kwargs)
-
-    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        return tex.fa_prepare_fwd(qkvi)
+    def fa_prepare_bwd(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fa_prepare_fwd(*args, **kwargs)
-
-    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        return tex.fa_prepare_bwd(q, k, v)
+    def fused_attn_fwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        is_training: bool,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        fake_dtype: torch.dtype,
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        page_table_k: Optional[torch.Tensor],
+        page_table_v: Optional[torch.Tensor],
+        s_quantizer: Any,
+        o_quantizer: Any,
+        Bias: Optional[torch.Tensor],
+        SoftmaxOffset: Optional[torch.Tensor],
+        rng_gen: Optional[torch.Generator],
+        rng_elts_per_thread: int,
+        return_max_logit: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.fa_prepare_bwd(*args, **kwargs)
 
-    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        return tex.fused_attn_fwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            is_training,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            fake_dtype,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            page_table_k,
+            page_table_v,
+            s_quantizer,
+            o_quantizer,
+            Bias,
+            SoftmaxOffset,
+            rng_gen,
+            rng_elts_per_thread,
+            return_max_logit
+        )
+    def fused_attn_bwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        deterministic: bool,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        O: Any,
+        dO: Any,
+        fake_dtype: torch.dtype,
+        dqkv_type: DType,
+        Aux_CTX_Tensors: List[torch.Tensor],
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        s_quantizer: Any,
+        dp_quantizer: Any,
+        dqkv_quantizer: Any,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.copy_to_kv_cache(*args, **kwargs)
 
-    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
+
+        return tex.fused_attn_bwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            deterministic,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            O,
+            dO,
+            fake_dtype,
+            dqkv_type,
+            Aux_CTX_Tensors,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            s_quantizer,
+            dp_quantizer,
+            dqkv_quantizer
+        )
+    def copy_to_kv_cache(
+        self,
+        new_k: torch.Tensor,
+        new_v: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        page_table: torch.Tensor,
+        cu_new_lens: torch.Tensor,
+        cu_cached_lens: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        b: int,
+        max_ctx_len: int,
+        max_seq_len: int,
+        max_pages_per_seq: int,
+        is_non_paged: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.convert_thd_to_bshd(*args, **kwargs)
-
-    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.copy_to_kv_cache(
+            new_k,
+            new_v,
+            k_cache,
+            v_cache,
+            page_table,
+            cu_new_lens,
+            cu_cached_lens,
+            qkv_format,
+            b,
+            max_ctx_len,
+            max_seq_len,
+            max_pages_per_seq,
+            is_non_paged
+        )
+    def convert_thd_to_bshd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        b: int,
+        max_seq_len: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.convert_bshd_to_thd(*args, **kwargs)
-
-    def fused_rope_forward(self, *args, **kwargs) -> Any:
-        assert args[2] is None, "[Iluvatar] fused_rope_forward does not support start_position now."
-        assert args[3].name == "NVTE_SBHD", f"[Iluvatar] fused_rope_forward expect NVTE_SBHD, but got {args[3].name}."
+        return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+    def convert_bshd_to_thd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        t: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_forward(args[0], args[1], False, False, 1.0)
+        return tex.convert_bshd_to_thd(tensor, cu_seqlens, t)
 
-    def fused_rope_backward(self, *args, **kwargs) -> Any:
-        assert args[2].name == "NVTE_SBHD", f"[Iluvatar] fused_rope_backward expect NVTE_SBHD, but got {args[2].name}."
+    # fused apply rope
+    def fused_rope_forward(
+        self,
+        input: torch.Tensor,
+        freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_backward(args[0], args[1], False, False, 1.0)
-
-    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_forward(
+            input, freqs, start_positions, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_rope_backward(
+        self,
+        output_grads: torch.Tensor,
+        freqs: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_forward(*args, **kwargs)
-
-    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_backward(
+            output_grads, freqs, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_qkv_rope_forward(
+        self,
+        qkv_input: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_forward(
+            qkv_input, q_freqs, k_freqs, start_positions,
+            qkv_split_arg_list, qkv_format, interleaved,
+            cp_size, cp_rank
+        )
+    def fused_qkv_rope_backward(
+        self,
+        q_grad_out: torch.Tensor,
+        k_grad_out: torch.Tensor,
+        v_grad_out: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_backward(*args, **kwargs)
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_backward(
+            q_grad_out, k_grad_out, v_grad_out,
+            q_freqs, k_freqs, qkv_split_arg_list,
+            qkv_format, interleaved, cp_size, cp_rank
+        )
 
+    # fused router
     def fused_topk_with_score_function_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        num_groups: int,
-        group_topk: int,
-        scaling_factor: float,
-        score_function: Any,
+        num_groups: Optional[int],
+        group_topk: Optional[int],
+        scaling_factor: Optional[float],
+        score_function: str,
         expert_bias: Optional[torch.Tensor],
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_fwd(
-            logits, topk, use_pre_softmax, num_groups, group_topk,
-            scaling_factor, score_function, expert_bias
+            logits,
+            topk,
+            use_pre_softmax,
+            num_groups,
+            group_topk,
+            scaling_factor,
+            score_function,
+            expert_bias,
         )
-
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -755,24 +978,33 @@ def fused_topk_with_score_function_bwd(
         grad_probs: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        scaling_factor: float,
-        score_function: Any,
-    ) -> Any:
+        scaling_factor: Optional[float],
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_bwd(
-            num_tokens, num_experts, routing_map, intermediate_output,
-            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+            num_tokens,
+            num_experts,
+            routing_map,
+            intermediate_output,
+            grad_probs,
+            topk,
+            use_pre_softmax,
+            scaling_factor,
+            score_function,
         )
-
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
-
+        return tex.fused_score_for_moe_aux_loss_fwd(
+            logits,
+            topk,
+            score_function,
+        )
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -780,13 +1012,17 @@ def fused_score_for_moe_aux_loss_bwd(
         intermediate_output: torch.Tensor,
         grad_scores: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_score_for_moe_aux_loss_bwd(
-            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+            num_tokens,
+            num_experts,
+            intermediate_output,
+            grad_scores,
+            topk,
+            score_function,
         )
-
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -797,13 +1033,18 @@ def fused_moe_aux_loss_fwd(
         num_cols: int,
         topk: int,
         coeff: float,
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_moe_aux_loss_fwd(
-            probs, tokens_per_expert, total_num_tokens, num_experts,
-            num_rows, num_cols, topk, coeff
+            probs,
+            tokens_per_expert,
+            total_num_tokens,
+            num_experts,
+            num_rows,
+            num_cols,
+            topk,
+            coeff,
         )
-
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -811,152 +1052,146 @@ def fused_moe_aux_loss_bwd(
         num_rows: int,
         num_cols: int,
         grad_aux_loss: torch.Tensor,
-    ) -> Any:
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(
-            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
-        )
+        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
 
+    # Dropout
     def dropout_fwd(
         self,
         input: torch.Tensor,
         dropout_probability: float,
-        out: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
-
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
         mask: torch.Tensor,
         dropout_probability: float,
-        grad_input: Optional[torch.Tensor] = None,
+        grad_input: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
 
-    def fp8_transpose(
-        self,
-        input: torch.Tensor,
-        dtype: Any,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_transpose(input, dtype, out=out)
-
-    def swap_first_dims(
-        self,
-        tensor: torch.Tensor,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.swap_first_dims(tensor, out=out)
-
-    def compute_amax(
-        self,
-        input: torch.Tensor,
-        amax: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.compute_amax(input, amax)
-
-    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
-        tex = self._get_tex()
-        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
-
-    def fp8_block_scaling_compute_partial_amax(
-        self,
-        tensor: torch.Tensor,
-        amax: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
-
-    def fp8_block_scaling_partial_cast(
-        self,
-        inp: torch.Tensor,
-        out: torch.Tensor,
-        scale: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-        out_dtype: Any,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
-
-    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_padding(*args, **kwargs)
-
-    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(*args, **kwargs)
-
+    # Misc
     def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cublasLt_version()
-
     def get_cudnn_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cudnn_version()
-
     def get_num_cublas_streams(self) -> int:
         tex = self._get_tex()
         return tex.get_num_cublas_streams()
 
-    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+    # Support THD format for Context Parallel
+    def thd_read_half_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        half_idx: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_half_tensor(*args, **kwargs)
-
-    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+    def thd_second_half_lse_correction(
+        self,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(*args, **kwargs)
-
-    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        return tex.thd_second_half_lse_correction(
+            lse, lse_per_step, cu_seqlens, lse_packed
+        )
+    def thd_read_second_half_lse(
+        self,
+        lse: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+        second_half_lse_seqlen: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(*args, **kwargs)
-
-    def thd_out_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_read_second_half_lse(
+            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
+        )
+    def thd_out_correction(
+        self,
+        out: torch.Tensor,
+        out_per_step: torch.Tensor,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        only_second_half: bool,
+        lse_packed: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_out_correction(*args, **kwargs)
-
-    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_out_correction(
+            out, out_per_step, lse, lse_per_step,
+            cu_seqlens, only_second_half, lse_packed
+        )
+    def thd_grad_correction(
+        self,
+        grad: torch.Tensor,
+        grad_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        first_half: str,
+        second_half: str,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(*args, **kwargs)
-
-    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        return tex.thd_grad_correction(
+            grad, grad_per_step, cu_seqlens,
+            first_half, second_half
+        )
+    def thd_get_partitioned_indices(
+        self,
+        cu_seqlens: torch.Tensor,
+        total_tokens: int,
+        world_size: int,
+        rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(*args, **kwargs)
+        return tex.thd_get_partitioned_indices(
+            cu_seqlens, total_tokens, world_size, rank
+        )
 
-    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+    # nvshmem functions
+    def init_nvshmem_backend(
+        self,
+        process_group: Any,
+    ) -> None:
         tex = self._get_tex()
-        tex.init_nvshmem_backend(*args, **kwargs)
-
-    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        return tex.init_nvshmem_backend(process_group)
+    def create_nvshmem_tensor(
+        self,
+        shape: List[int],
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.create_nvshmem_tensor(*args, **kwargs)
-
-    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        return tex.create_nvshmem_tensor(shape, dtype)
+    def nvshmem_send_on_current_stream(
+        self,
+        src: torch.Tensor,
+        dst: torch.Tensor,
+        peer: int,
+        signal: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.nvshmem_send_on_current_stream(*args, **kwargs)
-
-    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+    def nvshmem_wait_on_current_stream(
+        self,
+        signal: torch.Tensor,
+        wait_kind: str,
+    ) -> None:
         tex = self._get_tex()
-        tex.nvshmem_wait_on_current_stream(*args, **kwargs)
-
+        return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
     def nvshmem_finalize(self) -> None:
         tex = self._get_tex()
-        tex.nvshmem_finalize()
+        return tex.nvshmem_finalize()
 
+    # multi-tensor functions
     def multi_tensor_scale(
         self,
         chunk_size: int,
@@ -965,98 +1200,194 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
-
+        return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
-
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        scale: torch.Tensor,
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        inv_scale: torch.Tensor,
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
-
+        return tex.multi_tensor_unscale_l2norm(
+            chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
+        )
     def multi_tensor_adam(
         self,
-        chunk_size: int = None,
-        noop_flag: torch.Tensor = None,
-        tensor_lists: List[List[torch.Tensor]] = None,
-        lr: float = None,
-        beta1: float = None,
-        beta2: float = None,
-        eps: float = None,
-        step: int = None,
-        mode: int = None,
-        bias_correction: int = None,
-        weight_decay: float = None,
-    ):
-        tex = self._get_tex()
-        if chunk_size is None:
-            return tex.multi_tensor_adam
-        tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
-            eps, step, mode, bias_correction, weight_decay
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
         )
-
-    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+    def multi_tensor_adam_param_remainder(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
-
-    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_param_remainder(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
+        )
+    def multi_tensor_adam_fp8(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        fp8_dtype: DType,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_fp8(*args, **kwargs)
-
-    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.multi_tensor_adam_fp8(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            fp8_dtype
+        )
+    def multi_tensor_adam_capturable(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable(*args, **kwargs)
-
-    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_adam_capturable_master(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
-
-    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable_master(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_sgd(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        wd: float,
+        momentum: float,
+        dampening: float,
+        lr: float,
+        nesterov: bool,
+        first_run: bool,
+        wd_after_momentum: bool,
+        scale: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_sgd(*args, **kwargs)
-
-    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_sgd(
+            chunk_size, noop_flag, tensor_lists,
+            wd, momentum, dampening,
+            lr, nesterov, first_run,
+            wd_after_momentum, scale
+        )
+    def multi_tensor_compute_scale_and_scale_inv(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        max_fp8: float,
+        force_pow_2_scales: bool,
+        epsilon: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+        return tex.multi_tensor_compute_scale_and_scale_inv(
+            chunk_size, noop_flag, tensor_lists,
+            max_fp8, force_pow_2_scales, epsilon
+        )
 
+    # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
-        allgather_communicator: Any,
+        allgather_communicator: CommOverlap,
         send_stream: Any,
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
         return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
 
+############## class func #################################
+    def get_flash_attention_class(self):
+        raise NotImplementedError("get_flash_attention_class - not implemented in iluvatar backend")
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
-
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
         intra_node_group: Optional[Any] = None,
-    ) -> Any:
+    ) -> "CommOverlapHelper":
         tex = self._get_tex()
-        if world_group is None:
-            return tex.CommOverlapHelper()
         return tex.CommOverlapHelper(world_group, intra_node_group)
-
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1072,7 +1403,7 @@ def create_comm_overlap(
         set_sm_margin: bool = True,
         atomic_gemm: bool = False,
         rs_overlap_first_gemm: bool = False,
-    ) -> Any:
+    ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
             buffer_shape, buffer_dtype, helper, tp_size,
@@ -1080,7 +1411,6 @@ def create_comm_overlap(
             gemm_priority, comm_priority, num_comm_sm,
             set_sm_margin, atomic_gemm, rs_overlap_first_gemm
         )
-
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1097,13 +1427,10 @@ def create_comm_overlap_p2p(
         atomic_gemm: bool = False,
         use_ce: bool = True,
         aggregate: bool = False,
-    ) -> Any:
+    ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
             buffer_shape, buffer_dtype, helper, tp_size, comm_type,
             num_max_streams, comm_cga_size, gemm_priority, comm_priority,
             num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
         )
-
-    
-    
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
index 6066a53892..9d9bb164fa 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
@@ -5,10 +5,8 @@
 import os
 import subprocess
 from typing import Any, Dict, List, Optional, Tuple, Union
-
 import torch
-
-from transformer_engine.plugin.core.ops import TEFLBackendBase, FP8TensorMeta, NVTE_Fused_Attn_Backend
+from ....ops import *
 
 _kunlunxin_available = False
 
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index 8efbbc9490..6b33369c75 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -14,7 +14,7 @@
 
 import torch
 
-from ....ops import TEFLBackendBase, FP8TensorMeta
+from ....ops import *
 
 def _load_metax_libs():
 
@@ -74,67 +74,6 @@ def _get_tex():
     import transformer_engine_torch_metax
     return transformer_engine_torch_metax
 
-def _torch_dtype_to_te_dtype(torch_dtype, tex_module):
-    if torch_dtype is None:
-        return None
-
-    NativeDType = tex_module.DType
-    if type(torch_dtype).__name__ == 'DType' and type(torch_dtype).__module__ == 'transformer_engine_torch_metax':
-        return torch_dtype
-
-    if hasattr(torch_dtype, 'name') and hasattr(torch_dtype, 'value'):
-        from transformer_engine.plugin.core.ops import DType as PyDType
-        if isinstance(torch_dtype, PyDType):
-            dtype_name = torch_dtype.name
-            if hasattr(NativeDType, dtype_name):
-                return getattr(NativeDType, dtype_name)
-
-    dtype_map = {
-        torch.float32: NativeDType.kFloat32,
-        torch.float16: NativeDType.kFloat16,
-        torch.bfloat16: NativeDType.kBFloat16,
-        torch.int32: NativeDType.kInt32,
-        torch.uint8: NativeDType.kByte,
-    }
-
-    if hasattr(torch, 'float8_e4m3fn'):
-        dtype_map[torch.float8_e4m3fn] = NativeDType.kFloat8E4M3
-    if hasattr(torch, 'float8_e5m2'):
-        dtype_map[torch.float8_e5m2] = NativeDType.kFloat8E5M2
-
-    return dtype_map.get(torch_dtype, torch_dtype)
-
-def _convert_dtype_params(func):
-
-    @functools.wraps(func)
-    def wrapper(self, *args, **kwargs):
-        dtype_params = ['otype', 'output_dtype', 'bias_type']
-
-        from transformer_engine.plugin.core.ops import DType as PyDType
-
-        def needs_conversion(val):
-            return isinstance(val, torch.dtype) or isinstance(val, PyDType)
-
-        for param_name in dtype_params:
-            if param_name in kwargs:
-                value = kwargs[param_name]
-                if needs_conversion(value):
-                    converted = self._to_te_dtype(value)
-                    kwargs[param_name] = converted
-
-        sig = inspect.signature(func)
-        param_names = list(sig.parameters.keys())[1:]
-
-        args_list = list(args)
-        for i, (param_name, arg_value) in enumerate(zip(param_names, args_list)):
-            if param_name in dtype_params and needs_conversion(arg_value):
-                converted = self._to_te_dtype(arg_value)
-                args_list[i] = converted
-
-        return func(self, *args_list, **kwargs)
-
-    return wrapper
-
 class MetaxBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -148,16 +87,9 @@ def _get_tex(self):
             self._tex = _get_tex()
         return self._tex
 
-    def _to_te_dtype(self, torch_dtype):
-        return _torch_dtype_to_te_dtype(torch_dtype, self._get_tex())
-
     def is_available(self) -> bool:
         return _check_metax_available()
 
-    def get_flash_attention_class(self):
-        from .flash_attention import FlashAttentionMETAX
-        return FlashAttentionMETAX
-
     def get_attention_backend(self, attention_params=None):
         # Import the metax get_attention_backend function
         try:
@@ -175,6 +107,7 @@ def get_attention_backend(self, attention_params=None):
                 f"Attention_params: {self.attention_params}"
             )
 
+##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -185,35 +118,34 @@ def quantize(
         tex = self._get_tex()
         return tex.quantize(tensor, quantizer, output, noop)
 
-    @_convert_dtype_params
     def dequantize(
         self,
-        input: torch.Tensor,
-        otype: torch.dtype,
-    ) -> torch.Tensor:
+        input: Any,
+        otype: DType,
+    ) -> Any:
         tex = self._get_tex()
+        otype = tex.DType(int(otype)) if otype is not None else None
         return tex.dequantize(input, otype)
 
     def bgrad_quantize(
         self,
         input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.bgrad_quantize(input, quantizer)
 
-    @_convert_dtype_params
     def generic_gemm(
         self,
-        A: torch.Tensor,
+        A: Any,
         transA: bool,
-        B: torch.Tensor,
+        B: Any,
         transB: bool,
-        D: torch.Tensor,
+        D: Any,
         quantizer: Any,
-        output_dtype: torch.dtype,
+        output_dtype: Optional[DType],
         bias: Optional[torch.Tensor],
-        bias_type: Any,
+        bias_type: DType,
         gelu: bool,
         gelu_in: Optional[torch.Tensor],
         grad: bool,
@@ -222,61 +154,53 @@ def generic_gemm(
         accumulate: bool,
         use_split_accumulator: bool,
         comm_overlap: Optional[Any] = None,
-        comm_type: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
         extra_output: Optional[torch.Tensor] = None,
         bulk_overlap: bool = False,
         alpha: float = 1.0,
         beta: Optional[float] = None,
-    ) -> Any:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        if bias_type is None:
-            bias_type = self._to_te_dtype(torch.bfloat16)
-
+        
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
+        output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
             A, transA, B, transB, D, quantizer, output_dtype,
             bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
             accumulate, use_split_accumulator, comm_overlap, comm_type,
             extra_output, bulk_overlap, alpha, beta
         )
-
-    def te_general_grouped_gemm(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.te_general_grouped_gemm(*args, **kwargs)
-
+    # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
-
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
-
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
+    # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
-
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
-
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
-
+    # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
-
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
@@ -289,42 +213,39 @@ def clamped_swiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
-
+    # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
-
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
-
+    # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
-
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
-
+    # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
-
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -335,131 +256,207 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
-
-    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    # DBias + DAct fusions #
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
-
-    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Tuple[torch.Tensor, Any]:
+    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-
-    @_convert_dtype_params
+    # Permutation functions                                                                                                                                                                                                                                                                                             
+    def moe_permute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        indices: torch.Tensor,
+        num_out_tokens: int,
+        workspace: List[torch.Tensor],
+        max_expanded_token_num: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+    def moe_permute_bwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+    def moe_unpermute_bwd(
+        self,
+        input_bwd: torch.Tensor,
+        input_fwd: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+    # Softmax functions
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+    def scaled_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grads_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(
+            output_grads_, softmax_results_, scale_factor
+        )
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(
+            output_grad_, softmax_results_, scale_factor
+        )
+    # Other granular functions
     def layernorm_fwd(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
         bias: Optional[torch.Tensor],
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, mu, rsigma = tex.layernorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-        return y, mu, rsigma
-
     def layernorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         mu: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dgamma, dbeta = tex.layernorm_bwd(dy, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dgamma, dbeta
-
-    @_convert_dtype_params
+        return tex.layernorm_bwd(
+            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
+        )
     def rmsnorm_fwd(
         self,
-        input: torch.Tensor,
-        weight: torch.Tensor,
+        input: Any,
+        weight: Any,
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = input.shape
-        if input.ndim > 2:
-            input = input.view(-1, input.shape[-1])
-
-        y, y_quant, rsigma = tex.rmsnorm_fwd(
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
-
-        if len(orig_shape) > 2:
-            y = y.view(*orig_shape)
-            if y_quant is not None:
-                y_quant = y_quant.view(*orig_shape)
-        return y, y_quant, rsigma
-
     def rmsnorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-        eps: float = 1e-5,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-
-        orig_shape = dy.shape
-        if dy.ndim > 2:
-            dy = dy.view(-1, dy.shape[-1])
-            x = x.view(-1, x.shape[-1])
-
-        dx, dw = tex.rmsnorm_bwd(dy, x, rsigma, gamma, sm_margin, zero_centered_gamma)
-
-        if len(orig_shape) > 2:
-            dx = dx.view(*orig_shape)
-        return dx, dw
-
-    def rmsnorm_bwd_add(self, *args, **kwargs) -> Any:
+        return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+    def rmsnorm_bwd_add(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        add: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.rmsnorm_bwd_add(*args, **kwargs)
+        return tex.rmsnorm_bwd_add(dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma)
 
     def multi_tensor_quantize(
         self,
@@ -468,7 +465,6 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
-
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -477,246 +473,457 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
-
-    def moe_permute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_permute_fwd(*args, **kwargs)
-
-    def moe_permute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_permute_bwd(*args, **kwargs)
-
-    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_unpermute_fwd(*args, **kwargs)
-
-    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.moe_unpermute_bwd(*args, **kwargs)
-
-    def scaled_softmax_forward(self, input: torch.Tensor, scale: float) -> torch.Tensor:
-        tex = self._get_tex()
-        return tex.scaled_softmax_forward(input, scale)
-
-    def scaled_softmax_backward(
+    def te_general_grouped_gemm(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
-        tex = self._get_tex()
-        return tex.scaled_softmax_backward(output_grad, softmax_output, scale)
-
-    def scaled_masked_softmax_forward(
+        A: List[Any],
+        transa: bool,
+        B: List[Any],
+        transb: bool,
+        D: Optional[List[torch.Tensor]],
+        D_type: DType,
+        m_splits: List[int],
+        bias: List[torch.Tensor],
+        bias_type: DType,
+        single_output: bool,
+        pre_gelu_out: List[torch.Tensor],
+        grad: bool,
+        workspace: List[torch.Tensor],
+        workspaceSizes: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        math_sm_count: int,
+    ) -> Optional[List[torch.Tensor]]:
+        tex = self._get_tex()
+        D_type = tex.DType(int(D_type)) if D_type is not None else None
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        return tex.te_general_grouped_gemm(
+            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
+            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
+            accumulate, use_split_accumulator, math_sm_count
+        )
+    def fp8_transpose(
         self,
         input: torch.Tensor,
-        mask: torch.Tensor,
-        scale: float,
+        dtype: DType,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_forward(input, mask, scale)
-
-    def scaled_masked_softmax_backward(
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.fp8_transpose(input, dtype, out)
+    def swap_first_dims(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
+        tensor: torch.Tensor,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.scaled_masked_softmax_backward(output_grad, softmax_output, scale)
+        return tex.swap_first_dims(tensor, out)
+    def get_fused_attn_backend(
+        self,
+        is_training: bool,
+        q_dtype: DType,
+        kv_dtype: DType,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        p_dropout: float,
+        num_attn_heads: int,
+        num_gqa_groups: int,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        head_dim_qk: int,
+        head_dim_v: int,
+        window_size_left: int,
+        window_size_right: int,
+        return_max_logit: bool,
+    ) -> NVTE_Fused_Attn_Backend:
+        tex = self._get_tex()
+
+        q_dtype = tex.DType(int(q_dtype)) if q_dtype is not None else None
+        kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        result = tex.get_fused_attn_backend(
+            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
+            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
+            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
+            head_dim_v, window_size_left, window_size_right, return_max_logit
+        )
+        return NVTE_Fused_Attn_Backend(result)
 
-    def scaled_upper_triang_masked_softmax_forward(
+    def compute_amax(
         self,
         input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_forward(input, scale)
-
-    def scaled_upper_triang_masked_softmax_backward(
+        return tex.compute_amax(input, amax)
+    def fused_amax_and_scale_update_after_reduction(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax_reduction_buffer: torch.Tensor,
+        amax_histories: List[torch.Tensor],
+        scales: List[torch.Tensor],
+        amax_compute_algo: str,
+        fp8_dtype: DType,
+        margin: float,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_upper_triang_masked_softmax_backward(output_grad, softmax_output, scale)
-
-    def scaled_aligned_causal_masked_softmax_forward(
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.fused_amax_and_scale_update_after_reduction(
+            amax_reduction_buffer, amax_histories, scales,
+            amax_compute_algo, fp8_dtype, margin
+        )
+    def fp8_block_scaling_compute_partial_amax(
         self,
-        input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale)
-
-    def scaled_aligned_causal_masked_softmax_backward(
+        return tex.fp8_block_scaling_compute_partial_amax(
+            tensor, amax, h, w, start_offset, block_len
+        )
+    def fp8_block_scaling_partial_cast(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
         tex = self._get_tex()
-        return tex.scaled_aligned_causal_masked_softmax_backward(output_grad, softmax_output, scale)
-
-    def get_fused_attn_backend(self, *args, **kwargs) -> int:
+        out_dtype = tex.DType(int(out_dtype)) if out_dtype is not None else None
+        return tex.fp8_block_scaling_partial_cast(
+            inp, out, scale, h, w, start_offset, block_len, out_dtype
+        )
+    def fused_multi_row_padding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        padded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
-
-        args_list = list(args)
-
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-
-            if type(py_enum).__module__ == 'transformer_engine_torch_metax':
-                return py_enum
-
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-
-            if hasattr(py_enum, 'value'):
-                enum_value = int(py_enum.value)
-                for member_name in dir(native_enum_class):
-                    if not member_name.startswith('_'):
-                        try:
-                            member = getattr(native_enum_class, member_name)
-                            if hasattr(member, 'value') and int(member.value) == enum_value:
-                                return member
-                        except:
-                            pass
-
-            if hasattr(py_enum, 'value'):
-                return int(py_enum.value)
-
-            return py_enum
-
-        if len(args) > 1:
-            args_list[1] = self._to_te_dtype(args[1])
-        if len(args) > 2:
-            args_list[2] = self._to_te_dtype(args[2])
-        if len(args) > 3:
-            args_list[3] = convert_enum(args[3], tex.NVTE_QKV_Layout)
-        if len(args) > 4:
-            args_list[4] = convert_enum(args[4], tex.NVTE_Bias_Type)
-        if len(args) > 5:
-            args_list[5] = convert_enum(args[5], tex.NVTE_Mask_Type)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_Softmax_Type)
-
-        return tex.get_fused_attn_backend(*args_list, **kwargs)
-
-    def fused_attn_fwd(self, *args, **kwargs) -> Any:
+        return tex.fused_multi_row_padding(
+            input, output, input_row_list, padded_input_row_list
+        )
+    def fused_multi_row_unpadding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        unpadded_input_row_list: List[int],
+    ) -> None:
         tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(
+            input, output, input_row_list, unpadded_input_row_list
+        )
 
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-            if type(py_enum).__module__ == 'transformer_engine_torch_metax':
-                return py_enum
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-            return py_enum
-
-        args_list = list(args)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_QKV_Layout)
-        if len(args) > 7:
-            args_list[7] = convert_enum(args[7], tex.NVTE_Bias_Type)
-        if len(args) > 8:
-            args_list[8] = convert_enum(args[8], tex.NVTE_Mask_Type)
-        if len(args) > 9:
-            args_list[9] = convert_enum(args[9], tex.NVTE_Softmax_Type)
-
-        return tex.fused_attn_fwd(*args_list, **kwargs)
-
-    def fused_attn_bwd(self, *args, **kwargs) -> Any:
+    # attention kernels
+    def fa_prepare_fwd(
+        self,
+        qkvi: torch.Tensor,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-
-        def convert_enum(py_enum, native_enum_class):
-            if py_enum is None:
-                return None
-            if type(py_enum).__module__ == 'transformer_engine_torch_metax':
-                return py_enum
-            if hasattr(py_enum, 'name'):
-                enum_name = py_enum.name
-                if hasattr(native_enum_class, enum_name):
-                    return getattr(native_enum_class, enum_name)
-            return py_enum
-
-        args_list = list(args)
-        if len(args) > 5:
-            args_list[5] = convert_enum(args[5], tex.NVTE_QKV_Layout)
-        if len(args) > 6:
-            args_list[6] = convert_enum(args[6], tex.NVTE_Bias_Type)
-        if len(args) > 7:
-            args_list[7] = convert_enum(args[7], tex.NVTE_Mask_Type)
-        if len(args) > 8:
-            args_list[8] = convert_enum(args[8], tex.NVTE_Softmax_Type)
-        if len(args) > 19:
-            args_list[19] = self._to_te_dtype(args[19])
-
-        if 'dqkv_dtype' in kwargs:
-            kwargs['dqkv_dtype'] = self._to_te_dtype(kwargs['dqkv_dtype'])
-
-        return tex.fused_attn_bwd(*args_list, **kwargs)
-
-    def fa_prepare_fwd(self, *args, **kwargs) -> Any:
+        return tex.fa_prepare_fwd(qkvi)
+    def fa_prepare_bwd(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fa_prepare_fwd(*args, **kwargs)
-
-    def fa_prepare_bwd(self, *args, **kwargs) -> Any:
+        return tex.fa_prepare_bwd(q, k, v)
+    def fused_attn_fwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        is_training: bool,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        fake_dtype: torch.dtype,
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        page_table_k: Optional[torch.Tensor],
+        page_table_v: Optional[torch.Tensor],
+        s_quantizer: Any,
+        o_quantizer: Any,
+        Bias: Optional[torch.Tensor],
+        SoftmaxOffset: Optional[torch.Tensor],
+        rng_gen: Optional[torch.Generator],
+        rng_elts_per_thread: int,
+        return_max_logit: bool,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.fa_prepare_bwd(*args, **kwargs)
 
-    def copy_to_kv_cache(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+
+        return tex.fused_attn_fwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            is_training,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            fake_dtype,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            page_table_k,
+            page_table_v,
+            s_quantizer,
+            o_quantizer,
+            Bias,
+            SoftmaxOffset,
+            rng_gen,
+            rng_elts_per_thread,
+            return_max_logit
+        )
+    def fused_attn_bwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        deterministic: bool,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        O: Any,
+        dO: Any,
+        fake_dtype: torch.dtype,
+        dqkv_type: DType,
+        Aux_CTX_Tensors: List[torch.Tensor],
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        s_quantizer: Any,
+        dp_quantizer: Any,
+        dqkv_quantizer: Any,
+    ) -> List[Any]:
         tex = self._get_tex()
-        return tex.copy_to_kv_cache(*args, **kwargs)
 
-    def convert_thd_to_bshd(self, *args, **kwargs) -> Any:
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
+
+        return tex.fused_attn_bwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            deterministic,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            O,
+            dO,
+            fake_dtype,
+            dqkv_type,
+            Aux_CTX_Tensors,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            s_quantizer,
+            dp_quantizer,
+            dqkv_quantizer
+        )
+    def copy_to_kv_cache(
+        self,
+        new_k: torch.Tensor,
+        new_v: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        page_table: torch.Tensor,
+        cu_new_lens: torch.Tensor,
+        cu_cached_lens: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        b: int,
+        max_ctx_len: int,
+        max_seq_len: int,
+        max_pages_per_seq: int,
+        is_non_paged: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.convert_thd_to_bshd(*args, **kwargs)
-
-    def convert_bshd_to_thd(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.copy_to_kv_cache(
+            new_k,
+            new_v,
+            k_cache,
+            v_cache,
+            page_table,
+            cu_new_lens,
+            cu_cached_lens,
+            qkv_format,
+            b,
+            max_ctx_len,
+            max_seq_len,
+            max_pages_per_seq,
+            is_non_paged
+        )
+    def convert_thd_to_bshd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        b: int,
+        max_seq_len: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.convert_bshd_to_thd(*args, **kwargs)
-
-    def fused_rope_forward(self, *args, **kwargs) -> Any:
+        return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+    def convert_bshd_to_thd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        t: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_forward(*args, **kwargs)
+        return tex.convert_bshd_to_thd(tensor, cu_seqlens, t)
 
-    def fused_rope_backward(self, *args, **kwargs) -> Any:
+    # fused apply rope
+    def fused_rope_forward(
+        self,
+        input: torch.Tensor,
+        freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_rope_backward(*args, **kwargs)
-
-    def fused_qkv_rope_forward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_forward(
+            input, freqs, start_positions, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_rope_backward(
+        self,
+        output_grads: torch.Tensor,
+        freqs: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_forward(*args, **kwargs)
-
-    def fused_qkv_rope_backward(self, *args, **kwargs) -> Any:
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_backward(
+            output_grads, freqs, qkv_format,
+            interleaved, cu_seqlens, cp_size, cp_rank
+        )
+    def fused_qkv_rope_forward(
+        self,
+        qkv_input: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_forward(
+            qkv_input, q_freqs, k_freqs, start_positions,
+            qkv_split_arg_list, qkv_format, interleaved,
+            cp_size, cp_rank
+        )
+    def fused_qkv_rope_backward(
+        self,
+        q_grad_out: torch.Tensor,
+        k_grad_out: torch.Tensor,
+        v_grad_out: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_qkv_rope_backward(*args, **kwargs)
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_backward(
+            q_grad_out, k_grad_out, v_grad_out,
+            q_freqs, k_freqs, qkv_split_arg_list,
+            qkv_format, interleaved, cp_size, cp_rank
+        )
 
+    # fused router
     def fused_topk_with_score_function_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        num_groups: int,
-        group_topk: int,
-        scaling_factor: float,
-        score_function: Any,
+        num_groups: Optional[int],
+        group_topk: Optional[int],
+        scaling_factor: Optional[float],
+        score_function: str,
         expert_bias: Optional[torch.Tensor],
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_fwd(
-            logits, topk, use_pre_softmax, num_groups, group_topk,
-            scaling_factor, score_function, expert_bias
+            logits,
+            topk,
+            use_pre_softmax,
+            num_groups,
+            group_topk,
+            scaling_factor,
+            score_function,
+            expert_bias,
         )
-
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -726,24 +933,33 @@ def fused_topk_with_score_function_bwd(
         grad_probs: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        scaling_factor: float,
-        score_function: Any,
-    ) -> Any:
+        scaling_factor: Optional[float],
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_topk_with_score_function_bwd(
-            num_tokens, num_experts, routing_map, intermediate_output,
-            grad_probs, topk, use_pre_softmax, scaling_factor, score_function
+            num_tokens,
+            num_experts,
+            routing_map,
+            intermediate_output,
+            grad_probs,
+            topk,
+            use_pre_softmax,
+            scaling_factor,
+            score_function,
         )
-
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.fused_score_for_moe_aux_loss_fwd(logits, topk, score_function)
-
+        return tex.fused_score_for_moe_aux_loss_fwd(
+            logits,
+            topk,
+            score_function,
+        )
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -751,13 +967,17 @@ def fused_score_for_moe_aux_loss_bwd(
         intermediate_output: torch.Tensor,
         grad_scores: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fused_score_for_moe_aux_loss_bwd(
-            num_tokens, num_experts, intermediate_output, grad_scores, topk, score_function
+            num_tokens,
+            num_experts,
+            intermediate_output,
+            grad_scores,
+            topk,
+            score_function,
         )
-
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -768,13 +988,18 @@ def fused_moe_aux_loss_fwd(
         num_cols: int,
         topk: int,
         coeff: float,
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.fused_moe_aux_loss_fwd(
-            probs, tokens_per_expert, total_num_tokens, num_experts,
-            num_rows, num_cols, topk, coeff
+            probs,
+            tokens_per_expert,
+            total_num_tokens,
+            num_experts,
+            num_rows,
+            num_cols,
+            topk,
+            coeff,
         )
-
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -782,152 +1007,146 @@ def fused_moe_aux_loss_bwd(
         num_rows: int,
         num_cols: int,
         grad_aux_loss: torch.Tensor,
-    ) -> Any:
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(
-            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
-        )
+        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
 
+    # Dropout
     def dropout_fwd(
         self,
         input: torch.Tensor,
         dropout_probability: float,
-        out: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
-
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
         mask: torch.Tensor,
         dropout_probability: float,
-        grad_input: Optional[torch.Tensor] = None,
+        grad_input: Optional[torch.Tensor],
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
 
-    def fp8_transpose(
-        self,
-        input: torch.Tensor,
-        dtype: Any,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_transpose(input, dtype, out=out)
-
-    def swap_first_dims(
-        self,
-        tensor: torch.Tensor,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.swap_first_dims(tensor, out=out)
-
-    def compute_amax(
-        self,
-        input: torch.Tensor,
-        amax: torch.Tensor,
-    ) -> None:
-        tex = self._get_tex()
-        tex.compute_amax(input, amax)
-
-    def fused_amax_and_scale_update_after_reduction(self, *args, **kwargs) -> None:
-        tex = self._get_tex()
-        tex.fused_amax_and_scale_update_after_reduction(*args, **kwargs)
-
-    def fp8_block_scaling_compute_partial_amax(
-        self,
-        tensor: torch.Tensor,
-        amax: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_block_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
-
-    def fp8_block_scaling_partial_cast(
-        self,
-        inp: torch.Tensor,
-        out: torch.Tensor,
-        scale: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-        out_dtype: Any,
-    ) -> None:
-        tex = self._get_tex()
-        tex.fp8_block_scaling_partial_cast(inp, out, scale, h, w, start_offset, block_len, out_dtype)
-
-    def fused_multi_row_padding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_padding(*args, **kwargs)
-
-    def fused_multi_row_unpadding(self, *args, **kwargs) -> Any:
-        tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(*args, **kwargs)
-
+    # Misc
     def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cublasLt_version()
-
     def get_cudnn_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cudnn_version()
-
     def get_num_cublas_streams(self) -> int:
         tex = self._get_tex()
         return tex.get_num_cublas_streams()
 
-    def thd_read_half_tensor(self, *args, **kwargs) -> Any:
+    # Support THD format for Context Parallel
+    def thd_read_half_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        half_idx: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_half_tensor(*args, **kwargs)
-
-    def thd_second_half_lse_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+    def thd_second_half_lse_correction(
+        self,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(*args, **kwargs)
-
-    def thd_read_second_half_lse(self, *args, **kwargs) -> Any:
+        return tex.thd_second_half_lse_correction(
+            lse, lse_per_step, cu_seqlens, lse_packed
+        )
+    def thd_read_second_half_lse(
+        self,
+        lse: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+        second_half_lse_seqlen: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(*args, **kwargs)
-
-    def thd_out_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_read_second_half_lse(
+            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
+        )
+    def thd_out_correction(
+        self,
+        out: torch.Tensor,
+        out_per_step: torch.Tensor,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        only_second_half: bool,
+        lse_packed: bool,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_out_correction(*args, **kwargs)
-
-    def thd_grad_correction(self, *args, **kwargs) -> Any:
+        return tex.thd_out_correction(
+            out, out_per_step, lse, lse_per_step,
+            cu_seqlens, only_second_half, lse_packed
+        )
+    def thd_grad_correction(
+        self,
+        grad: torch.Tensor,
+        grad_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        first_half: str,
+        second_half: str,
+    ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(*args, **kwargs)
-
-    def thd_get_partitioned_indices(self, *args, **kwargs) -> Any:
+        return tex.thd_grad_correction(
+            grad, grad_per_step, cu_seqlens,
+            first_half, second_half
+        )
+    def thd_get_partitioned_indices(
+        self,
+        cu_seqlens: torch.Tensor,
+        total_tokens: int,
+        world_size: int,
+        rank: int,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(*args, **kwargs)
+        return tex.thd_get_partitioned_indices(
+            cu_seqlens, total_tokens, world_size, rank
+        )
 
-    def init_nvshmem_backend(self, *args, **kwargs) -> None:
+    # nvshmem functions
+    def init_nvshmem_backend(
+        self,
+        process_group: Any,
+    ) -> None:
         tex = self._get_tex()
-        tex.init_nvshmem_backend(*args, **kwargs)
-
-    def create_nvshmem_tensor(self, *args, **kwargs) -> torch.Tensor:
+        return tex.init_nvshmem_backend(process_group)
+    def create_nvshmem_tensor(
+        self,
+        shape: List[int],
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.create_nvshmem_tensor(*args, **kwargs)
-
-    def nvshmem_send_on_current_stream(self, *args, **kwargs) -> None:
+        return tex.create_nvshmem_tensor(shape, dtype)
+    def nvshmem_send_on_current_stream(
+        self,
+        src: torch.Tensor,
+        dst: torch.Tensor,
+        peer: int,
+        signal: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.nvshmem_send_on_current_stream(*args, **kwargs)
-
-    def nvshmem_wait_on_current_stream(self, *args, **kwargs) -> None:
+        return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+    def nvshmem_wait_on_current_stream(
+        self,
+        signal: torch.Tensor,
+        wait_kind: str,
+    ) -> None:
         tex = self._get_tex()
-        tex.nvshmem_wait_on_current_stream(*args, **kwargs)
-
+        return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
     def nvshmem_finalize(self) -> None:
         tex = self._get_tex()
-        tex.nvshmem_finalize()
+        return tex.nvshmem_finalize()
 
+    # multi-tensor functions
     def multi_tensor_scale(
         self,
         chunk_size: int,
@@ -936,98 +1155,195 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
-
+        return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
-
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        scale: torch.Tensor,
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        inv_scale: torch.Tensor,
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
-        return tex.multi_tensor_unscale_l2norm(chunk_size, noop_flag, tensor_lists, scale, per_tensor)
-
+        return tex.multi_tensor_unscale_l2norm(
+            chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
+        )
     def multi_tensor_adam(
         self,
-        chunk_size: int = None,
-        noop_flag: torch.Tensor = None,
-        tensor_lists: List[List[torch.Tensor]] = None,
-        lr: float = None,
-        beta1: float = None,
-        beta2: float = None,
-        eps: float = None,
-        step: int = None,
-        mode: int = None,
-        bias_correction: int = None,
-        weight_decay: float = None,
-    ):
-        tex = self._get_tex()
-        if chunk_size is None:
-            return tex.multi_tensor_adam
-        tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
-            eps, step, mode, bias_correction, weight_decay
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_adam(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
         )
-
-    def multi_tensor_adam_param_remainder(self, *args, **kwargs) -> None:
+    def multi_tensor_adam_param_remainder(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_param_remainder(*args, **kwargs)
-
-    def multi_tensor_adam_fp8(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_param_remainder(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay
+        )
+    def multi_tensor_adam_fp8(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        fp8_dtype: DType,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_fp8(*args, **kwargs)
-
-    def multi_tensor_adam_capturable(self, *args, **kwargs) -> None:
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.multi_tensor_adam_fp8(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            fp8_dtype
+        )
+    def multi_tensor_adam_capturable(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable(*args, **kwargs)
-
-    def multi_tensor_adam_capturable_master(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_adam_capturable_master(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_adam_capturable_master(*args, **kwargs)
-
-    def multi_tensor_sgd(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_adam_capturable_master(
+            chunk_size, noop_flag, tensor_lists,
+            lr, beta1, beta2, epsilon,
+            step, mode, bias_correction, weight_decay,
+            inv_scale
+        )
+    def multi_tensor_sgd(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        wd: float,
+        momentum: float,
+        dampening: float,
+        lr: float,
+        nesterov: bool,
+        first_run: bool,
+        wd_after_momentum: bool,
+        scale: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_sgd(*args, **kwargs)
-
-    def multi_tensor_compute_scale_and_scale_inv(self, *args, **kwargs) -> None:
+        return tex.multi_tensor_sgd(
+            chunk_size, noop_flag, tensor_lists,
+            wd, momentum, dampening,
+            lr, nesterov, first_run,
+            wd_after_momentum, scale
+        )
+    def multi_tensor_compute_scale_and_scale_inv(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        max_fp8: float,
+        force_pow_2_scales: bool,
+        epsilon: float,
+    ) -> None:
         tex = self._get_tex()
-        tex.multi_tensor_compute_scale_and_scale_inv(*args, **kwargs)
+        return tex.multi_tensor_compute_scale_and_scale_inv(
+            chunk_size, noop_flag, tensor_lists,
+            max_fp8, force_pow_2_scales, epsilon
+        )
 
+    # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
-        allgather_communicator: Any,
+        allgather_communicator: CommOverlap,
         send_stream: Any,
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
         return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
 
+############## class func #################################
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionMETAX
+        return FlashAttentionMETAX
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
-
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
         intra_node_group: Optional[Any] = None,
-    ) -> Any:
+    ) -> "CommOverlapHelper":
         tex = self._get_tex()
-        if world_group is None:
-            return tex.CommOverlapHelper()
         return tex.CommOverlapHelper(world_group, intra_node_group)
-
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1043,7 +1359,7 @@ def create_comm_overlap(
         set_sm_margin: bool = True,
         atomic_gemm: bool = False,
         rs_overlap_first_gemm: bool = False,
-    ) -> Any:
+    ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
             buffer_shape, buffer_dtype, helper, tp_size,
@@ -1051,7 +1367,6 @@ def create_comm_overlap(
             gemm_priority, comm_priority, num_comm_sm,
             set_sm_margin, atomic_gemm, rs_overlap_first_gemm
         )
-
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1068,7 +1383,7 @@ def create_comm_overlap_p2p(
         atomic_gemm: bool = False,
         use_ce: bool = True,
         aggregate: bool = False,
-    ) -> Any:
+    ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
             buffer_shape, buffer_dtype, helper, tp_size, comm_type,
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 988829b98c..74357394e8 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -11,6 +11,7 @@
 from .logger_manager import get_logger
 logger = get_logger()
 
+################### Enums ###################
 class DType(IntEnum):
     kByte = 0
     kInt16 = 1
@@ -141,94 +142,260 @@ class CommOverlapAlgo(IntEnum):
     ATOMIC_GEMM_RS_P2P = 7
     EXTERNAL_BULK_OVERLAP_AG = 8
 
-class FP8TensorMeta:
-    def __init__(self):
-        self.scale: Optional[torch.Tensor] = None
-        self.scale_inv: Optional[torch.Tensor] = None
-        self.amax_history: Optional[torch.Tensor] = None
-
-class CommGemmOverlapAlgoConfig:
-    def __init__(self, *args, **kwargs):
-        pass
-
-class FusedAdamCUDAKernel:
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError(
-            "FusedAdamCUDAKernel requires CUDA extensions. "
-            "Not supported in FL mode."
-        )
+############ Class #################
 
-class FusedSGDCUDAKernel:
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError(
-            "FusedSGDCUDAKernel requires CUDA extensions. "
-            "Not supported in FL mode."
-        )
+class FP8TensorMeta:
+    """
+    FP8TensorMeta wrapper that routes to the appropriate backend implementation.
+    """
+    def __new__(cls, *args, **kwargs):
+        from .manager import get_default_manager
+        return get_default_manager().call("create_fp8_tensor_meta", *args, **kwargs)
 
 class CommOverlapHelper:
-    def __init__(self, world_group=None, intra_node_group=None):
-        self.world_group = world_group
-        self.intra_node_group = intra_node_group
+    """
+    CommOverlapHelper wrapper that routes to the appropriate backend implementation.
+    """
+    def __new__(cls, *args, **kwargs):
+        from .manager import get_default_manager
+        return get_default_manager().call("create_comm_overlap_helper", *args, **kwargs)
 
 class CommOverlap:
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError(
-            "CommOverlap should be created via backend.create_comm_overlap(). "
-            "Direct instantiation is not supported in FL mode."
-        )
+    """
+    CommOverlap wrapper that routes to the appropriate backend implementation.
+    """
+    def __new__(cls, *args, **kwargs):
+        from .manager import get_default_manager
+        return get_default_manager().call("create_comm_overlap", *args, **kwargs)
 
 class CommOverlapP2P:
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError(
-            "CommOverlapP2P should be created via backend.create_comm_overlap_p2p(). "
-            "Direct instantiation is not supported in FL mode."
+    """
+    CommOverlapP2P wrapper that routes to the appropriate backend implementation.
+    """
+    def __new__(cls, *args, **kwargs):
+        from .manager import get_default_manager
+        return get_default_manager().call("create_comm_overlap_p2p", *args, **kwargs)
+
+class FlashAttentionBase(torch.nn.Module, ABC):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.softmax_scale = softmax_scale
+        self.attention_dropout = attention_dropout
+        self.attention_dropout_ctx = attention_dropout_ctx or nullcontext
+        self.attention_type = attention_type
+        self.layer_number = 1 if layer_number is None else layer_number
+        self.deterministic = deterministic
+
+        # For fallback support
+        self._manager = None
+        self._init_params = None
+
+    @abstractmethod
+    def _forward_impl(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        """
+        Actual forward implementation - subclasses must implement this.
+
+        This method contains the backend-specific logic for flash attention.
+        """
+        raise NotImplementedError("Subclasses must implement _forward_impl()")
+
+    def forward(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.cuda.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        """
+        Forward pass with automatic fallback support and caching.
+        Delegates to OpManager.call_with_custom_impl for unified dispatch.
+        """
+        if self._manager is None:
+            return self._forward_impl(
+                query_layer=query_layer,
+                key_layer=key_layer,
+                value_layer=value_layer,
+                attention_mask=attention_mask,
+                qkv_layout=qkv_layout,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                attn_mask_type=attn_mask_type,
+                window_size=window_size,
+                alibi_slopes=alibi_slopes,
+                cp_group=cp_group,
+                cp_global_ranks=cp_global_ranks,
+                cp_stream=cp_stream,
+                cp_comm_type=cp_comm_type,
+                fp8=fp8,
+                fp8_meta=fp8_meta,
+                quantizers=quantizers,
+                inference_params=inference_params,
+                flash_attention_backend=flash_attention_backend,
+                fp8_output=fp8_output,
+            )
+
+        def call_impl_fn(impl_class):
+            if impl_class == self.__class__:
+                return self._forward_impl(
+                    query_layer=query_layer,
+                    key_layer=key_layer,
+                    value_layer=value_layer,
+                    attention_mask=attention_mask,
+                    qkv_layout=qkv_layout,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_kv=cu_seqlens_kv,
+                    max_seqlen_q=max_seqlen_q,
+                    max_seqlen_kv=max_seqlen_kv,
+                    attn_mask_type=attn_mask_type,
+                    window_size=window_size,
+                    alibi_slopes=alibi_slopes,
+                    cp_group=cp_group,
+                    cp_global_ranks=cp_global_ranks,
+                    cp_stream=cp_stream,
+                    cp_comm_type=cp_comm_type,
+                    fp8=fp8,
+                    fp8_meta=fp8_meta,
+                    quantizers=quantizers,
+                    inference_params=inference_params,
+                    flash_attention_backend=flash_attention_backend,
+                    fp8_output=fp8_output,
+                )
+            else:
+                fallback_instance = impl_class(**self._init_params)
+                fallback_instance._manager = self._manager
+                fallback_instance._init_params = self._init_params
+                return fallback_instance._forward_impl(
+                    query_layer=query_layer,
+                    key_layer=key_layer,
+                    value_layer=value_layer,
+                    attention_mask=attention_mask,
+                    qkv_layout=qkv_layout,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_kv=cu_seqlens_kv,
+                    max_seqlen_q=max_seqlen_q,
+                    max_seqlen_kv=max_seqlen_kv,
+                    attn_mask_type=attn_mask_type,
+                    window_size=window_size,
+                    alibi_slopes=alibi_slopes,
+                    cp_group=cp_group,
+                    cp_global_ranks=cp_global_ranks,
+                    cp_stream=cp_stream,
+                    cp_comm_type=cp_comm_type,
+                    fp8=fp8,
+                    fp8_meta=fp8_meta,
+                    quantizers=quantizers,
+                    inference_params=inference_params,
+                    flash_attention_backend=flash_attention_backend,
+                    fp8_output=fp8_output,
+                )
+
+        return self._manager.call_with_custom_impl(
+            op_name="get_flash_attention_class",
+            current_impl_class=self.__class__,
+            call_impl_fn=call_impl_fn,
         )
 
+    @property
+    def backend_name(self) -> str:
+        return self.__class__.__name__
+
+############ Base  ###################
 class TEFLBackendBase(ABC):
     @abstractmethod
     def is_available(self) -> bool:
         raise NotImplementedError
 
-    def get_flash_attention_class(self) -> Type["FlashAttentionBase"]:
-        raise NotImplementedError
-
     def get_attention_backend(self, attention_params=None):
         raise NotImplementedError
 
+##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
         quantizer: Any,
-        output: Optional[torch.Tensor] = None,
+        output: Optional[Any] = None,
         noop: Optional[torch.Tensor] = None,
     ) -> Any:
         raise NotImplementedError
 
     def dequantize(
         self,
-        input: torch.Tensor,
-        otype: torch.dtype,
-    ) -> torch.Tensor:
+        input: Any,
+        otype: DType,
+    ) -> Any:
         raise NotImplementedError
 
     def bgrad_quantize(
         self,
         input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         raise NotImplementedError
 
     def generic_gemm(
         self,
-        A: torch.Tensor,
+        A: Any,
         transA: bool,
-        B: torch.Tensor,
+        B: Any,
         transB: bool,
-        D: torch.Tensor,
+        D: Any,
         quantizer: Any,
-        output_dtype: torch.dtype,
+        output_dtype: Optional[DType],
         bias: Optional[torch.Tensor],
-        bias_type: Any,
+        bias_type: DType,
         gelu: bool,
         gelu_in: Optional[torch.Tensor],
         grad: bool,
@@ -237,91 +404,77 @@ def generic_gemm(
         accumulate: bool,
         use_split_accumulator: bool,
         comm_overlap: Optional[Any] = None,
-        comm_type: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
         extra_output: Optional[torch.Tensor] = None,
         bulk_overlap: bool = False,
         alpha: float = 1.0,
         beta: Optional[float] = None,
-    ) -> Any:
-        raise NotImplementedError
-
-    def te_general_grouped_gemm(
-        self,
-        *args,
-        **kwargs,
-    ) -> Any:
+    ) -> List[Any]:
         raise NotImplementedError
 
+    # GELU and variants #
     def gelu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def geglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def qgelu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def qgeglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
+    # ReLU and variants #
     def relu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def reglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def srelu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def sreglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
+    # SwiGLU and variants #
     def silu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def swiglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def clamped_swiglu(
         self,
         input: torch.Tensor,
@@ -330,7 +483,7 @@ def clamped_swiglu(
         alpha: float = 1.702,
     ) -> Any:
         raise NotImplementedError
-
+    # Backward of GELU and variants #
     def dgelu(
         self,
         grad: torch.Tensor,
@@ -338,7 +491,6 @@ def dgelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def dgeglu(
         self,
         grad: torch.Tensor,
@@ -346,7 +498,6 @@ def dgeglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def dqgelu(
         self,
         grad: torch.Tensor,
@@ -354,7 +505,6 @@ def dqgelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def dqgeglu(
         self,
         grad: torch.Tensor,
@@ -362,7 +512,7 @@ def dqgeglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
+    # Backward of ReLU and variants #
     def drelu(
         self,
         grad: torch.Tensor,
@@ -370,7 +520,6 @@ def drelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def dreglu(
         self,
         grad: torch.Tensor,
@@ -378,7 +527,6 @@ def dreglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def dsrelu(
         self,
         grad: torch.Tensor,
@@ -386,7 +534,6 @@ def dsrelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def dsreglu(
         self,
         grad: torch.Tensor,
@@ -394,7 +541,7 @@ def dsreglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
+    # Backward of SiLU and variants #
     def dsilu(
         self,
         grad: torch.Tensor,
@@ -402,7 +549,6 @@ def dsilu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def dswiglu(
         self,
         grad: torch.Tensor,
@@ -410,7 +556,6 @@ def dswiglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
-
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -420,103 +565,193 @@ def clamped_dswiglu(
         alpha: float = 1.702,
     ) -> Any:
         raise NotImplementedError
-
+    # DBias + DAct fusions #
     def dbias_dgelu(
         self,
         grad: torch.Tensor,
         fwd_input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         raise NotImplementedError
-
     def dbias_dsilu(
         self,
         grad: torch.Tensor,
         fwd_input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         raise NotImplementedError
-
     def dbias_drelu(
         self,
         grad: torch.Tensor,
         fwd_input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         raise NotImplementedError
-
     def dbias_dqgelu(
         self,
         grad: torch.Tensor,
         fwd_input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         raise NotImplementedError
-
     def dbias_dsrelu(
         self,
         grad: torch.Tensor,
         fwd_input: torch.Tensor,
         quantizer: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> List[Any]:
         raise NotImplementedError
-
-    def layernorm_fwd(
+    # Permutation functions                                                                                                                                                                                                                                                                                             
+    def moe_permute_fwd(
         self,
         input: torch.Tensor,
-        weight: torch.Tensor,
-        bias: Optional[torch.Tensor],
-        eps: float,
-        ln_out: Optional[torch.Tensor],
-        quantizer: Any,
-        otype: torch.dtype,
-        sm_margin: int,
-        zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        dtype: DType,
+        indices: torch.Tensor,
+        num_out_tokens: int,
+        workspace: List[torch.Tensor],
+        max_expanded_token_num: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
         raise NotImplementedError
-
-    def layernorm_bwd(
+    def moe_permute_bwd(
         self,
-        dy: torch.Tensor,
-        x: torch.Tensor,
-        mu: torch.Tensor,
-        rsigma: torch.Tensor,
-        gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
-    def rmsnorm_fwd(
+    def moe_unpermute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def moe_unpermute_bwd(
+        self,
+        input_bwd: torch.Tensor,
+        input_fwd: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+    # Softmax functions
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def scaled_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grads_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+    # Other granular functions
+    def layernorm_fwd(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
         eps: float,
-        ln_out: Optional[torch.Tensor],
+        ln_out: Any,
         quantizer: Any,
-        otype: torch.dtype,
+        otype: DType,
         sm_margin: int,
         zero_centered_gamma: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> List[Any]:
+        raise NotImplementedError
+    def layernorm_bwd(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
+        raise NotImplementedError
+    def rmsnorm_fwd(
+        self,
+        input: Any,
+        weight: Any,
+        eps: float,
+        ln_out: Any,
+        quantizer: Any,
+        otype: DType,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         raise NotImplementedError
-
     def rmsnorm_bwd(
         self,
-        dy: torch.Tensor,
+        dz: torch.Tensor,
         x: torch.Tensor,
         rsigma: torch.Tensor,
         gamma: torch.Tensor,
-        sm_margin: int = 0,
-        zero_centered_gamma: bool = False,
-        eps: float = 1e-5,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         raise NotImplementedError
-
     def rmsnorm_bwd_add(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        add: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
         raise NotImplementedError
 
     def multi_tensor_quantize(
@@ -525,7 +760,6 @@ def multi_tensor_quantize(
         quantizer_list: List[Any],
     ) -> List[Any]:
         raise NotImplementedError
-
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -533,177 +767,290 @@ def split_quantize(
         quantizer_list: List[Any],
     ) -> List[Any]:
         raise NotImplementedError
-
-    def moe_permute_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError
-
-    def moe_permute_bwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError
-
-    def moe_unpermute_fwd(self, *args, **kwargs) -> Any:
-        raise NotImplementedError
-
-    def moe_unpermute_bwd(self, *args, **kwargs) -> Any:
+    def te_general_grouped_gemm(
+        self,
+        A: List[Any],
+        transa: bool,
+        B: List[Any],
+        transb: bool,
+        D: Optional[List[torch.Tensor]],
+        D_type: DType,
+        m_splits: List[int],
+        bias: List[torch.Tensor],
+        bias_type: DType,
+        single_output: bool,
+        pre_gelu_out: List[torch.Tensor],
+        grad: bool,
+        workspace: List[torch.Tensor],
+        workspaceSizes: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        math_sm_count: int,
+    ) -> Optional[List[torch.Tensor]]:
         raise NotImplementedError
-
-    def scaled_softmax_forward(
+    def fp8_transpose(
         self,
         input: torch.Tensor,
-        scale: float,
+        dtype: DType,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         raise NotImplementedError
-
-    def scaled_softmax_backward(
+    def swap_first_dims(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
+        tensor: torch.Tensor,
+        out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         raise NotImplementedError
+    def get_fused_attn_backend(
+        self,
+        is_training: bool,
+        q_dtype: DType,
+        kv_dtype: DType,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        p_dropout: float,
+        num_attn_heads: int,
+        num_gqa_groups: int,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        head_dim_qk: int,
+        head_dim_v: int,
+        window_size_left: int,
+        window_size_right: int,
+        return_max_logit: bool,
+    ) -> NVTE_Fused_Attn_Backend:
+        raise NotImplementedError
 
-    def scaled_masked_softmax_forward(
+    def compute_amax(
         self,
         input: torch.Tensor,
-        mask: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax: torch.Tensor,
+    ) -> None:
         raise NotImplementedError
-
-    def scaled_masked_softmax_backward(
+    def fused_amax_and_scale_update_after_reduction(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        amax_reduction_buffer: torch.Tensor,
+        amax_histories: List[torch.Tensor],
+        scales: List[torch.Tensor],
+        amax_compute_algo: str,
+        fp8_dtype: DType,
+        margin: float,
+    ) -> None:
         raise NotImplementedError
-
-    def scaled_upper_triang_masked_softmax_forward(
+    def fp8_block_scaling_compute_partial_amax(
         self,
-        input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
         raise NotImplementedError
-
-    def scaled_upper_triang_masked_softmax_backward(
+    def fp8_block_scaling_partial_cast(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
         raise NotImplementedError
-
-    def scaled_aligned_causal_masked_softmax_forward(
+    def fused_multi_row_padding(
         self,
         input: torch.Tensor,
-        scale: float,
-    ) -> torch.Tensor:
+        output: torch.Tensor,
+        input_row_list: List[int],
+        padded_input_row_list: List[int],
+    ) -> None:
+        raise NotImplementedError
+    def fused_multi_row_unpadding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        unpadded_input_row_list: List[int],
+    ) -> None:
         raise NotImplementedError
 
-    def scaled_aligned_causal_masked_softmax_backward(
+    # attention kernels
+    def fa_prepare_fwd(
         self,
-        output_grad: torch.Tensor,
-        softmax_output: torch.Tensor,
-        scale: float,
+        qkvi: torch.Tensor,
     ) -> torch.Tensor:
         raise NotImplementedError
-
-    def get_fused_attn_backend(
+    def fa_prepare_bwd(
         self,
-        *args,
-        **kwargs,
-    ) -> int:
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def fused_attn_fwd(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        is_training: bool,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        fake_dtype: torch.dtype,
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        page_table_k: Optional[torch.Tensor],
+        page_table_v: Optional[torch.Tensor],
+        s_quantizer: Any,
+        o_quantizer: Any,
+        Bias: Optional[torch.Tensor],
+        SoftmaxOffset: Optional[torch.Tensor],
+        rng_gen: Optional[torch.Generator],
+        rng_elts_per_thread: int,
+        return_max_logit: bool,
+    ) -> List[Any]:
         raise NotImplementedError
-
     def fused_attn_bwd(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
-        raise NotImplementedError
-
-    def fa_prepare_fwd(
-        self,
-        *args,
-        **kwargs,
-    ) -> Any:
-        raise NotImplementedError
-
-    def fa_prepare_bwd(
-        self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        deterministic: bool,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        O: Any,
+        dO: Any,
+        fake_dtype: torch.dtype,
+        dqkv_type: DType,
+        Aux_CTX_Tensors: List[torch.Tensor],
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        s_quantizer: Any,
+        dp_quantizer: Any,
+        dqkv_quantizer: Any,
+    ) -> List[Any]:
         raise NotImplementedError
-
     def copy_to_kv_cache(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        new_k: torch.Tensor,
+        new_v: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        page_table: torch.Tensor,
+        cu_new_lens: torch.Tensor,
+        cu_cached_lens: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        b: int,
+        max_ctx_len: int,
+        max_seq_len: int,
+        max_pages_per_seq: int,
+        is_non_paged: bool,
+    ) -> None:
         raise NotImplementedError
-
     def convert_thd_to_bshd(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        b: int,
+        max_seq_len: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def convert_bshd_to_thd(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        t: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
 
+    # fused apply rope
     def fused_rope_forward(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        input: torch.Tensor,
+        freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def fused_rope_backward(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        output_grads: torch.Tensor,
+        freqs: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def fused_qkv_rope_forward(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        qkv_input: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         raise NotImplementedError
-
     def fused_qkv_rope_backward(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        q_grad_out: torch.Tensor,
+        k_grad_out: torch.Tensor,
+        v_grad_out: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
 
+    # fused router
     def fused_topk_with_score_function_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        num_groups: int,
-        group_topk: int,
-        scaling_factor: float,
-        score_function: Any,
+        num_groups: Optional[int],
+        group_topk: Optional[int],
+        scaling_factor: Optional[float],
+        score_function: str,
         expert_bias: Optional[torch.Tensor],
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         raise NotImplementedError
-
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -713,19 +1060,17 @@ def fused_topk_with_score_function_bwd(
         grad_probs: torch.Tensor,
         topk: int,
         use_pre_softmax: bool,
-        scaling_factor: float,
-        score_function: Any,
-    ) -> Any:
+        scaling_factor: Optional[float],
+        score_function: str,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         raise NotImplementedError
-
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -733,10 +1078,9 @@ def fused_score_for_moe_aux_loss_bwd(
         intermediate_output: torch.Tensor,
         grad_scores: torch.Tensor,
         topk: int,
-        score_function: Any,
-    ) -> Any:
+        score_function: str,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -747,9 +1091,8 @@ def fused_moe_aux_loss_fwd(
         num_cols: int,
         topk: int,
         coeff: float,
-    ) -> Any:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
-
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -757,177 +1100,117 @@ def fused_moe_aux_loss_bwd(
         num_rows: int,
         num_cols: int,
         grad_aux_loss: torch.Tensor,
-    ) -> Any:
+    ) -> torch.Tensor:
         raise NotImplementedError
 
+    # Dropout
     def dropout_fwd(
         self,
         input: torch.Tensor,
         dropout_probability: float,
-        out: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
-
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
         mask: torch.Tensor,
         dropout_probability: float,
-        grad_input: Optional[torch.Tensor] = None,
+        grad_input: Optional[torch.Tensor],
     ) -> torch.Tensor:
         raise NotImplementedError
 
-    def fp8_transpose(
-        self,
-        input: torch.Tensor,
-        dtype: Any,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        raise NotImplementedError
-
-    def swap_first_dims(
-        self,
-        tensor: torch.Tensor,
-        *,
-        out: torch.Tensor,
-    ) -> None:
-        raise NotImplementedError
-
-    def compute_amax(
-        self,
-        input: torch.Tensor,
-        amax: torch.Tensor,
-    ) -> None:
-        raise NotImplementedError
-
-    def fused_amax_and_scale_update_after_reduction(
-        self,
-        *args,
-        **kwargs,
-    ) -> None:
-        raise NotImplementedError
-
-    def fp8_block_scaling_compute_partial_amax(
-        self,
-        tensor: torch.Tensor,
-        amax: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-    ) -> None:
-        raise NotImplementedError
-
-    def fp8_block_scaling_partial_cast(
-        self,
-        inp: torch.Tensor,
-        out: torch.Tensor,
-        scale: torch.Tensor,
-        h: int,
-        w: int,
-        start_offset: int,
-        block_len: int,
-        out_dtype: Any,
-    ) -> None:
-        raise NotImplementedError
-
-    def fused_multi_row_padding(
-        self,
-        *args,
-        **kwargs,
-    ) -> Any:
-        raise NotImplementedError
-
-    def fused_multi_row_unpadding(
-        self,
-        *args,
-        **kwargs,
-    ) -> Any:
-        raise NotImplementedError
-
+    # Misc
     def get_cublasLt_version(self) -> int:
         raise NotImplementedError
-
     def get_cudnn_version(self) -> int:
         raise NotImplementedError
-
     def get_num_cublas_streams(self) -> int:
         raise NotImplementedError
 
+    # Support THD format for Context Parallel
     def thd_read_half_tensor(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        half_idx: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def thd_second_half_lse_correction(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+    ) -> None:
         raise NotImplementedError
-
     def thd_read_second_half_lse(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        lse: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+        second_half_lse_seqlen: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
-
     def thd_out_correction(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        out: torch.Tensor,
+        out_per_step: torch.Tensor,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        only_second_half: bool,
+        lse_packed: bool,
+    ) -> None:
         raise NotImplementedError
-
     def thd_grad_correction(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        grad: torch.Tensor,
+        grad_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        first_half: str,
+        second_half: str,
+    ) -> None:
         raise NotImplementedError
-
     def thd_get_partitioned_indices(
         self,
-        *args,
-        **kwargs,
-    ) -> Any:
+        cu_seqlens: torch.Tensor,
+        total_tokens: int,
+        world_size: int,
+        rank: int,
+    ) -> torch.Tensor:
         raise NotImplementedError
 
+    # nvshmem functions
     def init_nvshmem_backend(
         self,
-        *args,
-        **kwargs,
+        process_group: Any,
     ) -> None:
         raise NotImplementedError
-
     def create_nvshmem_tensor(
         self,
-        *args,
-        **kwargs,
+        shape: List[int],
+        dtype: torch.dtype,
     ) -> torch.Tensor:
         raise NotImplementedError
-
     def nvshmem_send_on_current_stream(
         self,
-        *args,
-        **kwargs,
+        src: torch.Tensor,
+        dst: torch.Tensor,
+        peer: int,
+        signal: torch.Tensor,
     ) -> None:
         raise NotImplementedError
-
     def nvshmem_wait_on_current_stream(
         self,
-        *args,
-        **kwargs,
+        signal: torch.Tensor,
+        wait_kind: str,
     ) -> None:
         raise NotImplementedError
-
     def nvshmem_finalize(self) -> None:
         raise NotImplementedError
 
+    # multi-tensor functions
     def multi_tensor_scale(
         self,
         chunk_size: int,
@@ -936,102 +1219,150 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         raise NotImplementedError
-
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
-
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
         noop_flag: torch.Tensor,
         tensor_lists: List[List[torch.Tensor]],
-        scale: torch.Tensor,
-        per_tensor: bool = False,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        inv_scale: torch.Tensor,
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
-
     def multi_tensor_adam(
         self,
-        chunk_size: int = None,
-        noop_flag: torch.Tensor = None,
-        tensor_lists: List[List[torch.Tensor]] = None,
-        lr: float = None,
-        beta1: float = None,
-        beta2: float = None,
-        eps: float = None,
-        step: int = None,
-        mode: int = None,
-        bias_correction: int = None,
-        weight_decay: float = None,
-    ):
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
         raise NotImplementedError
-
     def multi_tensor_adam_param_remainder(
         self,
-        *args,
-        **kwargs,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
     ) -> None:
         raise NotImplementedError
-
     def multi_tensor_adam_fp8(
         self,
-        *args,
-        **kwargs,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        fp8_dtype: DType,
     ) -> None:
         raise NotImplementedError
-
     def multi_tensor_adam_capturable(
         self,
-        *args,
-        **kwargs,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
     ) -> None:
         raise NotImplementedError
-
     def multi_tensor_adam_capturable_master(
         self,
-        *args,
-        **kwargs,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
     ) -> None:
         raise NotImplementedError
-
     def multi_tensor_sgd(
         self,
-        *args,
-        **kwargs,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        wd: float,
+        momentum: float,
+        dampening: float,
+        lr: float,
+        nesterov: bool,
+        first_run: bool,
+        wd_after_momentum: bool,
+        scale: float,
     ) -> None:
         raise NotImplementedError
-
     def multi_tensor_compute_scale_and_scale_inv(
         self,
-        *args,
-        **kwargs,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        max_fp8: float,
+        force_pow_2_scales: bool,
+        epsilon: float,
     ) -> None:
         raise NotImplementedError
 
+    # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
-        allgather_communicator: Any,
+        allgather_communicator: CommOverlap,
         send_stream: Any,
         recv_stream: Any,
     ) -> Any:
         raise NotImplementedError
 
+############## class func #################################
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        """Create FP8TensorMeta instance."""
         raise NotImplementedError
-
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
         intra_node_group: Optional[Any] = None,
-    ) -> Any:
+    ) -> "CommOverlapHelper":
+        """
+        Internal method to create CommOverlapHelper.
+        Users should use CommOverlapHelper(...) directly.
+        """
         raise NotImplementedError
-
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1047,9 +1378,12 @@ def create_comm_overlap(
         set_sm_margin: bool = True,
         atomic_gemm: bool = False,
         rs_overlap_first_gemm: bool = False,
-    ) -> Any:
+    ) -> "CommOverlap":
+        """
+        Internal method to create CommOverlap.
+        Users should use CommOverlap(...) directly.
+        """
         raise NotImplementedError
-
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1066,187 +1400,16 @@ def create_comm_overlap_p2p(
         atomic_gemm: bool = False,
         use_ce: bool = True,
         aggregate: bool = False,
-    ) -> Any:
-        raise NotImplementedError
-
-class FlashAttentionBase(torch.nn.Module, ABC):
-
-    def __init__(
-        self,
-        softmax_scale: float,
-        attention_dropout: float = 0.0,
-        attention_dropout_ctx: Optional[Callable] = None,
-        attention_type: str = "self",
-        layer_number: Optional[int] = None,
-        deterministic: bool = False,
-    ) -> None:
-        super().__init__()
-
-        self.softmax_scale = softmax_scale
-        self.attention_dropout = attention_dropout
-        self.attention_dropout_ctx = attention_dropout_ctx or nullcontext
-        self.attention_type = attention_type
-        self.layer_number = 1 if layer_number is None else layer_number
-        self.deterministic = deterministic
-
-        # For fallback support
-        self._manager = None
-        self._init_params = None
-
-    @abstractmethod
-    def _forward_impl(
-        self,
-        query_layer: torch.Tensor,
-        key_layer: torch.Tensor,
-        value_layer: torch.Tensor,
-        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
-        qkv_layout: str = "sbh3d",
-        cu_seqlens_q: Optional[torch.Tensor] = None,
-        cu_seqlens_kv: Optional[torch.Tensor] = None,
-        max_seqlen_q: Optional[int] = None,
-        max_seqlen_kv: Optional[int] = None,
-        attn_mask_type: str = "causal",
-        window_size: Optional[Tuple[int, int]] = None,
-        alibi_slopes: Optional[torch.Tensor] = None,
-        cp_group: Optional[Any] = None,
-        cp_global_ranks: Optional[List[int]] = None,
-        cp_stream: Optional[torch.cuda.Stream] = None,
-        cp_comm_type: str = "p2p",
-        fp8: bool = False,
-        fp8_meta: Optional[Dict[str, Any]] = None,
-        quantizers: Optional[Any] = None,
-        inference_params: Optional[Any] = None,
-        flash_attention_backend: Optional[Any] = None,
-        fp8_output: bool = False,
-    ) -> torch.Tensor:
-        """
-        Actual forward implementation - subclasses must implement this.
-
-        This method contains the backend-specific logic for flash attention.
-        """
-        raise NotImplementedError("Subclasses must implement _forward_impl()")
-
-    def forward(
-        self,
-        query_layer: torch.Tensor,
-        key_layer: torch.Tensor,
-        value_layer: torch.Tensor,
-        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
-        qkv_layout: str = "sbh3d",
-        cu_seqlens_q: Optional[torch.Tensor] = None,
-        cu_seqlens_kv: Optional[torch.Tensor] = None,
-        max_seqlen_q: Optional[int] = None,
-        max_seqlen_kv: Optional[int] = None,
-        attn_mask_type: str = "causal",
-        window_size: Optional[Tuple[int, int]] = None,
-        alibi_slopes: Optional[torch.Tensor] = None,
-        cp_group: Optional[Any] = None,
-        cp_global_ranks: Optional[List[int]] = None,
-        cp_stream: Optional[torch.cuda.Stream] = None,
-        cp_comm_type: str = "p2p",
-        fp8: bool = False,
-        fp8_meta: Optional[Dict[str, Any]] = None,
-        quantizers: Optional[Any] = None,
-        inference_params: Optional[Any] = None,
-        flash_attention_backend: Optional[Any] = None,
-        fp8_output: bool = False,
-    ) -> torch.Tensor:
+    ) -> "CommOverlapP2P":
         """
-        Forward pass with automatic fallback support and caching.
-        Delegates to OpManager.call_with_custom_impl for unified dispatch.
+        Internal method to create CommOverlapP2P.
+        Users should use CommOverlapP2P(...) directly.
         """
-        if self._manager is None:
-            return self._forward_impl(
-                query_layer=query_layer,
-                key_layer=key_layer,
-                value_layer=value_layer,
-                attention_mask=attention_mask,
-                qkv_layout=qkv_layout,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_kv=cu_seqlens_kv,
-                max_seqlen_q=max_seqlen_q,
-                max_seqlen_kv=max_seqlen_kv,
-                attn_mask_type=attn_mask_type,
-                window_size=window_size,
-                alibi_slopes=alibi_slopes,
-                cp_group=cp_group,
-                cp_global_ranks=cp_global_ranks,
-                cp_stream=cp_stream,
-                cp_comm_type=cp_comm_type,
-                fp8=fp8,
-                fp8_meta=fp8_meta,
-                quantizers=quantizers,
-                inference_params=inference_params,
-                flash_attention_backend=flash_attention_backend,
-                fp8_output=fp8_output,
-            )
-
-        def call_impl_fn(impl_class):
-            if impl_class == self.__class__:
-                return self._forward_impl(
-                    query_layer=query_layer,
-                    key_layer=key_layer,
-                    value_layer=value_layer,
-                    attention_mask=attention_mask,
-                    qkv_layout=qkv_layout,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_kv=cu_seqlens_kv,
-                    max_seqlen_q=max_seqlen_q,
-                    max_seqlen_kv=max_seqlen_kv,
-                    attn_mask_type=attn_mask_type,
-                    window_size=window_size,
-                    alibi_slopes=alibi_slopes,
-                    cp_group=cp_group,
-                    cp_global_ranks=cp_global_ranks,
-                    cp_stream=cp_stream,
-                    cp_comm_type=cp_comm_type,
-                    fp8=fp8,
-                    fp8_meta=fp8_meta,
-                    quantizers=quantizers,
-                    inference_params=inference_params,
-                    flash_attention_backend=flash_attention_backend,
-                    fp8_output=fp8_output,
-                )
-            else:
-                fallback_instance = impl_class(**self._init_params)
-                fallback_instance._manager = self._manager
-                fallback_instance._init_params = self._init_params
-                return fallback_instance._forward_impl(
-                    query_layer=query_layer,
-                    key_layer=key_layer,
-                    value_layer=value_layer,
-                    attention_mask=attention_mask,
-                    qkv_layout=qkv_layout,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_kv=cu_seqlens_kv,
-                    max_seqlen_q=max_seqlen_q,
-                    max_seqlen_kv=max_seqlen_kv,
-                    attn_mask_type=attn_mask_type,
-                    window_size=window_size,
-                    alibi_slopes=alibi_slopes,
-                    cp_group=cp_group,
-                    cp_global_ranks=cp_global_ranks,
-                    cp_stream=cp_stream,
-                    cp_comm_type=cp_comm_type,
-                    fp8=fp8,
-                    fp8_meta=fp8_meta,
-                    quantizers=quantizers,
-                    inference_params=inference_params,
-                    flash_attention_backend=flash_attention_backend,
-                    fp8_output=fp8_output,
-                )
-
-        return self._manager.call_with_custom_impl(
-            op_name="get_flash_attention_class",
-            current_impl_class=self.__class__,
-            call_impl_fn=call_impl_fn,
-        )
-
-    @property
-    def backend_name(self) -> str:
-        return self.__class__.__name__
-
+        raise NotImplementedError
+    def get_flash_attention_class(self) -> Type["FlashAttentionBase"]:
+        raise NotImplementedError
 
+############ Wapper  #################
 class TEFLModule:
     def __init__(self, manager=None):
         """
@@ -1259,12 +1422,11 @@ def __init__(self, manager=None):
         # Import here to avoid circular dependency
         from .manager import get_default_manager
         self._manager = manager if manager is not None else get_default_manager()
-
+        # emum
         self.DType = DType
         self.Float8BlockScaleTensorFormat = Float8BlockScaleTensorFormat
         self.FP8FwdTensors = FP8FwdTensors
         self.FP8BwdTensors = FP8BwdTensors
-        self.FP8TensorMeta = FP8TensorMeta
         self.NVTE_Activation_Type = NVTE_Activation_Type
         self.NVTE_Bias_Type = NVTE_Bias_Type
         self.NVTE_Mask_Type = NVTE_Mask_Type
@@ -1275,14 +1437,11 @@ def __init__(self, manager=None):
         self.CommOverlapType = CommOverlapType
         self.CommOverlapAlgo = CommOverlapAlgo
         self.CommGemmOverlapRole = CommGemmOverlapRole
-
+        # class
+        self.FP8TensorMeta = FP8TensorMeta
         self.CommOverlapHelper = CommOverlapHelper
         self.CommOverlap = CommOverlap
         self.CommOverlapP2P = CommOverlapP2P
-        self.CommGemmOverlapAlgoConfig = CommGemmOverlapAlgoConfig
-
-        self.FusedAdamCUDAKernel = FusedAdamCUDAKernel
-        self.FusedSGDCUDAKernel = FusedSGDCUDAKernel
 
     def __getattr__(self, name: str) -> Any:
         """
@@ -1316,8 +1475,7 @@ def __dir__(self):
             'FP8TensorMeta', 'NVTE_Activation_Type', 'NVTE_Bias_Type', 'NVTE_Mask_Type',
             'NVTE_Softmax_Type', 'NVTE_Fused_Attn_Backend', 'NVTE_QKV_Format', 'NVTE_QKV_Layout',
             'CommOverlapType', 'CommOverlapAlgo', 'CommGemmOverlapRole',
-            'CommOverlapHelper', 'CommOverlap', 'CommOverlapP2P', 'CommGemmOverlapAlgoConfig',
-            'FusedAdamCUDAKernel', 'FusedSGDCUDAKernel'
+            'CommOverlapHelper', 'CommOverlap', 'CommOverlapP2P', 
         ]
 
         # Add operator names from OpManager's registry
diff --git a/transformer_engine/plugin/tests/test_normalization.py b/transformer_engine/plugin/tests/test_normalization.py
index 6a6114a398..1083c8b02c 100644
--- a/transformer_engine/plugin/tests/test_normalization.py
+++ b/transformer_engine/plugin/tests/test_normalization.py
@@ -13,6 +13,7 @@
     TestCase,
     generate_random_tensor,
 )
+from transformer_engine.plugin.core.ops import DType
 
 
 class NormalizationTests(TestCase):
@@ -57,7 +58,7 @@ def test_layernorm_forward(self, shape=(2, 4, 8)):
             try:
                 output, mean, rsigma = backend.layernorm_fwd(
                     x, weight, bias, self.eps,
-                    None, None, torch.float32, 0, False
+                    None, None, DType.kFloat32, 0, False
                 )
                 self.assert_close(
                     output, ref_output, rtol=1e-5, atol=1e-7,
@@ -143,7 +144,7 @@ def test_rmsnorm_forward(self, shape=(2, 4, 8)):
             try:
                 output, _, rsigma = backend.rmsnorm_fwd(
                     x, weight, self.eps,
-                    None, None, torch.float32, 0, False
+                    None, None, DType.kFloat32, 0, False
                 )
                 self.assert_close(
                     output, ref_output, rtol=1e-5, atol=1e-7,
@@ -185,7 +186,7 @@ def test_rmsnorm_backward(self, shape=(2, 4, 8)):
 
                 grad_x, grad_weight = backend.rmsnorm_bwd(
                     grad_output, x_copy, rsigma.detach(),
-                    weight_copy, 0, False, self.eps
+                    weight_copy, 0, False
                 )
 
                 self.assert_close(
diff --git a/transformer_engine/plugin/tests/test_operations.py b/transformer_engine/plugin/tests/test_operations.py
index 0d64c7e753..0ebe470e91 100644
--- a/transformer_engine/plugin/tests/test_operations.py
+++ b/transformer_engine/plugin/tests/test_operations.py
@@ -13,6 +13,7 @@
     TestCase,
     generate_random_tensor,
 )
+from transformer_engine.plugin.core.ops import DType
 
 
 class OperationsTests(TestCase):
@@ -39,7 +40,7 @@ def test_gemm_basic(self, M=32, N=64, K=48):
 
                 output, _, _, _ = backend.generic_gemm(
                     A, False, B, False, D,
-                    None, torch.float32, None, None,
+                    None, DType.kFloat32, None, DType.kFloat32,
                     False, None, False,
                     workspace, 1024, False, False
                 )
@@ -71,7 +72,7 @@ def test_gemm_transpose_a(self, M=32, N=64, K=48):
 
                 output, _, _, _ = backend.generic_gemm(
                     A, True, B, False, D,
-                    None, torch.float32, None, None,
+                    None, DType.kFloat32, None, DType.kFloat32,
                     False, None, False,
                     workspace, 1024, False, False
                 )
@@ -103,7 +104,7 @@ def test_gemm_3d(self, B=2, M=16, N=32, K=24):
 
                 output, _, _, _ = backend.generic_gemm(
                     B_mat, False, A, False, D,
-                    None, torch.float32, None, None,
+                    None, DType.kFloat32, None, DType.kFloat32,
                     False, None, False,
                     workspace, 1024, False, False
                 )
@@ -181,7 +182,7 @@ def test_dropout(self, shape=(4, 8, 16)):
         for backend_name in self.backends:
             backend = get_backend(backend_name)
             try:
-                output, mask = backend.dropout_fwd(x, dropout_prob)
+                output, mask = backend.dropout_fwd(x, dropout_prob, None)
 
                 num_nonzero = (output != 0).sum().item()
                 total_elements = output.numel()
@@ -206,7 +207,7 @@ def test_dropout(self, shape=(4, 8, 16)):
                     )
 
                 grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
-                grad_input = backend.dropout_bwd(grad_output, mask, dropout_prob)
+                grad_input = backend.dropout_bwd(grad_output, mask, dropout_prob, None)
 
                 grad_nonzero_mask = (grad_input != 0)
                 output_nonzero_mask = (output != 0)
diff --git a/transformer_engine/plugin/tests/test_optimizer.py b/transformer_engine/plugin/tests/test_optimizer.py
index d4f72919ef..905c7ebbe2 100644
--- a/transformer_engine/plugin/tests/test_optimizer.py
+++ b/transformer_engine/plugin/tests/test_optimizer.py
@@ -201,7 +201,7 @@ def test_multi_tensor_adam(self, num_tensors=3, shape=(32, 64)):
                     lr=lr,
                     beta1=beta1,
                     beta2=beta2,
-                    eps=eps,
+                    epsilon=eps,
                     step=step,
                     mode=1,  # AdamW mode
                     bias_correction=1,
@@ -222,6 +222,155 @@ def test_multi_tensor_adam(self, num_tensors=3, shape=(32, 64)):
                 self.failed += 1
                 print(f"    ✗ {backend_name}: {e}")
 
+    def _fp32_to_param_remainder(self, fp32_tensor):
+        """Split FP32 tensor into int16 param (high 16 bits) + int16 remainder (low 16 bits).
+
+        Matches the CUDA split convention:
+        1. Extract high 16 bits as param, low 16 bits as remainder.
+        2. If remainder < 0, increment param (round up).
+        """
+        int32 = fp32_tensor.view(torch.int32)
+        rem = (int32 & 0xFFFF).to(torch.int16)
+        high = ((int32 >> 16) & 0xFFFF).to(torch.int16)
+        high = torch.where(rem < 0, high + 1, high)
+        # param is stored as bf16 (same bits as high int16)
+        param = high.view(torch.bfloat16)
+        return param, rem
+
+    def _param_remainder_to_fp32(self, param, remainder):
+        """Reconstruct FP32 from int16 param (high bits) + int16 remainder (low bits).
+
+        Matches the CUDA reconstruct convention:
+        1. If remainder < 0, decrement param (undo rounding).
+        2. Combine high and low 16 bits into FP32.
+        """
+        local_p = param.view(torch.int16).clone()
+        local_rem = remainder.clone()
+        local_p = torch.where(local_rem < 0, local_p - 1, local_p)
+        high = local_p.to(torch.int32) << 16
+        low = local_rem.to(torch.int32) & 0xFFFF
+        return (high | low).view(torch.float32)
+
+    def _reference_adam_param_remainder(
+        self, grads, params, exp_avgs, exp_avg_sqs, param_remainders,
+        lr, beta1, beta2, epsilon, step, mode, bias_correction, weight_decay
+    ):
+        """Pure-PyTorch reference for multi_tensor_adam_param_remainder."""
+        bc1 = 1 - beta1 ** step if bias_correction else 1.0
+        bc2 = 1 - beta2 ** step if bias_correction else 1.0
+        is_adamw = (mode == 1)
+
+        for g, p, m, v, p_rem in zip(
+            grads, params, exp_avgs, exp_avg_sqs, param_remainders
+        ):
+            g_float = g.float()
+            param_master = self._param_remainder_to_fp32(p, p_rem)
+
+            if not is_adamw and weight_decay != 0:
+                g_float = g_float + weight_decay * param_master
+
+            m.mul_(beta1).add_(g_float, alpha=1 - beta1)
+            v.mul_(beta2).addcmul_(g_float, g_float, value=1 - beta2)
+
+            m_corr = m / bc1
+            v_corr = v / bc2
+            denom = torch.sqrt(v_corr) + epsilon
+            update = m_corr / denom
+
+            if is_adamw and weight_decay != 0:
+                update = update + weight_decay * param_master
+
+            param_master = param_master - lr * update
+
+            new_p, new_rem = self._fp32_to_param_remainder(param_master)
+            p.view(torch.int16).copy_(new_p.view(torch.int16))
+            p_rem.copy_(new_rem)
+
+    def test_multi_tensor_adam_param_remainder(self, num_tensors=3, shape=(32, 64)):
+        print(f"\n  Testing multi_tensor_adam_param_remainder with {num_tensors} tensors of shape {shape}")
+
+        lr = 0.001
+        beta1 = 0.9
+        beta2 = 0.999
+        eps = 1e-8
+        step = 1
+        weight_decay = 0.01
+        mode = 1  # AdamW
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            try:
+                # Create FP32 master weights, then split into param + remainder
+                master_weights = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                                  for _ in range(num_tensors)]
+                grads = [generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+                         for _ in range(num_tensors)]
+
+                params = []
+                remainders = []
+                for mw in master_weights:
+                    p, r = self._fp32_to_param_remainder(mw)
+                    params.append(p.clone())
+                    remainders.append(r.clone())
+
+                exp_avgs = [torch.zeros(shape, dtype=torch.float32, device=self.device)
+                            for _ in range(num_tensors)]
+                exp_avg_sqs = [torch.zeros(shape, dtype=torch.float32, device=self.device)
+                               for _ in range(num_tensors)]
+
+                # Clone for reference
+                ref_params = [p.clone() for p in params]
+                ref_remainders = [r.clone() for r in remainders]
+                ref_exp_avgs = [torch.zeros_like(m) for m in exp_avgs]
+                ref_exp_avg_sqs = [torch.zeros_like(v) for v in exp_avg_sqs]
+                ref_grads = [g.clone() for g in grads]
+
+                # Reference step
+                self._reference_adam_param_remainder(
+                    ref_grads, ref_params, ref_exp_avgs, ref_exp_avg_sqs, ref_remainders,
+                    lr, beta1, beta2, eps, step, mode, 1, weight_decay,
+                )
+
+                # Backend step
+                noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
+                backend.multi_tensor_adam_param_remainder(
+                    chunk_size=2048,
+                    noop_flag=noop_flag,
+                    tensor_lists=[grads, params, exp_avgs, exp_avg_sqs, remainders],
+                    lr=lr,
+                    beta1=beta1,
+                    beta2=beta2,
+                    epsilon=eps,
+                    step=step,
+                    mode=mode,
+                    bias_correction=1,
+                    weight_decay=weight_decay,
+                )
+
+                # Compare reconstructed FP32 master weights
+                for i in range(num_tensors):
+                    out_fp32 = self._param_remainder_to_fp32(params[i], remainders[i])
+                    ref_fp32 = self._param_remainder_to_fp32(ref_params[i], ref_remainders[i])
+                    self.assert_close(
+                        out_fp32, ref_fp32, rtol=1e-5, atol=1e-7,
+                        msg=f"multi_tensor_adam_param_remainder param {i} mismatch for {backend_name}"
+                    )
+                    self.assert_close(
+                        exp_avgs[i], ref_exp_avgs[i], rtol=1e-5, atol=1e-7,
+                        msg=f"multi_tensor_adam_param_remainder exp_avg {i} mismatch for {backend_name}"
+                    )
+                    self.assert_close(
+                        exp_avg_sqs[i], ref_exp_avg_sqs[i], rtol=1e-5, atol=1e-7,
+                        msg=f"multi_tensor_adam_param_remainder exp_avg_sq {i} mismatch for {backend_name}"
+                    )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘ {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ {backend_name}: {e}")
+
     def _reference_multi_tensor_unscale_l2norm(self, tensors, inv_scale, per_tensor=False):
         """Reference implementation for multi_tensor_unscale_l2norm.
 
@@ -258,7 +407,7 @@ def test_multi_tensor_unscale_l2norm(self, num_tensors=4, shape=(64, 128)):
                     chunk_size=2048,
                     noop_flag=noop_flag,
                     tensor_lists=[tensors],
-                    scale=inv_scale,
+                    inv_scale=inv_scale,
                     per_tensor=False
                 )
 
@@ -298,6 +447,9 @@ def run_all_tests(self):
         # multi_tensor_adam tests
         self.test_multi_tensor_adam(num_tensors=3, shape=(32, 64))
 
+        # multi_tensor_adam_param_remainder tests
+        self.test_multi_tensor_adam_param_remainder(num_tensors=3, shape=(32, 64))
+
         return self.report()
 
 
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
index 6c0f969e47..1ca1855f8f 100644
--- a/transformer_engine/pytorch/module/layernorm_linear.py
+++ b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -508,7 +508,6 @@ def forward(
                     FP8GlobalStateManager.IS_FIRST_FP8_MODULE = _first_fp8_module
             ctx.wgrad_store = wgrad_store
             ctx.debug = debug
-            ctx.eps = eps
 
         # ------------------------------------------------------
         # Cached state for backward pass is ready...
@@ -972,7 +971,6 @@ def wgrad_gemm(
                     ln_weight,
                     ctx.bwd_ln_sm_margin,
                     ctx.zero_centered_gamma,
-                    ctx.eps,
                 )
                 dgrad = dgrad.reshape(inputmat.size())
                 dbeta = None
diff --git a/transformer_engine/pytorch/ops/basic/rmsnorm.py b/transformer_engine/pytorch/ops/basic/rmsnorm.py
index 28126fd44f..05597a14fa 100644
--- a/transformer_engine/pytorch/ops/basic/rmsnorm.py
+++ b/transformer_engine/pytorch/ops/basic/rmsnorm.py
@@ -232,7 +232,6 @@ def op_backward(
             w,
             self._sm_margins["backward"],
             self.zero_centered_gamma,
-            self.eps,
         )
 
         # Clear saved tensors if possible
diff --git a/transformer_engine/pytorch/optimizers/__init__.py b/transformer_engine/pytorch/optimizers/__init__.py
index a19c797dea..e54a17ae78 100644
--- a/transformer_engine/pytorch/optimizers/__init__.py
+++ b/transformer_engine/pytorch/optimizers/__init__.py
@@ -13,4 +13,4 @@
 )
 from .fused_adam import FusedAdam
 from .fused_sgd import FusedSGD
-from .multi_tensor_apply import MultiTensorApply, multi_tensor_applier
+from .multi_tensor_apply import MultiTensorApply, multi_tensor_applier
\ No newline at end of file
diff --git a/transformer_engine/pytorch/optimizers/fused_adam.py b/transformer_engine/pytorch/optimizers/fused_adam.py
index b2ddd0adf8..18f7e2031a 100644
--- a/transformer_engine/pytorch/optimizers/fused_adam.py
+++ b/transformer_engine/pytorch/optimizers/fused_adam.py
@@ -711,7 +711,7 @@ def apply_multi_tensor_adam(adam_func, tensor_lists, inv_scale=None, out_dtype=N
                             self.multi_tensor_adam_param_remainder, tensor_lists
                         )
                     else:
-                        apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
+                        apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
                 if len(p_fp8_model) > 0:
                     tensor_lists = [
                         g_of_fp8_model,
@@ -731,14 +731,14 @@ def apply_multi_tensor_adam(adam_func, tensor_lists, inv_scale=None, out_dtype=N
                         m_of_f32_model,
                         v_of_f32_model,
                     ]
-                    apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
+                    apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
             else:  # self.master_weights=False and self.capturable=False
                 if len(p_f16_model) > 0:
                     tensor_lists = [g_of_f16_model, p_f16_model, m_of_f16_model, v_of_f16_model]
-                    apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
+                    apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
                 if len(p_f32_model) > 0:
                     tensor_lists = [g_of_f32_model, p_f32_model, m_of_f32_model, v_of_f32_model]
-                    apply_multi_tensor_adam(self.multi_tensor_adam(), tensor_lists)
+                    apply_multi_tensor_adam(self.multi_tensor_adam, tensor_lists)
 
             # Scaling
             for name in ["exp_avg", "exp_avg_sq", "master_param"]:

From f808816d4973f93b4df5426850afb1ee8d1b2336 Mon Sep 17 00:00:00 2001
From: yuzhuoLi <75082260+Darryl233@users.noreply.github.com>
Date: Mon, 2 Mar 2026 10:36:50 +0800
Subject: [PATCH 35/59] [CICD] Add workflows to validate TE QA test cases (#41)

# Description

Validate TE QA test cases with new CI workflows

## Type of change
- [ ] Documentation change (change only to the documentation, either a
fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [x] Infra/Build change
- [ ] Code refactoring

## Changes

Please list the changes introduced in this PR:

- Added code inspection and PyTorch/C++ unit tests to improve the TE
testing system
- Implemented end-to-end automation of TE wheel package building,
installation, and verification, supporting multiple versions of Flash
Attention and GPUs with different CUDA architectures
- Verified TE's core functions (distributed communication, matrix
multiplication, ONNX export) and compatibility with
Megatron-LM/Lightning-Thunder
- Completed the verification of the nvinspect debugging tool and
re-verification of core numerical tests

# Checklist:

- [ ] I have read and followed the [contributing
guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes

---------

Co-authored-by: zihugithub <fbye@baai.ac.cn>
Co-authored-by: liyuzhuo <lee.yuzhuo233@gamil.com>
---
 .github/workflows/blossom-ci.yml              |    4 +-
 .github/workflows/build.yml                   |   88 +-
 .github/workflows/deploy_nightly_docs.yml     |    2 +-
 .github/workflows/license.yml                 |    2 +-
 .github/workflows/qa-format.yml               |   32 +
 .github/workflows/qa-l0-pytorch-wheel.yml     |   78 ++
 .../qa-l0-te-cpp-unittest-pytorch-lint.yml    |  187 +++
 .../workflows/qa-l1-te-cpp-pytorch-tests.yml  |  166 +++
 .../qa-l3-te-pytorch-fa-versions-test.yml     |  125 ++
 .github/workflows/scripts/gpu_check.sh        |   67 ++
 .github/workflows/te-plugin-tests.yml         |  107 ++
 .github/workflows/trigger-ci.yml              |    2 +-
 .pre-commit-config.yaml                       |   10 +-
 qa/L0_pytorch_debug_unittest/test.sh          |    6 +-
 qa/L0_pytorch_unittest/test.sh                |   48 +-
 qa/L0_pytorch_wheel/test.sh                   |    3 +
 qa/L1_pytorch_distributed_unittest/test.sh    |   14 +-
 qa/L1_pytorch_onnx_unittest/test.sh           |    3 +-
 setup.py                                      |   14 +-
 tests/README.md                               |   35 +
 transformer_engine/common/__init__.py         |    5 +-
 transformer_engine/plugin/__init__.py         |    2 +
 .../benchmarks/benchmark_all_backends.py      |  245 ++--
 transformer_engine/plugin/core/__init__.py    |    1 +
 .../plugin/core/_module_setup.py              |    4 +
 .../plugin/core/backends/__init__.py          |    2 +-
 .../plugin/core/backends/fa_utils.py          |   21 +-
 .../backends/flagos/attention/__init__.py     |    2 +-
 .../dot_product_attention/__init__.py         |    2 +-
 .../dot_product_attention/backends.py         |   14 +-
 .../plugin/core/backends/flagos/flagos.py     |   95 +-
 .../core/backends/flagos/impl/fused_adam.py   |   37 +-
 .../plugin/core/backends/flagos/impl/gemm.py  |    5 +-
 .../core/backends/flagos/impl/multi_tensor.py |    2 +-
 .../core/backends/flagos/register_ops.py      |   94 +-
 .../backends/reference/flash_attention.py     |   44 +-
 .../core/backends/reference/impl/__init__.py  |   35 +-
 .../backends/reference/impl/activation.py     |    8 +-
 .../core/backends/reference/impl/dropout.py   |    4 +-
 .../core/backends/reference/impl/gemm.py      |    4 +-
 .../backends/reference/impl/normalization.py  |    3 +
 .../core/backends/reference/impl/optimizer.py |   18 +-
 .../core/backends/reference/impl/softmax.py   |    4 +-
 .../core/backends/reference/reference.py      |  133 ++-
 .../core/backends/reference/register_ops.py   |  499 ++++++--
 .../plugin/core/backends/vendor/__init__.py   |    1 +
 .../core/backends/vendor/cuda/__init__.py     |    2 +-
 .../plugin/core/backends/vendor/cuda/cuda.py  |  438 +++++--
 .../backends/vendor/cuda/flash_attention.py   |   19 +-
 .../core/backends/vendor/cuda/register_ops.py | 1014 +++++++++++++---
 .../core/backends/vendor/hygon/__init__.py    |    2 +-
 .../backends/vendor/hygon/flash_attention.py  |   20 +-
 .../core/backends/vendor/hygon/hygon.py       |  431 +++++--
 .../backends/vendor/hygon/register_ops.py     |  942 +++++++++++++--
 .../core/backends/vendor/iluvatar/__init__.py |    2 +-
 .../core/backends/vendor/iluvatar/iluvatar.py |  432 +++++--
 .../backends/vendor/iluvatar/register_ops.py  | 1014 +++++++++++++---
 .../vendor/kunlunxin/flash_attention.py       |   32 +-
 .../backends/vendor/kunlunxin/kunlunxin.py    |   15 +-
 .../backends/vendor/kunlunxin/register_ops.py |   14 +-
 .../core/backends/vendor/metax/__init__.py    |    2 +-
 .../backends/vendor/metax/flash_attention.py  |   20 +-
 .../core/backends/vendor/metax/metax.py       |  433 +++++--
 .../backends/vendor/metax/register_ops.py     | 1015 ++++++++++++++---
 transformer_engine/plugin/core/builtin_ops.py |   15 +-
 transformer_engine/plugin/core/discovery.py   |   14 +-
 .../plugin/core/logger_manager.py             |    9 +-
 transformer_engine/plugin/core/manager.py     |   53 +-
 transformer_engine/plugin/core/ops.py         |  185 ++-
 transformer_engine/plugin/core/policy.py      |   33 +-
 transformer_engine/plugin/core/registry.py    |    6 +-
 .../plugin/examples/example_intree.py         |   18 +-
 .../plugin/examples/example_outtree.py        |   19 +-
 transformer_engine/plugin/test_utils.py       |   12 +-
 .../plugin/tests/run_all_tests.py             |   16 +-
 .../plugin/tests/test_activations.py          |  201 +++-
 .../plugin/tests/test_flash_attention.py      |  121 +-
 .../plugin/tests/test_normalization.py        |  107 +-
 .../plugin/tests/test_operations.py           |  137 ++-
 .../plugin/tests/test_optimizer.py            |  210 ++--
 .../plugin/tests/test_policy.py               |   46 +-
 .../plugin/tests/test_softmax.py              |   99 +-
 .../dot_product_attention.py                  |    2 +-
 .../pytorch/ops/basic/rmsnorm.py              |    1 -
 .../pytorch/optimizers/__init__.py            |    2 +-
 85 files changed, 7658 insertions(+), 1772 deletions(-)
 create mode 100644 .github/workflows/qa-format.yml
 create mode 100644 .github/workflows/qa-l0-pytorch-wheel.yml
 create mode 100644 .github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
 create mode 100644 .github/workflows/qa-l1-te-cpp-pytorch-tests.yml
 create mode 100644 .github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
 create mode 100644 .github/workflows/scripts/gpu_check.sh
 create mode 100644 .github/workflows/te-plugin-tests.yml
 create mode 100644 tests/README.md

diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index 1402cc091a..cc2f9eb9a8 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -3,10 +3,12 @@
 # See LICENSE for license information.
 
 # A workflow to trigger ci on hybrid infra (github + self hosted runner)
+
+# DISABLED in FlagOS
 name: Blossom-CI
 on:
   issue_comment:
-    types: [created]
+    types: [__disabled_do_not_remove__]
   workflow_dispatch:
       inputs:
           platform:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 506bc83f08..6c9c967950 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,90 +8,30 @@ on:
   pull_request:
   workflow_dispatch:
 jobs:
-  core:
-    name: 'Core'
-    runs-on: ubuntu-latest
-    container:
-      image: nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04
-      options: --user root
-    steps:
-      - name: 'Dependencies'
-        run: |
-          apt-get update
-          apt-get install -y git python3.9 pip cudnn9-cuda-12
-          pip install cmake==3.21.0 pybind11[global] ninja nvidia-mathdx==25.1.1
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-      - name: 'Build'
-        run: pip install --no-build-isolation . -v
-        env:
-          NVTE_FRAMEWORK: none
-          MAX_JOBS: 1
-      - name: 'Sanity check'
-        run: python3 -c "import transformer_engine"
-        working-directory: /
   pytorch:
     name: 'PyTorch'
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    defaults:
+      run:
+        shell: bash
     container:
-      image: nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04
+      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
       options: --user root
     steps:
-      - name: 'Dependencies'
-        run: |
-          apt-get update
-          apt-get install -y git python3.9 pip cudnn9-cuda-12
-          pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript nvidia-mathdx==25.1.1
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
           submodules: recursive
       - name: 'Build'
-        run: pip install --no-build-isolation . -v --no-deps
+        run:
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+          pip install --no-build-isolation . -v --no-deps
         env:
           NVTE_FRAMEWORK: pytorch
-          MAX_JOBS: 1
-      - name: 'Sanity check'
-        run: python3 tests/pytorch/test_sanity_import.py
-  jax:
-    name: 'JAX'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/nvidia/jax:jax
-      options: --user root
-    steps:
-      - name: 'Dependencies'
-        run: pip install pybind11[global] nvidia-mathdx==25.1.1
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-      - name: 'Build'
-        run: pip install --no-build-isolation . -v
-        env:
-          NVTE_FRAMEWORK: jax
-          MAX_JOBS: 1
-      - name: 'Sanity check'
-        run: python3 tests/jax/test_sanity_import.py
-  all:
-    name: 'All'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/nvidia/jax:jax
-      options: --user root
-    steps:
-      - name: 'Dependencies'
-        run: pip install torch pybind11[global] einops onnxscript nvidia-mathdx==25.1.1
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-      - name: 'Build'
-        run: pip install --no-build-isolation . -v --no-deps
-        env:
-          NVTE_FRAMEWORK: all
-          MAX_JOBS: 1
+          TE_WITH_NCCL: 1
       - name: 'Sanity check'
-        run: python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py
+        run: 
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+          python3 tests/pytorch/test_sanity_import.py
diff --git a/.github/workflows/deploy_nightly_docs.yml b/.github/workflows/deploy_nightly_docs.yml
index 6470eee838..38a3e1dbc2 100644
--- a/.github/workflows/deploy_nightly_docs.yml
+++ b/.github/workflows/deploy_nightly_docs.yml
@@ -6,7 +6,7 @@
 name: Deploy nightly docs
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "__disabled_do_not_remove__" ]
 jobs:
   build:
     uses: ./.github/workflows/docs.yml
diff --git a/.github/workflows/license.yml b/.github/workflows/license.yml
index d70c7def61..3a2be6b1be 100644
--- a/.github/workflows/license.yml
+++ b/.github/workflows/license.yml
@@ -5,7 +5,7 @@
 # A workflow to trigger the TE license check on GitHub
 name: 'License'
 on:
-  pull_request:
+  pull_request: [__disabled_do_not_remove__]
   workflow_dispatch:
 jobs:
   check:
diff --git a/.github/workflows/qa-format.yml b/.github/workflows/qa-format.yml
new file mode 100644
index 0000000000..ff1cddf312
--- /dev/null
+++ b/.github/workflows/qa-format.yml
@@ -0,0 +1,32 @@
+name: format_check
+
+on:
+  pull_request:
+    branches: [ "main" ]
+    types: [opened, synchronize, reopened]
+
+jobs:
+  format:
+    runs-on: ubuntu-22.04
+    env:
+      PRID: ${{ github.event.pull_request.number }}
+      BRANCH: main
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+
+      - name: Merge PR to sub-branch
+        run: |
+          git fetch origin pull/${PRID}/merge
+          git checkout -b test FETCH_HEAD
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Run pre-commit
+        run: bash ./qa/format.sh
\ No newline at end of file
diff --git a/.github/workflows/qa-l0-pytorch-wheel.yml b/.github/workflows/qa-l0-pytorch-wheel.yml
new file mode 100644
index 0000000000..aef4396ae8
--- /dev/null
+++ b/.github/workflows/qa-l0-pytorch-wheel.yml
@@ -0,0 +1,78 @@
+name: QA Pytorch Wheel
+
+on:
+  push:
+    branches:
+      - __disabled_do_not_remove__
+  pull_request:
+    branches:
+      - __disabled_do_not_remove__
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  qa-l0-pytorch-wheel:
+    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull always
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v6.0.1
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+          ssh-strict: true
+          ssh-user: git
+          persist-credentials: true
+          clean: true
+          sparse-checkout-cone-mode: true
+          fetch-tags: false
+          show-progress: true
+          lfs: false
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: L0 Pytorch Wheel
+        id: L0_pytoech_wheel
+        # timeout-minutes: 50
+        env:
+          TE_PATH: .
+          RUN_LOG: /logs/pytorch/wheel
+        run: |
+          echo "TE_PATH: ${TE_PATH}"
+          sed -i "s/^cd transformer_engine\/pytorch\s*$/pushd transformer_engine\/pytorch/" qa/L0_pytorch_wheel/test.sh
+          sed -i '44 s/^cd \s*\$TE_PATH\s*$/popd/' qa/L0_pytorch_wheel/test.sh
+
+          cat qa/L0_pytorch_wheel/test.sh
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+          pip uninstall -y transformer_engine
+
+          bash qa/L0_pytorch_wheel/test.sh | tee ${RUN_LOG}/pytorch_wheel-${{ github.run_id }}.log
+
+      - name: Upload Installation Logs
+        if: always() && steps.L0_pytoech_wheel.outcome == 'failure'
+        uses: actions/upload-artifact@v4
+        with:
+          name: L0-pytorch-logs-${{ github.run_id }}
+          path: /logs/pytorch/wheel
+          retention-days: 7
+          if-no-files-found: warn
diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
new file mode 100644
index 0000000000..0ef8622c8a
--- /dev/null
+++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
@@ -0,0 +1,187 @@
+name: QA L0 - Core Unit & Lint Tests
+
+on:
+  push:
+    branches: main
+    paths:
+      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
+      - 'qa/L0_pytorch_lint/**'
+      - 'transformer_engine/**'
+      - 'tests/pytorch/**'
+  pull_request:
+    branches: main
+    paths:
+      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
+      - 'qa/L0_pytorch_lint/**'
+      - 'transformer_engine/**'
+      - 'tests/pytorch/**'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run-qa-l0-core-tests:
+    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host 
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull always
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v6.0.1
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+          ssh-strict: true
+          ssh-user: git
+          persist-credentials: true
+          clean: true
+          sparse-checkout-cone-mode: true
+          fetch-tags: false
+          show-progress: true
+          lfs: false
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Install Dependencies & Build Transformer Engine
+        # timeout-minutes: 40
+        env:
+          NVTE_FRAMEWORK: pytorch
+          TE_WITH_NCCL: 1
+        run: |
+          # Activate conda environment
+          echo "=== Activating Conda Environment ==="
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Install Python dependencies with version pinning
+          echo "=== Installing Python Dependencies ==="
+          pip install transformers expecttest
+
+          # Build and install transformer_engine with verbose output
+          echo "=== Building & Installing Transformer Engine ==="
+          pip install --no-build-isolation -vvv . --no-deps
+
+          # Verify TE installation with version check
+          echo "=== Verifying Transformer Engine Installation ==="
+          python3 tests/pytorch/test_sanity_import.py
+
+      - name: Verify GPU Availability & Health
+        run: |
+          # Execute GPU check
+          echo "=== Checking GPU Status ==="
+          source .github/workflows/scripts/gpu_check.sh
+          wait_for_gpu
+
+      # too heavy, disabled for now
+      # - name: Run L0 C++ Unit Tests
+      #   # timeout-minutes: 60
+      #   env:
+      #     TE_PATH: .
+      #   run: |
+      #     # Activate conda environment
+      #     source /opt/miniconda3/etc/profile.d/conda.sh
+      #     conda activate flagscale-train
+
+      #     # Get TE library paths with robust detection
+      #     TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
+      #     TE_CPP_LIB_PATH="${TE_LIB_PATH}/transformer_engine"
+
+      #     # Set environment variables for build
+      #     export CMAKE_PREFIX_PATH="${TE_CPP_LIB_PATH}:${CMAKE_PREFIX_PATH}"
+      #     export LD_LIBRARY_PATH="${TE_CPP_LIB_PATH}:${LD_LIBRARY_PATH}"
+      #     NUM_PHYSICAL_CORES=$(nproc)
+      #     NUM_PARALLEL_JOBS=$(nproc)
+
+      #     # Build and run C++ tests
+      #     cd $TE_PATH/tests/cpp
+      #     cmake -GNinja -Bbuild . -DTE_LIB_PATH="${TE_CPP_LIB_PATH}"
+      #     cmake --build build
+      #     export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS))
+
+      #     # Run C++ tests with verbose output
+      #     echo "=== Running C++ Unit Tests ==="
+      #     ctest --test-dir build -j$NUM_PARALLEL_JOBS
+
+      - name: PyTorch C++ Lint
+        # timeout-minutes: 5
+        env:
+          CPP_ONLY: 1
+          TE_PATH: .
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Run C++ lint checks
+          echo "=== Running C++ Lint Checks ==="
+          bash ./qa/L0_pytorch_lint/test.sh || true
+
+          echo ""
+          echo "-----------------------------------------------------"
+          echo "Note: Pylint check ignores errors C0411 (incorrect import position) and W0611 (unused import), which can be achieved by adding the parameter --disable=C0411,W0611"
+          echo "-----------------------------------------------------"
+        continue-on-error: true
+
+      - name: PyTorch Python Lint
+        # timeout-minutes: 5
+        env:
+          PYTHON_ONLY: 1
+          TE_PATH: .
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Run PyTorch lint checks
+          echo "=== Running PyTorch Lint Checks ==="
+          bash ./qa/L0_pytorch_lint/test.sh || true
+
+          echo ""
+          echo "-----------------------------------------------------"
+          echo "Note: Pylint check ignores errors C0411 (incorrect import position) and W0611 (unused import), which can be achieved by adding the parameter --disable=C0411,W0611"
+          echo "-----------------------------------------------------"
+        continue-on-error: true
+
+      - name: Run L0 PyTorch Debug Unit Tests
+        # timeout-minutes: 10
+        env:
+          TE_PATH: .
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Run debug unit tests
+          echo "=== Running L0 PyTorch Debug Unit Tests ==="
+          bash ./qa/L0_pytorch_debug_unittest/test.sh
+
+      - name: Run L0 PyTorch Core Unit Tests
+        # timeout-minutes: 10
+        env:
+          TE_PATH: .
+          TE_FL_PREFER: vendor
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+          
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          # Run core unit tests
+          echo "=== Running L0 PyTorch Core Unit Tests ==="
+          bash ./qa/L0_pytorch_unittest/test.sh
diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
new file mode 100644
index 0000000000..d0d15d7cf8
--- /dev/null
+++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
@@ -0,0 +1,166 @@
+name: QA L1 - Comprehensive Integration Tests
+
+on:
+  push:
+    branches: main
+    paths:
+      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
+      - 'qa/L1_cpp_distributed/**'
+      - 'tests/cpp_distributed/**'
+      - 'qa/L1_pytorch_thunder_integration/**'
+      - 'qa/L1_pytorch_distributed_unittest/**'
+      - 'tests/pytorch/distributed/**'
+      - 'tests/pytorch/attention/**'
+      - 'qa/L1_pytorch_onnx_unittest/**'
+      - 'tests/pytorch/test_onnx_export.py'
+
+  pull_request:
+    branches: main
+    paths:
+      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
+      - 'qa/L1_cpp_distributed/**'
+      - 'tests/cpp_distributed/**'
+      - 'qa/L1_pytorch_thunder_integration/**'
+      - 'qa/L1_pytorch_distributed_unittest/**'
+      - 'tests/pytorch/distributed/**'
+      - 'tests/pytorch/attention/**'
+      - 'qa/L1_pytorch_onnx_unittest/**'
+      - 'tests/pytorch/test_onnx_export.py'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run-qa-l1-comprehensive-tests:
+    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host 
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull always
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v6.0.1
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+          ssh-strict: true
+          ssh-user: git
+          persist-credentials: true
+          clean: true
+          sparse-checkout-cone-mode: true
+          fetch-tags: false
+          show-progress: true
+          lfs: false
+          submodules: recursive
+          set-safe-directory: true
+      
+      - name: Install Dependencies & Build Transformer Engine
+        # timeout-minutes: 40
+        env:
+          NVTE_FRAMEWORK: pytorch
+          TE_WITH_NCCL: 1
+        run: |
+          # Activate conda environment
+          echo "=== Activating Conda Environment ==="
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Install MPI
+          apt update
+          apt install -y libopenmpi-dev openmpi-bin openmpi-common
+          apt install -y libmpich-dev mpich
+
+          # Verify the MPI header file
+          mpicxx -show | awk '{for(i=1;i<=NF;i++) if($i ~ /-I/) print substr($i,3)}'
+
+          # Verify whether the MPI C++ environment is ready
+          # 1.  Verify whether the MPI C++ compiler (mpicxx) exists
+          mpicxx --version
+          # 2.  Verify if the MPI library file exists
+          ls /usr/lib/x86_64-linux-gnu/libmpi_cxx.so
+
+          # Install dependencies
+          pip install optree looseversion opt_einsum lightning_utilities
+
+          # Clone lightning-thunder
+          git clone --recurse-submodules https://github.com/Lightning-AI/lightning-thunder.git
+
+          echo "Install transformer_engine"
+          pip install --no-build-isolation -vvv . --no-deps
+
+          # Verify installation
+          python3 tests/pytorch/test_sanity_import.py
+
+      - name: Verify GPU Availability & Health
+        run: |
+          # Execute GPU check
+          echo "=== Checking GPU Status ==="
+          source .github/workflows/scripts/gpu_check.sh
+          wait_for_gpu
+
+      - name: Run L1 PyTorch Thunder Integration Tests
+        env:
+          XML_LOG_DIR: "/logs/pytorch/thunder"
+          THUNDER_PATH: "lightning-thunder"
+          TE_PATH: .
+          TE_FL_PREFER: vendor
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          # Run thunder integration tests
+          echo "=== Running L1 PyTorch Thunder Integration Tests ==="
+          bash ./qa/L1_pytorch_thunder_integration/test.sh
+        # timeout-minutes: 5
+
+      - name: Run L1 PyTorch Distributed Unit Tests
+        continue-on-error: true
+        env:
+          XML_LOG_DIR: "/logs/pytorch/distributed"
+          TE_PATH: .
+          TE_FL_PREFER: vendor
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          # Run distributed unit tests
+          echo "=== Running L1 PyTorch Distributed Unit Tests ==="
+          bash ./qa/L1_pytorch_distributed_unittest/test.sh
+        # timeout-minutes: 5
+
+      - name: Run L1 PyTorch ONNX Unit Tests
+        env:
+          XML_LOG_DIR: "/logs/pytorch/onnx"
+          TE_PATH: .
+          TE_FL_PREFER: vendor
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          # Run ONNX unit tests
+          echo "=== Running L1 PyTorch ONNX Unit Tests ==="
+          bash ./qa/L1_pytorch_onnx_unittest/test.sh
+        # timeout-minutes: 30
diff --git a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
new file mode 100644
index 0000000000..9a881dd2d9
--- /dev/null
+++ b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
@@ -0,0 +1,125 @@
+# disabled for requireing hopper or higher Compute Capabilities GPUs
+name: QA L3 - Attention Tests 
+
+on:
+  push:
+    branches: __disable__
+    paths:
+      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
+      - 'tests/pytorch/attention/test_attention.py'
+
+  pull_request:
+    branches: __disable__
+    paths:
+      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
+      - 'tests/pytorch/attention/test_attention.py'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run-qa-l3-attention-tests:
+    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host 
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull always
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v6.0.1
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+          ssh-strict: true
+          ssh-user: git
+          persist-credentials: true
+          clean: true
+          sparse-checkout-cone-mode: true
+          fetch-tags: false
+          show-progress: true
+          lfs: false
+          submodules: recursive
+          set-safe-directory: true
+      
+      - name: Install Dependencies & Build Transformer Engine
+        # timeout-minutes: 40
+        env:
+          NVTE_FRAMEWORK: pytorch
+          TE_WITH_NCCL: 1
+        run: |
+          # Activate conda environment
+          echo "=== Activating Conda Environment ==="
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # System dependencies installation with cleanup
+          echo "=== Installing System Dependencies (MPI) ==="
+          apt update
+          apt install -y libopenmpi-dev openmpi-bin openmpi-common
+          apt install -y libmpich-dev mpich
+
+          # Verify MPI installation comprehensively
+          echo "=== Verifying MPI Installation ==="
+          echo "MPI Compiler Path: $(which mpicxx)"
+          mpicxx --version
+          echo "MPI Header Paths:"
+          mpicxx -show | awk '{for(i=1;i<=NF;i++) if($i ~ /-I/) print substr($i,3)}'
+
+          # Verify whether the MPI C++ environment is ready
+          # 1.  Verify whether the MPI C++ compiler (mpicxx) exists
+          mpicxx --version
+          # 2.  Verify if the MPI library file exists
+          ls /usr/lib/x86_64-linux-gnu/libmpi_cxx.so
+
+          # Install dependencies
+          pip install optree looseversion opt_einsum lightning_utilities
+
+          # Clone lightning-thunder
+          git clone --recurse-submodules https://github.com/Lightning-AI/lightning-thunder.git
+
+          echo "Install transformer_engine"
+          pip install --no-build-isolation -vvv . --no-deps
+
+          # Verify installation
+          python3 tests/pytorch/test_sanity_import.py
+
+      - name: Verify GPU Availability & Health
+        run: |
+          # Execute GPU check
+          echo "=== Checking GPU Status ==="
+          source .github/workflows/scripts/gpu_check.sh
+          wait_for_gpu
+
+      - name: Run QA L3 PyTorch FlashAttention Versions Test
+        # timeout-minutes: 30
+        env:
+          XML_LOG_DIR: "/logs/pytorch/attention"
+          TE_PATH: .
+          MAX_JOBS: 32
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Create log directory with proper permissions
+          echo "=== Preparing Test Environment ==="
+          mkdir -p "$XML_LOG_DIR"
+          chmod 777 "$XML_LOG_DIR"
+
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          bash ./qa/L3_pytorch_FA_versions_test/test.sh
diff --git a/.github/workflows/scripts/gpu_check.sh b/.github/workflows/scripts/gpu_check.sh
new file mode 100644
index 0000000000..f7f533b95c
--- /dev/null
+++ b/.github/workflows/scripts/gpu_check.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# Function to wait for GPU availability using nvidia-smi
+# This version uses integer arithmetic instead of bc for better compatibility
+wait_for_gpu_nvidia() {
+    local gpu_count
+    gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+
+    while true; do
+        local memory_usage_array=()
+        local memory_total_array=()
+        # Query GPU memory usage and total memory, suppress stderr to prevent exit on failure
+        mapfile -t memory_usage_array < <(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null)
+        mapfile -t memory_total_array < <(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null)
+
+        local need_wait=false
+        local max_usage_percent=0
+
+        # Iterate through each GPU to calculate memory usage percentage
+        for ((i=0; i<${#memory_usage_array[@]}; i++)); do
+            # Remove whitespace from nvidia-smi output
+            local memory_usage_i=${memory_usage_array[$i]// /}
+            local memory_total_i=${memory_total_array[$i]// /}
+
+            # Validate that memory values are numeric and total memory is greater than 0
+            if [[ $memory_usage_i =~ ^[0-9]+$ ]] && [[ $memory_total_i =~ ^[0-9]+$ ]] && [ "$memory_total_i" -gt 0 ]; then
+                # Calculate percentage using integer arithmetic (multiply by 100 first to avoid precision loss)
+                local usage_percent=$((memory_usage_i * 100 / memory_total_i))
+                # Track the maximum usage percentage across all GPUs
+                if [ $usage_percent -gt $max_usage_percent ]; then
+                    max_usage_percent=$usage_percent
+                fi
+            else
+                # Log warning for invalid values and continue waiting
+                echo "Warning: Invalid memory values - usage: '$memory_usage_i', total: '$memory_total_i'"
+                need_wait=true
+                break
+            fi
+        done
+
+        # If max usage percentage does not exceed 10%, we can proceed
+        # 10% threshold = 10 (since we're using integer percentages)
+        if [ "$need_wait" = false ] && [ $max_usage_percent -le 10 ]; then
+            break
+        fi
+
+        # Wait and show current status
+        echo "Waiting for GPU memory usage to drop below 50% (current max usage: ${max_usage_percent}%)"
+        sleep 1m
+    done
+
+    echo "All GPUs have sufficient free memory, GPU memory usage ratio is below 50% (current max usage: ${max_usage_percent}%)"
+}
+
+# Main function to detect GPU tool and call appropriate wait function
+# Future: Additional chip types can be added here by extending the detection logic
+# and implementing corresponding wait functions (e.g., wait_for_gpu_amd, wait_for_gpu_intel, etc.)
+wait_for_gpu() {
+    if command -v nvidia-smi &> /dev/null; then
+        echo "Detected nvidia-smi, using NVIDIA GPU monitoring"
+        wait_for_gpu_nvidia
+    else
+        echo "Error: Neither nvidia-smi nor mx-smi is available"
+        echo "Note: If you are using a new chip type, please add GPU idle detection method for your chip"
+        exit 1
+    fi
+}
diff --git a/.github/workflows/te-plugin-tests.yml b/.github/workflows/te-plugin-tests.yml
new file mode 100644
index 0000000000..f487673444
--- /dev/null
+++ b/.github/workflows/te-plugin-tests.yml
@@ -0,0 +1,107 @@
+name: Plugin - Unit Tests
+
+on:
+  push:
+    branches: main
+    paths:
+      - 'transformer_engine/plugin/**'
+      - '.github/workflows/te-plugin-tests.yml'
+  pull_request:
+    branches: main
+    paths:
+      - 'transformer_engine/plugin/**'
+      - '.github/workflows/te-plugin-tests.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run-plugin-tests:
+    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host 
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull always
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v6.0.1
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.ref }}
+          ssh-strict: true
+          ssh-user: git
+          persist-credentials: true
+          clean: true
+          sparse-checkout-cone-mode: true
+          fetch-tags: false
+          show-progress: true
+          lfs: false
+          submodules: recursive
+          set-safe-directory: true
+      
+      - name: Install Dependencies & Build Transformer Engine
+        # timeout-minutes: 40
+        env:
+          NVTE_FRAMEWORK: pytorch
+          TE_WITH_NCCL: 1
+        run: |
+          # Activate conda environment
+          echo "Activating conda environment..."
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Print environment information for debugging
+          echo "=== Environment Info ==="
+          conda info
+          python --version
+          pip --version
+          gcc --version
+          nvcc --version
+          cmake --version
+          cat /usr/local/cuda-12.8/include/cudnn_version.h | grep -E "CUDNN_MAJOR|CUDNN_MINOR|CUDNN_PATCHLEVEL"
+
+          # Install dependencies
+          echo "=== Installing Dependencies ==="
+          pip install transformers expecttest pytest
+
+          # Build and install transformer_engine
+          echo "=== Building Transformer Engine ==="
+          pip install --no-build-isolation -vvv . --no-deps
+
+          # Verify installation
+          echo "=== Verifying Installation ==="
+          python3 tests/pytorch/test_sanity_import.py
+          python3 -c "import transformer_engine; print('TE Version:', transformer_engine.__version__)"
+
+      - name: Verify GPU Availability & Health
+        run: |
+          # Execute GPU check
+          echo "=== Checking GPU Status ==="
+          source .github/workflows/scripts/gpu_check.sh
+          wait_for_gpu
+
+      - name: Plugin Test
+        # timeout-minutes: 10
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          # Execute tests (optimized parameters with enhanced output and error capture)
+          torchrun --nproc_per_node=8 -m pytest -q -x -p no:warnings transformer_engine/plugin/tests
+
+          echo "=== All Plugin Tests Completed Successfully ==="
diff --git a/.github/workflows/trigger-ci.yml b/.github/workflows/trigger-ci.yml
index f12a95d79a..37754fbfb7 100644
--- a/.github/workflows/trigger-ci.yml
+++ b/.github/workflows/trigger-ci.yml
@@ -6,7 +6,7 @@
 name: TE-CI Trigger
 on:
   issue_comment:
-    types: [created]
+    types: [__disabled_do_not_remove__]
 jobs:
   Authorization:
     name: Authorization
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5043d6ea22..d9bffbd999 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -39,8 +39,8 @@ repos:
         args: ["-style=file"]
         files: ^transformer_engine.*\.(c|cc|cxx|cpp|cu|cuh|h|hpp)$
 
-  - repo: https://github.com/netromdk/vermin
-    rev: c75aca72f4e85c6e47252139e8695f1c8b5f9ae3
-    hooks:
-      - id: vermin
-        args: ['-t=3.10', '--violations']
+  # - repo: https://github.com/netromdk/vermin
+  #   rev: c75aca72f4e85c6e47252139e8695f1c8b5f9ae3
+  #   hooks:
+  #     - id: vermin
+  #       args: ['-t=3.10', '--violations']
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index 9980ccfb05..18199258c1 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -26,12 +26,12 @@ pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity.xml $TE_PATH/tests/pytorch/debu
 pytest -v -s --junitxml=$XML_LOG_DIR/test_config.xml $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics.xml $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s --junitxml=$XML_LOG_DIR/test_log.xml $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
-NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py -k "not (test_per_tensor_scaling or test_fake_quant or test_statistics_collection or test_statistics_multi_run)" --no-header --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
 pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
 
 
 # standard sanity and numerics tests with initialized debug
-NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1
-NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
+NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py  -k "not (test_sanity_grouped_linear or test_inference_mode)" --no-header || FAIL=1
+NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py -k "not (test_linear_accuracy or test_layernorm_linear_accuracy or test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_transformer_layer_hidden_states_format or test_grouped_gemm)" --no-header || FAIL=1
 
 exit $FAIL
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index b23ce3b6cf..9c5d9ac86f 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -24,30 +24,30 @@ mkdir -p "$XML_LOG_DIR"
 
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py || test_fail "test_jit.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4 || test_fail "test_nvfp4"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8tensor.xml $TE_PATH/tests/pytorch/test_float8tensor.py || test_fail "test_float8tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
-NVTE_FLASH_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/attention/test_kv_cache.py || test_fail "test_kv_cache.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py"
-NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py || test_fail "test_checkpoint.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_router.xml $TE_PATH/tests/pytorch/test_fused_router.py || test_fail "test_fused_router.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py -k "not (test_sanity_layernorm_mlp or test_sanity_gpt or test_sanity_bert or test_sanity_T5 or test_sanity_amp_and_nvfuser or test_sanity_drop_path or test_sanity_fused_qkv_params or test_sanity_gradient_accumulation_fusion or test_inference_mode or test_sanity_normalization_amp or test_sanity_layernorm_linear or test_sanity_linear_with_zero_tokens or test_sanity_grouped_linear)" --no-header || test_fail "test_sanity.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
+PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py -k "not (test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_gpt_cuda_graph or test_transformer_layer_hidden_states_format or test_grouped_gemm or test_noncontiguous or test_gpt_checkpointing or test_gpt_accuracy or test_mha_accuracy or test_linear_accuracy or test_linear_accuracy_delay_wgrad_compute or test_rmsnorm_accuracy or test_layernorm_accuracy or test_layernorm_linear_accuracy)" --no-header || test_fail "test_numerics.py"
+# PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py -k "not (test_torch_dynamo)" || test_fail "test_jit.py"
+# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4 || test_fail "test_nvfp4"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8tensor.xml $TE_PATH/tests/pytorch/test_float8tensor.py || test_fail "test_float8tensor.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
+# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
+# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
+# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py -k "not (test_basic_linear or test_layer_norm or test_rmsnorm or test_forward_linear_bias_activation or test_backward_add_rmsnorm or test_layernorm_mlp or test_activation or test_clamped_swiglu or test_dropout or test_forward_linear_bias_add or test_forward_linear_scale_add or test_linear)" || test_fail "test_fusible_ops.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py -k "not (test_permutation_index_map or test_permutation_single_case)" || test_fail "test_permutation.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
+# NVTE_FLASH_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py"
+# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py"
+# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/attention/test_kv_cache.py || test_fail "test_kv_cache.py"
+python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py"
+# NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py || test_fail "test_checkpoint.py"
+# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_router.xml $TE_PATH/tests/pytorch/test_fused_router.py || test_fail "test_fused_router.py"
 
 if [ "$RET" -ne 0 ]; then
     echo "Error in the following test cases:$FAILED_CASES"
diff --git a/qa/L0_pytorch_wheel/test.sh b/qa/L0_pytorch_wheel/test.sh
index 3056547ef2..b787b7cb95 100644
--- a/qa/L0_pytorch_wheel/test.sh
+++ b/qa/L0_pytorch_wheel/test.sh
@@ -27,6 +27,7 @@ VERSION=`cat $TE_PATH/build_tools/VERSION.txt`
 WHL_BASE="transformer_engine-${VERSION}"
 
 # Core wheel.
+rm -rf dist/*.whl 2>/dev/null || true  # Clean up any existing wheels
 NVTE_RELEASE_BUILD=1 pip3 wheel --no-build-isolation -vvv --wheel-dir ./dist . || error_exit "Failed to setup bdist_wheel"
 wheel unpack dist/${WHL_BASE}-* || error_exit "Failed to unpack dist/${WHL_BASE}-*.whl"
 sed -i "s/Name: transformer-engine/Name: transformer-engine-cu12/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
@@ -44,6 +45,8 @@ pip3 install --no-build-isolation --no-deps -vvv dist/* || error_exit "Failed to
 cd $TE_PATH
 pip3 install --no-build-isolation --no-deps -vvv dist/*.whl || error_exit "Failed to install dist/*.whl --no-deps"
 
+export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
 python3 $TE_PATH/tests/pytorch/test_sanity_import.py || test_fail "test_sanity_import.py"
 
 if [ "$RET" -ne 0 ]; then
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index e698e997a6..04860a9729 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -28,14 +28,14 @@ pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
 
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
+# python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py || test_fail "test_numerics_exact.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py || test_fail "test_torch_fsdp2.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
+# python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py"
+python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k "not (test_distributed)" || test_fail "test_torch_fsdp2.py"
+# python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
+# python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
+# python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py || test_fail "test_cp_utils.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
 
@@ -48,7 +48,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_
 : ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
 : ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
 
-pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
+# pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
 # standard numerics tests with initialized debug
 NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
 
diff --git a/qa/L1_pytorch_onnx_unittest/test.sh b/qa/L1_pytorch_onnx_unittest/test.sh
index 7fce13a3dc..07abcbd7ef 100644
--- a/qa/L1_pytorch_onnx_unittest/test.sh
+++ b/qa/L1_pytorch_onnx_unittest/test.sh
@@ -5,9 +5,10 @@
 
 pip3 install onnxruntime
 pip3 install onnxruntime_extensions
+pip3 install tensorrt --index-url=https://pypi.tuna.tsinghua.edu.cn/simple
 
 : ${TE_PATH:=/opt/transformerengine}
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_onnx_export.xml $TE_PATH/tests/pytorch/test_onnx_export.py
+python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_onnx_export.xml $TE_PATH/tests/pytorch/test_onnx_export.py -k "not (test_export_layernorm_mlp or test_export_layernorm_mlp_return_layernorm_output or test_export_layernorm_mlp_return_bias or test_export_layernorm_mlp_zero_centered_gamma or test_export_core_attention or test_export_multihead_attention_recipe or test_export_multihead_attention_no_input_layernorm or test_export_multihead_attention_cross_attn or test_export_multihead_attention_unfused_qkv_params or test_export_transformer_layer_recipe or test_export_transformer_layer_no_mask or test_export_transformer_layer_output_layernorm or test_export_transformer_layer_unfused_qkv_params or test_export_transformer_layer_zero_centered_gamma or test_export_transformer_layer_activation or test_export_gpt_generation or test_trt_integration)"
diff --git a/setup.py b/setup.py
index 0da2e45abf..7dc63fac0e 100644
--- a/setup.py
+++ b/setup.py
@@ -47,16 +47,14 @@
 def generate_build_config(skip_cuda_build):
     """Generate build-time configuration file."""
     config_template_path = (
-        current_file_path / "transformer_engine" / "plugin" /
-        "core" / "_build_config.py.template"
+        current_file_path / "transformer_engine" / "plugin" / "core" / "_build_config.py.template"
     )
     config_output_path = (
-        current_file_path / "transformer_engine" / "plugin" /
-        "core" / "_build_config.py"
+        current_file_path / "transformer_engine" / "plugin" / "core" / "_build_config.py"
     )
 
     if config_template_path.exists():
-        with open(config_template_path, 'r') as f:
+        with open(config_template_path, "r") as f:
             template = f.read()
 
         config_content = template.format(
@@ -65,7 +63,7 @@ def generate_build_config(skip_cuda_build):
             platform=platform.platform(),
         )
 
-        with open(config_output_path, 'w') as f:
+        with open(config_output_path, "w") as f:
             f.write(config_content)
 
         print(f"Generated build config: {config_output_path}")
@@ -77,7 +75,7 @@ def generate_build_config(skip_cuda_build):
 BUILD_TIME = "{datetime.now().isoformat()}"
 BUILD_PLATFORM = "{platform.platform()}"
 """
-        with open(config_output_path, 'w') as f:
+        with open(config_output_path, "w") as f:
             f.write(config_content)
         print(f"Generated minimal build config: {config_output_path}")
 
@@ -86,7 +84,7 @@ class CustomInstall(InstallCommand):
     """Custom install command to generate build config."""
 
     user_options = InstallCommand.user_options + [
-        ('skip-cuda-build', None, 'Skip CUDA build'),
+        ("skip-cuda-build", None, "Skip CUDA build"),
     ]
 
     def initialize_options(self):
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000000..600fcf223d
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,35 @@
+# TransformerEngine-FL Test Suite
+
+## Quick Start
+
+```bash
+# Run tests
+bash qa/<test_type>/test.sh
+```
+
+## Directory Structure
+
+```
+tests/
+├── cpp/                                  # C++ core functionality tests
+│   ├── operator/                         # C++ operator layer tests (basic/core operator validation)
+│   └── util/                             # C++ utility function tests (common helper unit tests)
+├── cpp_distributed/                      # C++ distributed functionality tests (communication/parallelism)
+├── jax/                                  # JAX framework adaptation tests (JAX backend validation)
+└── pytorch/                              # Full PyTorch framework tests
+    ├── attention/                        # PyTorch attention mechanism tests (FlashAttention/MLA etc.)
+    ├── debug/                            # Debug-specific tests (issue reproduction/debug tooling)
+    │   └── test_configs/                 # Debug test configurations (params/cases for different scenarios)
+    ├── distributed/                      # PyTorch distributed tests (DDP/FSDP/communication)
+    ├── nvfp4/                            # NVFP4 quantization tests (NVIDIA FP4 operator/inference)
+    └── references/                       # Reference implementation tests (consistency vs baseline)
+```
+
+## Adding Tests
+
+### Unit Test
+Add test file: 
+- `tests/cpp/test_<name>.cpp` & `tests/cpp/CMakeLists.txt`
+- `tests/cpp_distributed/test_<name>.py` & `tests/cpp_distributed/CMakeLists.txt`
+- `tests/jax/test_<name>.py`
+- `tests/pytorch/test_<name>.py`
diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index e3cb298963..f67b5d2470 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -31,17 +31,20 @@ def skip_cuda_build() -> bool:
     # Fall back to build-time configuration
     try:
         from transformer_engine.plugin.core._build_config import SKIP_CUDA_BUILD
+
         return SKIP_CUDA_BUILD
     except ImportError:
         # If build config doesn't exist, default to False
         return False
 
+
 # Load plugin system - this handles module registration and backend initialization
 # The _module_setup inside core will:
 # 1. Register modules under both full and short names for relative imports
 # 2. Load all available backends (flagos, reference, vendor/cuda, etc.)
 # 3. Register transformer_engine_torch module from the selected backend
-import transformer_engine.plugin.core  # noqa: F401
+import transformer_engine.plugin.core  # noqa: F401  # pylint: disable=wrong-import-position
+
 
 @functools.lru_cache(maxsize=None)
 def _is_package_installed(package) -> bool:
diff --git a/transformer_engine/plugin/__init__.py b/transformer_engine/plugin/__init__.py
index 478f9256b2..2c6533b713 100644
--- a/transformer_engine/plugin/__init__.py
+++ b/transformer_engine/plugin/__init__.py
@@ -9,11 +9,13 @@
     get_registry,
 )
 
+
 def __getattr__(name):
     if name == "tefl":
         return _get_tefl_module()
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
+
 __all__ = [
     "TEFLBackendBase",
     "TEFLModule",
diff --git a/transformer_engine/plugin/benchmarks/benchmark_all_backends.py b/transformer_engine/plugin/benchmarks/benchmark_all_backends.py
index fe03096551..f111cf0498 100644
--- a/transformer_engine/plugin/benchmarks/benchmark_all_backends.py
+++ b/transformer_engine/plugin/benchmarks/benchmark_all_backends.py
@@ -14,9 +14,18 @@
 
 
 class BenchmarkResult:
-    def __init__(self, backend_name: str, operation_name: str, shape: tuple,
-                 mean_time: float, std_time: float, min_time: float, max_time: float,
-                 gflops: float = None, bandwidth: float = None):
+    def __init__(
+        self,
+        backend_name: str,
+        operation_name: str,
+        shape: tuple,
+        mean_time: float,
+        std_time: float,
+        min_time: float,
+        max_time: float,
+        gflops: float = None,
+        bandwidth: float = None,
+    ):
         self.backend_name = backend_name
         self.operation_name = operation_name
         self.shape = shape
@@ -30,9 +39,11 @@ def __init__(self, backend_name: str, operation_name: str, shape: tuple,
     def __str__(self):
         gflops_str = f"{self.gflops:.2f} GFLOPS" if self.gflops else "N/A"
         bandwidth_str = f"{self.bandwidth:.2f} GB/s" if self.bandwidth else "N/A"
-        return (f"{self.backend_name:12s} {self.mean_time:8.4f}±{self.std_time:6.4f} ms  "
-                f"[{self.min_time:7.4f}, {self.max_time:7.4f}]  "
-                f"{gflops_str:15s} {bandwidth_str:12s}")
+        return (
+            f"{self.backend_name:12s} {self.mean_time:8.4f}±{self.std_time:6.4f} ms  "
+            f"[{self.min_time:7.4f}, {self.max_time:7.4f}]  "
+            f"{gflops_str:15s} {bandwidth_str:12s}"
+        )
 
 
 def time_operation(func, warmup_iters=10, benchmark_iters=100):
@@ -56,25 +67,25 @@ def time_operation(func, warmup_iters=10, benchmark_iters=100):
         times.append((end - start) * 1000)
 
     return {
-        'mean': np.mean(times),
-        'std': np.std(times),
-        'min': np.min(times),
-        'max': np.max(times),
+        "mean": np.mean(times),
+        "std": np.std(times),
+        "min": np.min(times),
+        "max": np.max(times),
     }
 
 
 def compute_gflops(operation: str, shape: tuple, time_ms: float) -> float:
-    if operation in ['gelu', 'relu', 'silu']:
+    if operation in ["gelu", "relu", "silu"]:
         flops = np.prod(shape) * 5
-    elif operation == 'layernorm':
+    elif operation == "layernorm":
         total_elements = np.prod(shape)
         hidden_size = shape[-1]
         flops = total_elements * (3 + 2 * hidden_size)
-    elif operation == 'rmsnorm':
+    elif operation == "rmsnorm":
         total_elements = np.prod(shape)
         hidden_size = shape[-1]
         flops = total_elements * (2 + hidden_size)
-    elif operation == 'gemm':
+    elif operation == "gemm":
         M, N, K = shape
         flops = 2 * M * N * K
     else:
@@ -86,29 +97,31 @@ def compute_gflops(operation: str, shape: tuple, time_ms: float) -> float:
 def compute_bandwidth(operation: str, shape: tuple, time_ms: float) -> float:
     bytes_per_element = 4
 
-    if operation in ['gelu', 'relu', 'silu']:
+    if operation in ["gelu", "relu", "silu"]:
         total_bytes = np.prod(shape) * 2 * bytes_per_element
-    elif operation in ['layernorm', 'rmsnorm']:
+    elif operation in ["layernorm", "rmsnorm"]:
         total_bytes = np.prod(shape) * 5 * bytes_per_element
-    elif operation == 'gemm':
+    elif operation == "gemm":
         M, N, K = shape
-        total_bytes = (M*K + K*N + M*N) * bytes_per_element
+        total_bytes = (M * K + K * N + M * N) * bytes_per_element
     else:
         return None
 
     return (total_bytes / 1e9) / (time_ms / 1000)
 
 
-def benchmark_activations(backends: List[str], shapes: List[tuple], device: str) -> List[BenchmarkResult]:
-    print("\n" + "="*80)
+def benchmark_activations(
+    backends: List[str], shapes: List[tuple], device: str
+) -> List[BenchmarkResult]:
+    print("\n" + "=" * 80)
     print("Activation Function Performance Test")
-    print("="*80)
+    print("=" * 80)
 
     results = []
     operations = [
-        ('gelu', 'GELU'),
-        ('relu', 'ReLU'),
-        ('silu', 'SiLU'),
+        ("gelu", "GELU"),
+        ("relu", "ReLU"),
+        ("silu", "SiLU"),
     ]
 
     for shape in shapes:
@@ -117,7 +130,9 @@ def benchmark_activations(backends: List[str], shapes: List[tuple], device: str)
 
         for op_method, op_name in operations:
             print(f"\n  {op_name}:")
-            print(f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+            print(
+                f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}"
+            )
             print(f"    {'-'*85}")
 
             for backend_name in backends:
@@ -127,13 +142,19 @@ def benchmark_activations(backends: List[str], shapes: List[tuple], device: str)
                     func = lambda: getattr(backend, op_method)(x, None)
                     timing = time_operation(func)
 
-                    gflops = compute_gflops(op_method, shape, timing['mean'])
-                    bandwidth = compute_bandwidth(op_method, shape, timing['mean'])
+                    gflops = compute_gflops(op_method, shape, timing["mean"])
+                    bandwidth = compute_bandwidth(op_method, shape, timing["mean"])
 
                     result = BenchmarkResult(
-                        backend_name, op_method, shape,
-                        timing['mean'], timing['std'], timing['min'], timing['max'],
-                        gflops, bandwidth
+                        backend_name,
+                        op_method,
+                        shape,
+                        timing["mean"],
+                        timing["std"],
+                        timing["min"],
+                        timing["max"],
+                        gflops,
+                        bandwidth,
                     )
                     results.append(result)
                     print(f"    {result}")
@@ -144,10 +165,12 @@ def benchmark_activations(backends: List[str], shapes: List[tuple], device: str)
     return results
 
 
-def benchmark_normalization(backends: List[str], shapes: List[tuple], device: str) -> List[BenchmarkResult]:
-    print("\n" + "="*80)
+def benchmark_normalization(
+    backends: List[str], shapes: List[tuple], device: str
+) -> List[BenchmarkResult]:
+    print("\n" + "=" * 80)
     print("Normalization Performance Test")
-    print("="*80)
+    print("=" * 80)
 
     results = []
     eps = 1e-5
@@ -160,23 +183,33 @@ def benchmark_normalization(backends: List[str], shapes: List[tuple], device: st
         bias = torch.zeros(hidden_size, dtype=torch.float32, device=device)
 
         print(f"\n  LayerNorm forward:")
-        print(f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+        print(
+            f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}"
+        )
         print(f"    {'-'*85}")
 
         for backend_name in backends:
             backend = get_backend(backend_name)
 
             try:
-                func = lambda: backend.layernorm_fwd(x, weight, bias, eps, None, None, torch.float32, 0, False)
+                func = lambda: backend.layernorm_fwd(
+                    x, weight, bias, eps, None, None, torch.float32, 0, False
+                )
                 timing = time_operation(func)
 
-                gflops = compute_gflops('layernorm', shape, timing['mean'])
-                bandwidth = compute_bandwidth('layernorm', shape, timing['mean'])
+                gflops = compute_gflops("layernorm", shape, timing["mean"])
+                bandwidth = compute_bandwidth("layernorm", shape, timing["mean"])
 
                 result = BenchmarkResult(
-                    backend_name, 'layernorm_fwd', shape,
-                    timing['mean'], timing['std'], timing['min'], timing['max'],
-                    gflops, bandwidth
+                    backend_name,
+                    "layernorm_fwd",
+                    shape,
+                    timing["mean"],
+                    timing["std"],
+                    timing["min"],
+                    timing["max"],
+                    gflops,
+                    bandwidth,
                 )
                 results.append(result)
                 print(f"    {result}")
@@ -185,23 +218,33 @@ def benchmark_normalization(backends: List[str], shapes: List[tuple], device: st
                 print(f"    {backend_name:12s} SKIPPED ({type(e).__name__})")
 
         print(f"\n  RMSNorm forward:")
-        print(f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+        print(
+            f"    {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}"
+        )
         print(f"    {'-'*85}")
 
         for backend_name in backends:
             backend = get_backend(backend_name)
 
             try:
-                func = lambda: backend.rmsnorm_fwd(x, weight, eps, None, None, torch.float32, 0, False)
+                func = lambda: backend.rmsnorm_fwd(
+                    x, weight, eps, None, None, torch.float32, 0, False
+                )
                 timing = time_operation(func)
 
-                gflops = compute_gflops('rmsnorm', shape, timing['mean'])
-                bandwidth = compute_bandwidth('rmsnorm', shape, timing['mean'])
+                gflops = compute_gflops("rmsnorm", shape, timing["mean"])
+                bandwidth = compute_bandwidth("rmsnorm", shape, timing["mean"])
 
                 result = BenchmarkResult(
-                    backend_name, 'rmsnorm_fwd', shape,
-                    timing['mean'], timing['std'], timing['min'], timing['max'],
-                    gflops, bandwidth
+                    backend_name,
+                    "rmsnorm_fwd",
+                    shape,
+                    timing["mean"],
+                    timing["std"],
+                    timing["min"],
+                    timing["max"],
+                    gflops,
+                    bandwidth,
                 )
                 results.append(result)
                 print(f"    {result}")
@@ -213,15 +256,17 @@ def benchmark_normalization(backends: List[str], shapes: List[tuple], device: st
 
 
 def benchmark_gemm(backends: List[str], configs: List[tuple], device: str) -> List[BenchmarkResult]:
-    print("\n" + "="*80)
+    print("\n" + "=" * 80)
     print("GEMM Performance Test")
-    print("="*80)
+    print("=" * 80)
 
     results = []
 
     for M, N, K in configs:
         print(f"\nConfig: M={M}, N={N}, K={K}")
-        print(f"  {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}")
+        print(
+            f"  {'Backend':<12s} {'Time (ms)':<20s} {'Range (ms)':<25s} {'GFLOPS':<15s} {'Bandwidth'}"
+        )
         print(f"  {'-'*85}")
 
         A = torch.randn(M, K, dtype=torch.float32, device=device)
@@ -234,20 +279,38 @@ def benchmark_gemm(backends: List[str], configs: List[tuple], device: str) -> Li
 
             try:
                 func = lambda: backend.generic_gemm(
-                    A, False, B, False, D,
-                    None, torch.float32, None, None,
-                    False, None, False,
-                    workspace, 1024, False, False
+                    A,
+                    False,
+                    B,
+                    False,
+                    D,
+                    None,
+                    torch.float32,
+                    None,
+                    None,
+                    False,
+                    None,
+                    False,
+                    workspace,
+                    1024,
+                    False,
+                    False,
                 )
                 timing = time_operation(func)
 
-                gflops = compute_gflops('gemm', (M, N, K), timing['mean'])
-                bandwidth = compute_bandwidth('gemm', (M, N, K), timing['mean'])
+                gflops = compute_gflops("gemm", (M, N, K), timing["mean"])
+                bandwidth = compute_bandwidth("gemm", (M, N, K), timing["mean"])
 
                 result = BenchmarkResult(
-                    backend_name, 'gemm', (M, N, K),
-                    timing['mean'], timing['std'], timing['min'], timing['max'],
-                    gflops, bandwidth
+                    backend_name,
+                    "gemm",
+                    (M, N, K),
+                    timing["mean"],
+                    timing["std"],
+                    timing["min"],
+                    timing["max"],
+                    gflops,
+                    bandwidth,
                 )
                 results.append(result)
                 print(f"  {result}")
@@ -259,11 +322,12 @@ def benchmark_gemm(backends: List[str], configs: List[tuple], device: str) -> Li
 
 
 def print_summary(all_results: List[BenchmarkResult]):
-    print("\n" + "="*80)
+    print("\n" + "=" * 80)
     print("Performance Comparison Summary")
-    print("="*80)
+    print("=" * 80)
 
     from collections import defaultdict
+
     by_operation = defaultdict(lambda: defaultdict(list))
 
     for result in all_results:
@@ -271,7 +335,7 @@ def print_summary(all_results: List[BenchmarkResult]):
 
     print("\nAverage Performance (all shapes):")
     print(f"{'Operation':<20s} {'Backend':<12s} {'Avg Time (ms)':<15s} {'Avg GFLOPS':<15s}")
-    print("-"*65)
+    print("-" * 65)
 
     for op_name, backends_data in sorted(by_operation.items()):
         for backend_name, results in sorted(backends_data.items()):
@@ -282,9 +346,9 @@ def print_summary(all_results: List[BenchmarkResult]):
             gflops_str = f"{avg_gflops:.2f}" if avg_gflops else "N/A"
             print(f"{op_name:<20s} {backend_name:<12s} {avg_time:<15.4f} {gflops_str:<15s}")
 
-    print("\n" + "="*80)
+    print("\n" + "=" * 80)
     print("Fastest Backend (by operation)")
-    print("="*80)
+    print("=" * 80)
 
     for op_name, backends_data in sorted(by_operation.items()):
         backend_avg_times = {}
@@ -299,33 +363,44 @@ def print_summary(all_results: List[BenchmarkResult]):
 def save_results_csv(results: List[BenchmarkResult], filename: str):
     import csv
 
-    with open(filename, 'w', newline='') as f:
+    with open(filename, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow([
-            'Backend', 'Operation', 'Shape', 'Mean(ms)', 'Std(ms)',
-            'Min(ms)', 'Max(ms)', 'GFLOPS', 'GB/s'
-        ])
+        writer.writerow(
+            [
+                "Backend",
+                "Operation",
+                "Shape",
+                "Mean(ms)",
+                "Std(ms)",
+                "Min(ms)",
+                "Max(ms)",
+                "GFLOPS",
+                "GB/s",
+            ]
+        )
 
         for result in results:
-            writer.writerow([
-                result.backend_name,
-                result.operation_name,
-                str(result.shape),
-                f"{result.mean_time:.4f}",
-                f"{result.std_time:.4f}",
-                f"{result.min_time:.4f}",
-                f"{result.max_time:.4f}",
-                f"{result.gflops:.2f}" if result.gflops else "N/A",
-                f"{result.bandwidth:.2f}" if result.bandwidth else "N/A",
-            ])
+            writer.writerow(
+                [
+                    result.backend_name,
+                    result.operation_name,
+                    str(result.shape),
+                    f"{result.mean_time:.4f}",
+                    f"{result.std_time:.4f}",
+                    f"{result.min_time:.4f}",
+                    f"{result.max_time:.4f}",
+                    f"{result.gflops:.2f}" if result.gflops else "N/A",
+                    f"{result.bandwidth:.2f}" if result.bandwidth else "N/A",
+                ]
+            )
 
     print(f"\nResults saved to: {filename}")
 
 
 def main():
-    print("\n" + "="*80)
-    print(" "*25 + "Multi-Backend Performance Comparison Test")
-    print("="*80)
+    print("\n" + "=" * 80)
+    print(" " * 25 + "Multi-Backend Performance Comparison Test")
+    print("=" * 80)
 
     device = "cpu"
     if torch.cuda.is_available():
@@ -381,9 +456,9 @@ def main():
 
     save_results_csv(all_results, f"{output_dir}/all_results.csv")
 
-    print("\n" + "="*80)
+    print("\n" + "=" * 80)
     print("Testing complete!")
-    print("="*80 + "\n")
+    print("=" * 80 + "\n")
 
     return 0
 
diff --git a/transformer_engine/plugin/core/__init__.py b/transformer_engine/plugin/core/__init__.py
index a4d4b2a139..21a94e5f1e 100644
--- a/transformer_engine/plugin/core/__init__.py
+++ b/transformer_engine/plugin/core/__init__.py
@@ -51,6 +51,7 @@
 
 # Setup module aliases BEFORE importing backends to support relative imports
 from ._module_setup import setup_module_aliases, register_as_transformer_engine_torch
+
 setup_module_aliases()
 
 # Import backends - this loads all available backends (flagos, reference, vendor/cuda, etc.)
diff --git a/transformer_engine/plugin/core/_module_setup.py b/transformer_engine/plugin/core/_module_setup.py
index 20ef221806..74acad26cc 100644
--- a/transformer_engine/plugin/core/_module_setup.py
+++ b/transformer_engine/plugin/core/_module_setup.py
@@ -60,6 +60,7 @@ def setup_module_aliases():
     # Register parent plugin package if needed
     if "transformer_engine.plugin" not in sys.modules:
         import types
+
         plugin_dir = Path(__file__).parent.parent
         plugin_pkg = types.ModuleType("transformer_engine.plugin")
         plugin_pkg.__path__ = [str(plugin_dir)]
@@ -79,16 +80,19 @@ def register_as_transformer_engine_torch():
 
     try:
         from .ops import get_tefl_module
+
         tefl_module = get_tefl_module()
         sys.modules["transformer_engine_torch"] = tefl_module
     except Exception as e:
         import traceback
+
         print(f"[TEFL Setup] Warning: Could not register transformer_engine_torch: {e}")
         traceback.print_exc()
 
         # Create a minimal placeholder module to avoid import errors
         # This allows the system to at least import without crashing
         import types
+
         placeholder = types.ModuleType("transformer_engine_torch")
         placeholder.__doc__ = "Placeholder module - TEFL backend not available"
         sys.modules["transformer_engine_torch"] = placeholder
diff --git a/transformer_engine/plugin/core/backends/__init__.py b/transformer_engine/plugin/core/backends/__init__.py
index 88988bab64..7729afc3af 100644
--- a/transformer_engine/plugin/core/backends/__init__.py
+++ b/transformer_engine/plugin/core/backends/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2025, BAAI. All rights reserved.
 #
-# See LICENSE for license information.
\ No newline at end of file
+# See LICENSE for license information.
diff --git a/transformer_engine/plugin/core/backends/fa_utils.py b/transformer_engine/plugin/core/backends/fa_utils.py
index 1107de757a..c24b377631 100644
--- a/transformer_engine/plugin/core/backends/fa_utils.py
+++ b/transformer_engine/plugin/core/backends/fa_utils.py
@@ -80,8 +80,11 @@ def reduce_scatter_along_seq(
     chunk_size = seq_len // world_size
 
     output = torch.empty(
-        *tensor.shape[:seq_dim], chunk_size, *tensor.shape[seq_dim + 1:],
-        dtype=tensor.dtype, device=tensor.device
+        *tensor.shape[:seq_dim],
+        chunk_size,
+        *tensor.shape[seq_dim + 1 :],
+        dtype=tensor.dtype,
+        device=tensor.device
     )
 
     dist.reduce_scatter_tensor(output, tensor, group=cp_group)
@@ -114,12 +117,14 @@ def create_cp_causal_mask(
     q_start = cp_rank * local_seq_len_q
 
     # Create position indices
-    q_indices = torch.arange(local_seq_len_q, device=device, dtype=torch.long).unsqueeze(1) + q_start
+    q_indices = (
+        torch.arange(local_seq_len_q, device=device, dtype=torch.long).unsqueeze(1) + q_start
+    )
     kv_indices = torch.arange(full_seq_len_kv, device=device, dtype=torch.long).unsqueeze(0)
 
     # Create causal mask: mask out positions where kv_idx > q_idx
     causal_mask = torch.zeros(local_seq_len_q, full_seq_len_kv, dtype=dtype, device=device)
-    causal_mask.masked_fill_(kv_indices > q_indices, float('-inf'))
+    causal_mask.masked_fill_(kv_indices > q_indices, float("-inf"))
 
     return causal_mask
 
@@ -151,16 +156,18 @@ def create_cp_window_mask(
     q_start = cp_rank * local_seq_len_q
 
     # Create position indices
-    q_indices = torch.arange(local_seq_len_q, device=device, dtype=torch.long).unsqueeze(1) + q_start
+    q_indices = (
+        torch.arange(local_seq_len_q, device=device, dtype=torch.long).unsqueeze(1) + q_start
+    )
     kv_indices = torch.arange(full_seq_len_kv, device=device, dtype=torch.long).unsqueeze(0)
 
     # Create window mask
     window_mask = torch.zeros(local_seq_len_q, full_seq_len_kv, dtype=dtype, device=device)
 
     if left_window >= 0:
-        window_mask.masked_fill_(kv_indices < q_indices - left_window, float('-inf'))
+        window_mask.masked_fill_(kv_indices < q_indices - left_window, float("-inf"))
     if right_window >= 0:
-        window_mask.masked_fill_(kv_indices > q_indices + right_window, float('-inf'))
+        window_mask.masked_fill_(kv_indices > q_indices + right_window, float("-inf"))
 
     return window_mask
 
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/__init__.py b/transformer_engine/plugin/core/backends/flagos/attention/__init__.py
index 88988bab64..7729afc3af 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/__init__.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2025, BAAI. All rights reserved.
 #
-# See LICENSE for license information.
\ No newline at end of file
+# See LICENSE for license information.
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py
index 88988bab64..7729afc3af 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2025, BAAI. All rights reserved.
 #
-# See LICENSE for license information.
\ No newline at end of file
+# See LICENSE for license information.
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index ea3c9c002a..8f2e9aeb41 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -70,8 +70,7 @@ def forward(
 
         max_logit = None
 
-        is_causal = attn_mask_type == 'causal'
-
+        is_causal = attn_mask_type == "causal"
 
         q_permuted = q.permute(1, 2, 0, 3).contiguous()
         k_permuted = k.permute(1, 2, 0, 3).contiguous()
@@ -160,11 +159,12 @@ def backward(ctx, d_out, *_args):
 
             dqkv_te_dtype = TE_DType[d_out.dtype]
 
-
             q_permuted = q_permuted.contiguous() if not q_permuted.is_contiguous() else q_permuted
             k_permuted = k_permuted.contiguous() if not k_permuted.is_contiguous() else k_permuted
             v_permuted = v_permuted.contiguous() if not v_permuted.is_contiguous() else v_permuted
-            out_permuted = out_permuted.contiguous() if not out_permuted.is_contiguous() else out_permuted
+            out_permuted = (
+                out_permuted.contiguous() if not out_permuted.is_contiguous() else out_permuted
+            )
             m = m.contiguous() if not m.is_contiguous() else m
 
             # d_out is (seq, batch, heads, dim) from autograd, permute to (batch, heads, seq, dim)
@@ -285,9 +285,7 @@ def _forward_impl(
         assert (
             query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
         ), "FLAttention only supports CUDA tensors."
-        assert (
-            qkv_layout in QKVLayouts
-        ), f"FLAttention does not support qkv_layout = {qkv_layout}!"
+        assert qkv_layout in QKVLayouts, f"FLAttention does not support qkv_layout = {qkv_layout}!"
 
         cp_size = 1
         if isinstance(cp_group, dist_group_type):
@@ -381,4 +379,4 @@ def _forward_impl(
                 self.layer_number,
             )
 
-        return output.view(*output.shape[:-2], -1)
\ No newline at end of file
+        return output.view(*output.shape[:-2], -1)
diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
index 03f7c2ed7e..fd8a61f492 100644
--- a/transformer_engine/plugin/core/backends/flagos/flagos.py
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -10,16 +10,20 @@
 from ...ops import *
 
 from .impl import (
-    rmsnorm_fwd_fl, rmsnorm_bwd_fl,
-    multi_tensor_scale_fl, multi_tensor_adam_fl,
+    rmsnorm_fwd_fl,
+    rmsnorm_bwd_fl,
+    multi_tensor_scale_fl,
+    multi_tensor_adam_fl,
     multi_tensor_adam_param_remainder_fl,
     multi_tensor_l2_norm_fl,
-    generic_gemm_fl
+    generic_gemm_fl,
 )
 
+
 def _check_flagos_available() -> bool:
     return True
 
+
 class FlagOSBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -31,6 +35,7 @@ def is_available(self) -> bool:
     def get_attention_backend(self, attention_params=None):
         from packaging.version import Version as PkgVersion
         from ...logger_manager import get_logger
+
         logger = get_logger()
 
         # Read environment variables to determine which backends to enable
@@ -60,7 +65,7 @@ def get_attention_backend(self, attention_params=None):
             available_backends,
         )
 
-##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
+    ##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def generic_gemm(
         self,
         A: Any,
@@ -87,10 +92,28 @@ def generic_gemm(
         beta: Optional[float] = None,
     ) -> List[Any]:
         return generic_gemm_fl(
-            A, transA, B, transB, D, quantizer, output_dtype,
-            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
-            accumulate, use_split_accumulator, comm_overlap, comm_type,
-            extra_output, bulk_overlap, alpha, beta
+            A,
+            transA,
+            B,
+            transB,
+            D,
+            quantizer,
+            output_dtype,
+            bias,
+            bias_type,
+            gelu,
+            gelu_in,
+            grad,
+            workspace,
+            workspace_size,
+            accumulate,
+            use_split_accumulator,
+            comm_overlap,
+            comm_type,
+            extra_output,
+            bulk_overlap,
+            alpha,
+            beta,
         )
 
     # Other granular functions
@@ -106,10 +129,16 @@ def rmsnorm_fwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         return rmsnorm_fwd_fl(
-            input=input, weight=weight, eps=eps, ln_out=ln_out,
-            quantizer=quantizer, odtype=otype,
-            sm_margin=sm_margin, zero_centered_gamma=zero_centered_gamma,
+            input=input,
+            weight=weight,
+            eps=eps,
+            ln_out=ln_out,
+            quantizer=quantizer,
+            odtype=otype,
+            sm_margin=sm_margin,
+            zero_centered_gamma=zero_centered_gamma,
         )
+
     def rmsnorm_bwd(
         self,
         dz: torch.Tensor,
@@ -120,9 +149,14 @@ def rmsnorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         return rmsnorm_bwd_fl(
-            dy=dz, x=x, rsigma=rsigma, gamma=gamma,
-            sm_margin=sm_margin, zero_centered_gamma=zero_centered_gamma
+            dy=dz,
+            x=x,
+            rsigma=rsigma,
+            gamma=gamma,
+            sm_margin=sm_margin,
+            zero_centered_gamma=zero_centered_gamma,
         )
+
     def get_fused_attn_backend(self, *args, **kwargs) -> int:
         return NVTE_Fused_Attn_Backend.NVTE_No_Backend
 
@@ -135,6 +169,7 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         return multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -143,6 +178,7 @@ def multi_tensor_l2norm(
         per_tensor: Optional[bool] = False,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         return multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor)
+
     def multi_tensor_adam(
         self,
         chunk_size: int,
@@ -158,9 +194,19 @@ def multi_tensor_adam(
         weight_decay: float,
     ) -> None:
         return multi_tensor_adam_fl(
-            chunk_size, noop_flag, tensor_lists, lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_param_remainder(
         self,
         chunk_size: int,
@@ -176,20 +222,31 @@ def multi_tensor_adam_param_remainder(
         weight_decay: float,
     ) -> None:
         return multi_tensor_adam_param_remainder_fl(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
 
     # Misc
     def get_cublasLt_version(self) -> int:
         return 110000
+
     def get_cudnn_version(self) -> int:
         return 90000
+
     def get_num_cublas_streams(self) -> int:
         return 0
 
-############## class func #################################
+    ############## class func #################################
     def get_flash_attention_class(self):
         from .attention.dot_product_attention.backends import FlashAttentionFL
+
         return FlashAttentionFL
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
index 89107b04c2..f148795381 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -35,10 +35,10 @@ def multi_tensor_adam_fl(
     bias_correction1 = 1.0
     bias_correction2 = 1.0
     if bias_correction == 1:
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
+        bias_correction1 = 1 - beta1**step
+        bias_correction2 = 1 - beta2**step
 
-    is_adamw = (mode == 1)
+    is_adamw = mode == 1
 
     for i in range(num_tensors):
         g = tensor_lists[0][i]
@@ -53,8 +53,10 @@ def multi_tensor_adam_fl(
         if inv_scale is not None and inv_scale != 1.0:
             g = flag_gems.mul(g, inv_scale)
 
-        m = flag_gems.add_(flag_gems.mul_(m, beta1), g, alpha=1-beta1)
-        v = flag_gems.add_(flag_gems.mul_(v, beta2), flag_gems.mul_(flag_gems.mul_(g, g), 1 - beta2))
+        m = flag_gems.add_(flag_gems.mul_(m, beta1), g, alpha=1 - beta1)
+        v = flag_gems.add_(
+            flag_gems.mul_(v, beta2), flag_gems.mul_(flag_gems.mul_(g, g), 1 - beta2)
+        )
 
         m_corr = m.clone()
         v_corr = v.clone()
@@ -126,10 +128,10 @@ def multi_tensor_adam_param_remainder_fl(
     bias_correction1 = 1.0
     bias_correction2 = 1.0
     if bias_correction == 1:
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
+        bias_correction1 = 1 - beta1**step
+        bias_correction2 = 1 - beta2**step
 
-    is_adamw = (mode == 1)
+    is_adamw = mode == 1
 
     for i in range(num_tensors):
         g = tensor_lists[0][i]
@@ -148,16 +150,21 @@ def multi_tensor_adam_param_remainder_fl(
         # Reconstruct FP32 master weight from BF16 param + int16 remainder
         # The remainder represents the lower 16 bits lost in BF16 conversion
         param_fp32 = p.float()
-        param_master = flag_gems.add(param_fp32, flag_gems.mul(p_remainder.float(), 2.0 ** -16))
+        param_master = flag_gems.add(param_fp32, flag_gems.mul(p_remainder.float(), 2.0**-16))
 
         # Compute gradient with weight decay (if L2 mode)
         grad_with_decay = g.float()
         if not is_adamw:  # L2 regularization mode
-            grad_with_decay = flag_gems.add(grad_with_decay, flag_gems.mul(param_master, weight_decay))
+            grad_with_decay = flag_gems.add(
+                grad_with_decay, flag_gems.mul(param_master, weight_decay)
+            )
 
         # Update moments
         m = flag_gems.add_(flag_gems.mul_(m, beta1), grad_with_decay, alpha=1 - beta1)
-        v = flag_gems.add_(flag_gems.mul_(v, beta2), flag_gems.mul_(flag_gems.mul_(grad_with_decay, grad_with_decay), 1 - beta2))
+        v = flag_gems.add_(
+            flag_gems.mul_(v, beta2),
+            flag_gems.mul_(flag_gems.mul_(grad_with_decay, grad_with_decay), 1 - beta2),
+        )
 
         # Apply bias correction
         m_corr = m.clone()
@@ -182,9 +189,11 @@ def multi_tensor_adam_param_remainder_fl(
 
         # Compute remainder: difference between FP32 master and BF16 representation
         # Scale and quantize to int16 range
-        remainder_fp32 = flag_gems.mul(flag_gems.sub(param_master, param_bf16.float()), 2.0 ** 16)
-        remainder_int16 = flag_gems.clamp(torch.round(remainder_fp32), -32768, 32767).to(dtype=torch.int16)
+        remainder_fp32 = flag_gems.mul(flag_gems.sub(param_master, param_bf16.float()), 2.0**16)
+        remainder_int16 = flag_gems.clamp(torch.round(remainder_fp32), -32768, 32767).to(
+            dtype=torch.int16
+        )
 
         # Write back
         flag_gems.copy_(p, param_bf16)
-        flag_gems.copy_(p_remainder, remainder_int16)
\ No newline at end of file
+        flag_gems.copy_(p_remainder, remainder_int16)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
index 709c107a57..05aea25092 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
@@ -22,6 +22,7 @@
     8: torch.float8_e5m2,
 }
 
+
 def validate_gemm_scale(scale: Optional[float], required: bool) -> float:
     if required:
         return scale if scale is not None else 1.0
@@ -29,6 +30,7 @@ def validate_gemm_scale(scale: Optional[float], required: bool) -> float:
         raise ValueError("scale must be zero")
     return 0.0
 
+
 def _convert_dtype(dtype: Union[int, torch.dtype, None]) -> Optional[torch.dtype]:
     if dtype is None:
         return None
@@ -36,10 +38,11 @@ def _convert_dtype(dtype: Union[int, torch.dtype, None]) -> Optional[torch.dtype
         return dtype
     if isinstance(dtype, int):
         return _DTYPE_TO_TORCH.get(dtype, None)
-    if hasattr(dtype, 'value'):
+    if hasattr(dtype, "value"):
         return _DTYPE_TO_TORCH.get(dtype.value, None)
     return None
 
+
 def generic_gemm_fl(
     A: torch.Tensor,
     transA: bool,
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
index d7361fd7ed..4421487ff1 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
@@ -23,4 +23,4 @@ def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *ar
 def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
 
     for src, dst in zip(tensor_lists[0], tensor_lists[1]):
-        flag_gems.copy_(dst, src * scale)
\ No newline at end of file
+        flag_gems.copy_(dst, src * scale)
diff --git a/transformer_engine/plugin/core/backends/flagos/register_ops.py b/transformer_engine/plugin/core/backends/flagos/register_ops.py
index e92e0864e0..0136b6a983 100644
--- a/transformer_engine/plugin/core/backends/flagos/register_ops.py
+++ b/transformer_engine/plugin/core/backends/flagos/register_ops.py
@@ -17,9 +17,11 @@
 
 def _bind_is_available(fn, is_available_fn):
     """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
         return fn(*args, **kwargs)
+
     wrapper._is_available = is_available_fn
     return wrapper
 
@@ -40,20 +42,88 @@ def register_builtins(registry) -> None:
     is_avail = backend.is_available
 
     impls = [
-        OpImpl(op_name="rmsnorm_fwd", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor=None, priority=150),
-        OpImpl(op_name="rmsnorm_bwd", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor=None, priority=150),
-        OpImpl(op_name="generic_gemm", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor=None, priority=150),
-        OpImpl(op_name="multi_tensor_scale", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor=None, priority=150),
-        OpImpl(op_name="multi_tensor_adam", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor=None, priority=150),
-        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor=None, priority=150),
-        OpImpl(op_name="multi_tensor_l2norm", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor=None, priority=150),
-
+        OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.rmsnorm_fwd, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.rmsnorm_bwd, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="generic_gemm",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.generic_gemm, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="multi_tensor_scale",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.multi_tensor_scale, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.multi_tensor_adam, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_param_remainder",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="multi_tensor_l2norm",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail),
+            vendor=None,
+            priority=150,
+        ),
         # FlashAttention class getter
-        OpImpl(op_name="get_flash_attention_class", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor=None, priority=150),
-
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor=None,
+            priority=150,
+        ),
         # Attention backend selection
-        OpImpl(op_name="get_attention_backend", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor=None, priority=150),
-        OpImpl(op_name="get_fused_attn_backend", impl_id="default.flagos", kind=BackendImplKind.DEFAULT, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor=None, priority=150),
+        OpImpl(
+            op_name="get_attention_backend",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.get_attention_backend, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="get_fused_attn_backend",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.get_fused_attn_backend, is_avail),
+            vendor=None,
+            priority=150,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/reference/flash_attention.py b/transformer_engine/plugin/core/backends/reference/flash_attention.py
index 62c652b856..9a8b9e932b 100644
--- a/transformer_engine/plugin/core/backends/reference/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/reference/flash_attention.py
@@ -115,7 +115,7 @@ def _create_sliding_window_mask(
             mask_bool = mask_bool | (kv_idx > q_idx + right_window)
 
         mask = torch.zeros(seq_len_q, seq_len_kv, dtype=dtype, device=device)
-        mask.masked_fill_(mask_bool, float('-inf'))
+        mask.masked_fill_(mask_bool, float("-inf"))
 
         return mask
 
@@ -136,7 +136,7 @@ def _unpack_tensor(
             else:
                 raise ValueError(
                     f"Unexpected 4D tensor shape {original_shape}. "
-                    f"Expected [total_tokens, 1, num_heads, head_dim]"
+                    "Expected [total_tokens, 1, num_heads, head_dim]"
                 )
 
         if tensor.dim() != 3:
@@ -153,8 +153,7 @@ def _unpack_tensor(
             )
 
         padded_tensor = torch.zeros(
-            batch_size, num_heads, max_seqlen, head_dim,
-            dtype=tensor.dtype, device=device
+            batch_size, num_heads, max_seqlen, head_dim, dtype=tensor.dtype, device=device
         )
 
         padding_mask = torch.ones(batch_size, max_seqlen, dtype=torch.bool, device=device)
@@ -185,8 +184,7 @@ def _pack_tensor(
         device = tensor.device
 
         packed_tensor = torch.zeros(
-            total_tokens, num_heads, head_dim,
-            dtype=tensor.dtype, device=device
+            total_tokens, num_heads, head_dim, dtype=tensor.dtype, device=device
         )
 
         # Vectorized packing - avoid repeated .item() calls
@@ -255,12 +253,16 @@ def _forward_impl(
 
             if use_packed_format:
                 if cu_seqlens_q is not None:
-                    query, padding_mask_q = self._unpack_tensor(query_layer, cu_seqlens_q, max_seqlen_q)
+                    query, padding_mask_q = self._unpack_tensor(
+                        query_layer, cu_seqlens_q, max_seqlen_q
+                    )
                 else:
                     query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
 
                 if cu_seqlens_kv is not None:
-                    key, padding_mask_kv = self._unpack_tensor(key_layer, cu_seqlens_kv, max_seqlen_kv)
+                    key, padding_mask_kv = self._unpack_tensor(
+                        key_layer, cu_seqlens_kv, max_seqlen_kv
+                    )
                     value, _ = self._unpack_tensor(value_layer, cu_seqlens_kv, max_seqlen_kv)
                 else:
                     key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
@@ -285,7 +287,8 @@ def _forward_impl(
             num_groups = num_heads_q // num_heads_kv
             if num_heads_q % num_heads_kv != 0:
                 raise ValueError(
-                    f"num_heads_q ({num_heads_q}) must be divisible by num_heads_kv ({num_heads_kv})"
+                    f"num_heads_q ({num_heads_q}) must be divisible by num_heads_kv"
+                    f" ({num_heads_kv})"
                 )
             key = key.repeat_interleave(num_groups, dim=1)
             value = value.repeat_interleave(num_groups, dim=1)
@@ -295,11 +298,10 @@ def _forward_impl(
 
         if use_packed_format and padding_mask_kv is not None:
             attn_mask = torch.zeros(
-                batch_size, seq_len_q, seq_len_kv,
-                dtype=query.dtype, device=query.device
+                batch_size, seq_len_q, seq_len_kv, dtype=query.dtype, device=query.device
             )
             padding_broadcast = padding_mask_kv.unsqueeze(1)
-            attn_mask.masked_fill_(padding_broadcast, float('-inf'))
+            attn_mask.masked_fill_(padding_broadcast, float("-inf"))
 
         if attn_mask_type == "causal":
             if use_cp:
@@ -318,12 +320,14 @@ def _forward_impl(
                 is_causal = True
             else:
                 causal_mask = torch.zeros(
-                    seq_len_q, seq_len_kv,
-                    dtype=query.dtype, device=query.device
+                    seq_len_q, seq_len_kv, dtype=query.dtype, device=query.device
                 )
                 causal_mask.masked_fill_(
-                    torch.triu(torch.ones(seq_len_q, seq_len_kv, device=query.device, dtype=torch.bool), diagonal=1),
-                    float('-inf')
+                    torch.triu(
+                        torch.ones(seq_len_q, seq_len_kv, device=query.device, dtype=torch.bool),
+                        diagonal=1,
+                    ),
+                    float("-inf"),
                 )
 
                 if attn_mask is not None:
@@ -350,7 +354,11 @@ def _forward_impl(
                 )
 
             if attn_mask is not None:
-                attn_mask = attn_mask + window_mask.unsqueeze(0) if window_mask.dim() == 2 else attn_mask + window_mask
+                attn_mask = (
+                    attn_mask + window_mask.unsqueeze(0)
+                    if window_mask.dim() == 2
+                    else attn_mask + window_mask
+                )
             else:
                 attn_mask = window_mask
 
@@ -362,7 +370,7 @@ def _forward_impl(
 
             if explicit_mask.dtype == torch.bool:
                 float_mask = torch.zeros_like(explicit_mask, dtype=query.dtype)
-                float_mask.masked_fill_(~explicit_mask, float('-inf'))
+                float_mask.masked_fill_(~explicit_mask, float("-inf"))
                 explicit_mask = float_mask
 
             if explicit_mask.dim() == 2:
diff --git a/transformer_engine/plugin/core/backends/reference/impl/__init__.py b/transformer_engine/plugin/core/backends/reference/impl/__init__.py
index 43d73e95c5..f467767d61 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/__init__.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/__init__.py
@@ -8,14 +8,33 @@
 from .normalization import layernorm_fwd_torch, layernorm_bwd_torch
 
 from .activation import (
-    gelu_torch, geglu_torch, qgelu_torch, qgeglu_torch,
-    relu_torch, reglu_torch, srelu_torch, sreglu_torch,
-    silu_torch, swiglu_torch, clamped_swiglu_torch,
-    dgelu_torch, dgeglu_torch, dqgelu_torch, dqgeglu_torch,
-    drelu_torch, dreglu_torch, dsrelu_torch, dsreglu_torch,
-    dsilu_torch, dswiglu_torch, clamped_dswiglu_torch,
-    dbias_dgelu_torch, dbias_dsilu_torch, dbias_drelu_torch,
-    dbias_dqgelu_torch, dbias_dsrelu_torch,
+    gelu_torch,
+    geglu_torch,
+    qgelu_torch,
+    qgeglu_torch,
+    relu_torch,
+    reglu_torch,
+    srelu_torch,
+    sreglu_torch,
+    silu_torch,
+    swiglu_torch,
+    clamped_swiglu_torch,
+    dgelu_torch,
+    dgeglu_torch,
+    dqgelu_torch,
+    dqgeglu_torch,
+    drelu_torch,
+    dreglu_torch,
+    dsrelu_torch,
+    dsreglu_torch,
+    dsilu_torch,
+    dswiglu_torch,
+    clamped_dswiglu_torch,
+    dbias_dgelu_torch,
+    dbias_dsilu_torch,
+    dbias_drelu_torch,
+    dbias_dqgelu_torch,
+    dbias_dsrelu_torch,
 )
 
 from .softmax import (
diff --git a/transformer_engine/plugin/core/backends/reference/impl/activation.py b/transformer_engine/plugin/core/backends/reference/impl/activation.py
index 8c9eb58a31..919c3718cb 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/activation.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/activation.py
@@ -38,12 +38,12 @@
 
 
 def gelu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
-    return F.gelu(input, approximate='tanh')
+    return F.gelu(input, approximate="tanh")
 
 
 def geglu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
     a, b = input.chunk(2, dim=-1)
-    return F.gelu(a, approximate='tanh') * b
+    return F.gelu(a, approximate="tanh") * b
 
 
 def qgelu_torch(input: torch.Tensor, quantizer: Any) -> torch.Tensor:
@@ -106,7 +106,7 @@ def clamped_swiglu_torch(
 def dgelu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> torch.Tensor:
     x = fwd_input.detach().requires_grad_(True)
     with torch.enable_grad():
-        y = F.gelu(x, approximate='tanh')
+        y = F.gelu(x, approximate="tanh")
         y.backward(grad)
     return x.grad
 
@@ -117,7 +117,7 @@ def dgeglu_torch(grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) ->
     b = b.detach().requires_grad_(True)
 
     with torch.enable_grad():
-        y = F.gelu(a, approximate='tanh') * b
+        y = F.gelu(a, approximate="tanh") * b
         y.backward(grad)
 
     return torch.cat([a.grad, b.grad], dim=-1)
diff --git a/transformer_engine/plugin/core/backends/reference/impl/dropout.py b/transformer_engine/plugin/core/backends/reference/impl/dropout.py
index 1acea164d8..f671ff6c5d 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/dropout.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/dropout.py
@@ -22,9 +22,7 @@ def dropout_fwd_torch(
         mask = torch.ones_like(input, dtype=torch.uint8)
         return output, mask
 
-    mask = torch.bernoulli(
-        torch.full_like(input, 1.0 - dropout_probability)
-    ).to(torch.uint8)
+    mask = torch.bernoulli(torch.full_like(input, 1.0 - dropout_probability)).to(torch.uint8)
 
     scale = 1.0 / (1.0 - dropout_probability)
     output = input * mask.to(input.dtype) * scale
diff --git a/transformer_engine/plugin/core/backends/reference/impl/gemm.py b/transformer_engine/plugin/core/backends/reference/impl/gemm.py
index ab4540162b..65a3f1cc52 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/gemm.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/gemm.py
@@ -27,7 +27,7 @@ def _convert_dtype(dtype: Union[int, torch.dtype, None]) -> Optional[torch.dtype
         return dtype
     if isinstance(dtype, int):
         return _DTYPE_TO_TORCH.get(dtype, None)
-    if hasattr(dtype, 'value'):
+    if hasattr(dtype, "value"):
         return _DTYPE_TO_TORCH.get(dtype.value, None)
     return None
 
@@ -102,7 +102,7 @@ def general_gemm_torch(
             gelu_input_ret = gelu_in
         else:
             gelu_input_ret = out.clone()
-        out = F.gelu(out, approximate='tanh')
+        out = F.gelu(out, approximate="tanh")
 
     torch_out_dtype = _convert_dtype(output_dtype)
     if torch_out_dtype is not None and out.dtype != torch_out_dtype:
diff --git a/transformer_engine/plugin/core/backends/reference/impl/normalization.py b/transformer_engine/plugin/core/backends/reference/impl/normalization.py
index 48f89b44d8..c9ca2e1ae3 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/normalization.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/normalization.py
@@ -25,6 +25,7 @@
     DType.kFloat8E5M2: torch.float8_e5m2,
 }
 
+
 def _to_torch_dtype(dtype):
     """Convert DType enum to torch.dtype."""
     if dtype is None:
@@ -37,6 +38,7 @@ def _to_torch_dtype(dtype):
             return _DTYPE_TO_TORCH_DTYPE[dtype_enum]
     raise ValueError(f"Unsupported dtype: {dtype}")
 
+
 def layernorm_fwd_torch(
     input: torch.Tensor,
     weight: torch.Tensor,
@@ -71,6 +73,7 @@ def layernorm_fwd_torch(
 
     return output, mean, rsigma
 
+
 def layernorm_bwd_torch(
     dy: torch.Tensor,
     x: torch.Tensor,
diff --git a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
index f3140a5695..ceac199837 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
@@ -88,8 +88,8 @@ def multi_tensor_adam_torch(
         raise ValueError("All tensor lists must have the same length")
 
     if bias_correction:
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
+        bias_correction1 = 1 - beta1**step
+        bias_correction2 = 1 - beta2**step
     else:
         bias_correction1 = 1.0
         bias_correction2 = 1.0
@@ -154,12 +154,14 @@ def multi_tensor_adam_param_remainder_torch(
 
     grads, params, exp_avgs, exp_avg_sqs, param_remainders = tensor_lists
 
-    if not (len(params) == len(grads) == len(exp_avgs) == len(exp_avg_sqs) == len(param_remainders)):
+    if not (
+        len(params) == len(grads) == len(exp_avgs) == len(exp_avg_sqs) == len(param_remainders)
+    ):
         raise ValueError("All tensor lists must have the same length")
 
     if bias_correction:
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
+        bias_correction1 = 1 - beta1**step
+        bias_correction2 = 1 - beta2**step
     else:
         bias_correction1 = 1.0
         bias_correction2 = 1.0
@@ -181,7 +183,7 @@ def multi_tensor_adam_param_remainder_torch(
         # We need to scale it back to the proper magnitude
         # BF16 has 16 bits total (1 sign, 8 exponent, 7 mantissa)
         # The remainder compensates for the lost precision
-        param_master = param_fp32 + param_remainder.float() * (2.0 ** -16)
+        param_master = param_fp32 + param_remainder.float() * (2.0**-16)
 
         # Standard Adam update on FP32 master weight
         if mode == 0:  # L2 regularization
@@ -213,7 +215,7 @@ def multi_tensor_adam_param_remainder_torch(
 
         # Compute remainder: difference between FP32 master and BF16 representation
         # Scale and quantize to int16 range
-        remainder_fp32 = (param_master - param_bf16.float()) * (2.0 ** 16)
+        remainder_fp32 = (param_master - param_bf16.float()) * (2.0**16)
         remainder_int16 = remainder_fp32.round().clamp(-32768, 32767).to(dtype=torch.int16)
 
         # Write back
@@ -310,4 +312,4 @@ def multi_tensor_compute_scale_and_scale_inv_torch(
 
         # Update scale and scale_inv
         scale.copy_(computed_scale)
-        scale_inv.copy_(1.0 / computed_scale)
\ No newline at end of file
+        scale_inv.copy_(1.0 / computed_scale)
diff --git a/transformer_engine/plugin/core/backends/reference/impl/softmax.py b/transformer_engine/plugin/core/backends/reference/impl/softmax.py
index 0b1c6ef4f0..1783ada92b 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/softmax.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/softmax.py
@@ -84,8 +84,8 @@ def scaled_upper_triang_masked_softmax_forward_torch(
     seq_len = input.size(-1)
 
     causal_mask = torch.triu(
-        torch.full((seq_len, seq_len), float('-inf'), device=input.device, dtype=input.dtype),
-        diagonal=1
+        torch.full((seq_len, seq_len), float("-inf"), device=input.device, dtype=input.dtype),
+        diagonal=1,
     )
 
     scaled_input = input * scale + causal_mask
diff --git a/transformer_engine/plugin/core/backends/reference/reference.py b/transformer_engine/plugin/core/backends/reference/reference.py
index 80c7b327f0..984d62022f 100644
--- a/transformer_engine/plugin/core/backends/reference/reference.py
+++ b/transformer_engine/plugin/core/backends/reference/reference.py
@@ -9,25 +9,51 @@
 
 from .impl import (
     general_gemm_torch,
-    rmsnorm_fwd_torch, rmsnorm_bwd_torch,
-    layernorm_fwd_torch, layernorm_bwd_torch,
-    gelu_torch, geglu_torch, qgelu_torch, qgeglu_torch,
-    relu_torch, reglu_torch, srelu_torch, sreglu_torch,
-    silu_torch, swiglu_torch, clamped_swiglu_torch,
-    dgelu_torch, dgeglu_torch, dqgelu_torch, dqgeglu_torch,
-    drelu_torch, dreglu_torch, dsrelu_torch, dsreglu_torch,
-    dsilu_torch, dswiglu_torch, clamped_dswiglu_torch,
-    dbias_dgelu_torch, dbias_dsilu_torch, dbias_drelu_torch,
-    dbias_dqgelu_torch, dbias_dsrelu_torch,
-    scaled_softmax_forward_torch, scaled_softmax_backward_torch,
-    scaled_masked_softmax_forward_torch, scaled_masked_softmax_backward_torch,
+    rmsnorm_fwd_torch,
+    rmsnorm_bwd_torch,
+    layernorm_fwd_torch,
+    layernorm_bwd_torch,
+    gelu_torch,
+    geglu_torch,
+    qgelu_torch,
+    qgeglu_torch,
+    relu_torch,
+    reglu_torch,
+    srelu_torch,
+    sreglu_torch,
+    silu_torch,
+    swiglu_torch,
+    clamped_swiglu_torch,
+    dgelu_torch,
+    dgeglu_torch,
+    dqgelu_torch,
+    dqgeglu_torch,
+    drelu_torch,
+    dreglu_torch,
+    dsrelu_torch,
+    dsreglu_torch,
+    dsilu_torch,
+    dswiglu_torch,
+    clamped_dswiglu_torch,
+    dbias_dgelu_torch,
+    dbias_dsilu_torch,
+    dbias_drelu_torch,
+    dbias_dqgelu_torch,
+    dbias_dsrelu_torch,
+    scaled_softmax_forward_torch,
+    scaled_softmax_backward_torch,
+    scaled_masked_softmax_forward_torch,
+    scaled_masked_softmax_backward_torch,
     scaled_upper_triang_masked_softmax_forward_torch,
     scaled_upper_triang_masked_softmax_backward_torch,
     scaled_aligned_causal_masked_softmax_forward_torch,
     scaled_aligned_causal_masked_softmax_backward_torch,
-    dropout_fwd_torch, dropout_bwd_torch,
-    multi_tensor_scale_torch, multi_tensor_l2norm_torch,
-    multi_tensor_adam_torch, multi_tensor_adam_param_remainder_torch,
+    dropout_fwd_torch,
+    dropout_bwd_torch,
+    multi_tensor_scale_torch,
+    multi_tensor_l2norm_torch,
+    multi_tensor_adam_torch,
+    multi_tensor_adam_param_remainder_torch,
     multi_tensor_sgd_torch,
 )
 
@@ -43,6 +69,7 @@ def is_available(self) -> bool:
     def get_attention_backend(self, _attention_params=None):
         from packaging.version import Version as PkgVersion
         from ...logger_manager import get_logger
+
         logger = get_logger()
 
         # Read environment variables to determine which backends to enable
@@ -98,10 +125,28 @@ def generic_gemm(
         beta: Optional[float] = None,
     ) -> List[Any]:
         return general_gemm_torch(
-            A, transA, B, transB, D, quantizer, output_dtype,
-            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
-            accumulate, use_split_accumulator, comm_overlap, comm_type,
-            extra_output, bulk_overlap, alpha, beta
+            A,
+            transA,
+            B,
+            transB,
+            D,
+            quantizer,
+            output_dtype,
+            bias,
+            bias_type,
+            gelu,
+            gelu_in,
+            grad,
+            workspace,
+            workspace_size,
+            accumulate,
+            use_split_accumulator,
+            comm_overlap,
+            comm_type,
+            extra_output,
+            bulk_overlap,
+            alpha,
+            beta,
         )
 
     # GELU and variants
@@ -361,7 +406,9 @@ def scaled_upper_triang_masked_softmax_backward(
         softmax_results_: torch.Tensor,
         scale_factor: float,
     ) -> torch.Tensor:
-        return scaled_upper_triang_masked_softmax_backward_torch(output_grads_, softmax_results_, scale_factor)
+        return scaled_upper_triang_masked_softmax_backward_torch(
+            output_grads_, softmax_results_, scale_factor
+        )
 
     def scaled_aligned_causal_masked_softmax_forward(
         self,
@@ -376,7 +423,9 @@ def scaled_aligned_causal_masked_softmax_backward(
         softmax_results_: torch.Tensor,
         scale_factor: float,
     ) -> torch.Tensor:
-        return scaled_aligned_causal_masked_softmax_backward_torch(output_grad_, softmax_results_, scale_factor)
+        return scaled_aligned_causal_masked_softmax_backward_torch(
+            output_grad_, softmax_results_, scale_factor
+        )
 
     # Fused attention backend
     def get_fused_attn_backend(
@@ -457,7 +506,7 @@ def multi_tensor_unscale_l2norm(
         per_tensor: Optional[bool] = False,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         if noop_flag.item() != 0:
-            device = tensor_lists[0][0].device if tensor_lists and tensor_lists[0] else 'cpu'
+            device = tensor_lists[0][0].device if tensor_lists and tensor_lists[0] else "cpu"
             return torch.tensor(0.0, device=device), torch.tensor(0.0, device=device)
 
         # Multiply by inv_scale
@@ -482,8 +531,17 @@ def multi_tensor_adam(
         weight_decay: float,
     ) -> None:
         return multi_tensor_adam_torch(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon, step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
 
     def multi_tensor_adam_param_remainder(
@@ -501,8 +559,17 @@ def multi_tensor_adam_param_remainder(
         weight_decay: float,
     ) -> None:
         return multi_tensor_adam_param_remainder_torch(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon, step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
 
     def multi_tensor_sgd(
@@ -520,10 +587,20 @@ def multi_tensor_sgd(
         scale: float,
     ) -> None:
         return multi_tensor_sgd_torch(
-            chunk_size, noop_flag, tensor_lists,
-            wd, momentum, dampening, lr, nesterov, first_run, wd_after_momentum, scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            wd,
+            momentum,
+            dampening,
+            lr,
+            nesterov,
+            first_run,
+            wd_after_momentum,
+            scale,
         )
 
     def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionTorch
+
         return FlashAttentionTorch
diff --git a/transformer_engine/plugin/core/backends/reference/register_ops.py b/transformer_engine/plugin/core/backends/reference/register_ops.py
index 9ecbf10974..0151ec00f9 100644
--- a/transformer_engine/plugin/core/backends/reference/register_ops.py
+++ b/transformer_engine/plugin/core/backends/reference/register_ops.py
@@ -17,9 +17,11 @@
 
 def _bind_is_available(fn, is_available_fn):
     """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
         return fn(*args, **kwargs)
+
     wrapper._is_available = is_available_fn
     return wrapper
 
@@ -41,82 +43,449 @@ def register_builtins(registry) -> None:
 
     impls = [
         # Normalization
-        OpImpl(op_name="rmsnorm_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="rmsnorm_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="layernorm_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="layernorm_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.rmsnorm_fwd, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.rmsnorm_bwd, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="layernorm_fwd",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.layernorm_fwd, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="layernorm_bwd",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.layernorm_bwd, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # GEMM
-        OpImpl(op_name="generic_gemm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="generic_gemm",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.generic_gemm, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Activations - Forward
-        OpImpl(op_name="gelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.gelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="geglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.geglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="qgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.qgelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="qgeglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.qgeglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="relu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.relu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="reglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.reglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="srelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.srelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="sreglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.sreglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="silu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.silu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="swiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.swiglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="clamped_swiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="gelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.gelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="geglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.geglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="qgelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.qgelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="qgeglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.qgeglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="relu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.relu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="reglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.reglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="srelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.srelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="sreglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.sreglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="silu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.silu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="swiglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.swiglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="clamped_swiglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.clamped_swiglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Activations - Backward
-        OpImpl(op_name="dgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dgelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dgeglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dgeglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dqgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dqgelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dqgeglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="drelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.drelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dreglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dreglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dsrelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dsrelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dsreglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dsreglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dsilu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dsilu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dswiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dswiglu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="clamped_dswiglu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="dgelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dgelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dgeglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dgeglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dqgelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dqgelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dqgeglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dqgeglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="drelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.drelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dreglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dreglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dsrelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dsrelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dsreglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dsreglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dsilu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dsilu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dswiglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dswiglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="clamped_dswiglu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.clamped_dswiglu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Activations - Bias + Backward
-        OpImpl(op_name="dbias_dgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dbias_dsilu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dbias_drelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dbias_dqgelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dbias_dsrelu", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="dbias_dgelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dbias_dgelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dbias_dsilu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dbias_dsilu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dbias_drelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dbias_drelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dbias_dqgelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dbias_dqgelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dbias_dsrelu",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dbias_dsrelu, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Softmax
-        OpImpl(op_name="scaled_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="scaled_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="scaled_softmax_forward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_softmax_forward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="scaled_softmax_backward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_softmax_backward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_forward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_backward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_forward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_backward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_forward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_backward",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Fused attention backend getter
-        OpImpl(op_name="get_fused_attn_backend", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="get_fused_attn_backend",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.get_fused_attn_backend, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Dropout
-        OpImpl(op_name="dropout_fwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="dropout_bwd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="dropout_fwd",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dropout_fwd, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="dropout_bwd",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.dropout_bwd, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Library version getters
-        OpImpl(op_name="get_cublasLt_version", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="get_cudnn_version", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="get_num_cublas_streams", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="get_cublasLt_version",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.get_cublasLt_version, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="get_cudnn_version",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.get_cudnn_version, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="get_num_cublas_streams",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.get_num_cublas_streams, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Multi-tensor optimizer operations
-        OpImpl(op_name="multi_tensor_scale", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_l2norm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_adam", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor=None, priority=50),
-        OpImpl(op_name="multi_tensor_sgd", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="multi_tensor_scale",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.multi_tensor_scale, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="multi_tensor_l2norm",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="multi_tensor_unscale_l2norm",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.multi_tensor_adam, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_param_remainder",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail),
+            vendor=None,
+            priority=50,
+        ),
+        OpImpl(
+            op_name="multi_tensor_sgd",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.multi_tensor_sgd, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # FlashAttention class getter
-        OpImpl(op_name="get_flash_attention_class", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor=None, priority=50),
-
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor=None,
+            priority=50,
+        ),
         # Attention backend selection
-        OpImpl(op_name="get_attention_backend", impl_id="reference.torch", kind=BackendImplKind.REFERENCE, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor=None, priority=50),
+        OpImpl(
+            op_name="get_attention_backend",
+            impl_id="reference.torch",
+            kind=BackendImplKind.REFERENCE,
+            fn=_bind_is_available(backend.get_attention_backend, is_avail),
+            vendor=None,
+            priority=50,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/vendor/__init__.py b/transformer_engine/plugin/core/backends/vendor/__init__.py
index ce8eb210bb..f94a17b393 100644
--- a/transformer_engine/plugin/core/backends/vendor/__init__.py
+++ b/transformer_engine/plugin/core/backends/vendor/__init__.py
@@ -37,6 +37,7 @@
         _vendor_loading_errors.append(("cuda", type(e).__name__, str(e)))
         print(f"Error loading CUDA vendor backend: {type(e).__name__}: {e}")
         import traceback
+
         traceback.print_exc()
 else:
     print("CUDA vendor backend skipped (CUDA build was disabled at build time)")
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/__init__.py b/transformer_engine/plugin/core/backends/vendor/cuda/__init__.py
index 04b5335bea..8b8b610b6b 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/__init__.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/__init__.py
@@ -4,4 +4,4 @@
 
 from .cuda import CUDABackend
 
-__all__ = ["CUDABackend"]
\ No newline at end of file
+__all__ = ["CUDABackend"]
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 8be7dd5052..fc1f008f23 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -7,6 +7,7 @@
 import torch
 from ....ops import *
 
+
 def _load_cuda_libs():
     import ctypes
     import os
@@ -47,7 +48,7 @@ def try_load_lib(name, search_patterns):
 
         try:
             result = subprocess.check_output(f"ldconfig -p | grep 'lib{name}{ext}'", shell=True)
-            for line in result.decode().split('\n'):
+            for line in result.decode().split("\n"):
                 if f"lib{name}" in line and "=>" in line:
                     so_path = line.split(">")[1].strip()
                     if so_path:
@@ -65,7 +66,11 @@ def try_load_lib(name, search_patterns):
         try_load_lib("nvrtc", [f"libnvrtc{ext}*"])
         try_load_lib("curand", [f"libcurand{ext}*"])
 
-        te_path = Path(importlib.util.find_spec("transformer_engine").origin).parent.parent
+        te_path_override = os.environ.get("TE_LIB_PATH")
+        if te_path_override:
+            te_path = Path(te_path_override)
+        else:
+            te_path = Path(importlib.util.find_spec("transformer_engine").origin).parent.parent
         for search_dir in [te_path, te_path / "transformer_engine"]:
             if search_dir.exists():
                 matches = list(search_dir.glob(f"libtransformer_engine{ext}*"))
@@ -77,21 +82,26 @@ def try_load_lib(name, search_patterns):
         print(f"[CUDA] Failed to load CUDA libs: {e}")
         return False
 
+
 _cuda_libs_loaded = False
 
+
 def _ensure_cuda_libs():
     global _cuda_libs_loaded
     if not _cuda_libs_loaded:
         _cuda_libs_loaded = _load_cuda_libs()
     return _cuda_libs_loaded
 
+
 def _check_cuda_available() -> bool:
     if not torch.cuda.is_available():
         return False
 
     import os
+
     try:
         from ...._build_config import SKIP_CUDA_BUILD
+
         if SKIP_CUDA_BUILD:
             print("[CUDA] Disabled: CUDA was skipped at build time")
             return False
@@ -104,16 +114,20 @@ def _check_cuda_available() -> bool:
         if not _ensure_cuda_libs():
             return False
         import transformer_engine_torch_nv
+
         return True
     except (ImportError, OSError) as e:
         print(f"[CUDA] Import failed: {e}")
         return False
 
+
 def _get_tex():
     _ensure_cuda_libs()
     import transformer_engine_torch_nv
+
     return transformer_engine_torch_nv
 
+
 class CUDABackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -140,9 +154,10 @@ def get_attention_backend(self, attention_params=None):
         """
         # Import the original get_attention_backend function
         from transformer_engine.pytorch.attention.dot_product_attention import utils as dpa_utils
+
         return dpa_utils._original_get_attention_backend(attention_params)
 
-##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
+    ##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -196,49 +211,78 @@ def generic_gemm(
         beta: Optional[float] = None,
     ) -> List[Any]:
         tex = self._get_tex()
-        
+
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
         output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
-            A, transA, B, transB, D, quantizer, output_dtype,
-            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
-            accumulate, use_split_accumulator, comm_overlap, comm_type,
-            extra_output, bulk_overlap, alpha, beta
+            A,
+            transA,
+            B,
+            transB,
+            D,
+            quantizer,
+            output_dtype,
+            bias,
+            bias_type,
+            gelu,
+            gelu_in,
+            grad,
+            workspace,
+            workspace_size,
+            accumulate,
+            use_split_accumulator,
+            comm_overlap,
+            comm_type,
+            extra_output,
+            bulk_overlap,
+            alpha,
+            beta,
         )
+
     # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
+
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
+
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
+
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
+
     # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
+
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
+
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
+
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
+
     # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
+
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
+
     def clamped_swiglu(
         self,
         input: torch.Tensor,
@@ -248,39 +292,50 @@ def clamped_swiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
     # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
+
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
+
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
+
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
+
     # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
+
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
+
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
+
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
+
     # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
+
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
+
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -291,23 +346,33 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
     # DBias + DAct fusions #
     def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
     def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
     def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dqgelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dsrelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-    # Permutation functions                                                                                                                                                                                                                                                                                             
+
+    # Permutation functions
     def moe_permute_fwd(
         self,
         input: torch.Tensor,
@@ -319,7 +384,10 @@ def moe_permute_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+        return tex.moe_permute_fwd(
+            input, dtype, indices, num_out_tokens, workspace, max_expanded_token_num
+        )
+
     def moe_permute_bwd(
         self,
         input: torch.Tensor,
@@ -331,7 +399,8 @@ def moe_permute_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_permute_bwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_fwd(
         self,
         input: torch.Tensor,
@@ -343,7 +412,8 @@ def moe_unpermute_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_unpermute_fwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_bwd(
         self,
         input_bwd: torch.Tensor,
@@ -354,7 +424,8 @@ def moe_unpermute_bwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+        return tex.moe_unpermute_bwd(input_bwd, input_fwd, dtype, row_id_map, prob)
+
     # Softmax functions
     def scaled_softmax_forward(
         self,
@@ -363,6 +434,7 @@ def scaled_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_forward(input, scale)
+
     def scaled_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -371,6 +443,7 @@ def scaled_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -379,6 +452,7 @@ def scaled_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+
     def scaled_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -387,6 +461,7 @@ def scaled_masked_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_upper_triang_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -394,6 +469,7 @@ def scaled_upper_triang_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+
     def scaled_upper_triang_masked_softmax_backward(
         self,
         output_grads_: torch.Tensor,
@@ -404,6 +480,7 @@ def scaled_upper_triang_masked_softmax_backward(
         return tex.scaled_upper_triang_masked_softmax_backward(
             output_grads_, softmax_results_, scale_factor
         )
+
     def scaled_aligned_causal_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -411,6 +488,7 @@ def scaled_aligned_causal_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+
     def scaled_aligned_causal_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -421,6 +499,7 @@ def scaled_aligned_causal_masked_softmax_backward(
         return tex.scaled_aligned_causal_masked_softmax_backward(
             output_grad_, softmax_results_, scale_factor
         )
+
     # Other granular functions
     def layernorm_fwd(
         self,
@@ -439,6 +518,7 @@ def layernorm_fwd(
         return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def layernorm_bwd(
         self,
         dz: torch.Tensor,
@@ -450,9 +530,8 @@ def layernorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.layernorm_bwd(
-            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
-        )
+        return tex.layernorm_bwd(dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_fwd(
         self,
         input: Any,
@@ -469,6 +548,7 @@ def rmsnorm_fwd(
         return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def rmsnorm_bwd(
         self,
         dz: torch.Tensor,
@@ -480,6 +560,7 @@ def rmsnorm_bwd(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_bwd_add(
         self,
         dz: torch.Tensor,
@@ -500,6 +581,7 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -508,6 +590,7 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
+
     def te_general_grouped_gemm(
         self,
         A: List[Any],
@@ -532,10 +615,25 @@ def te_general_grouped_gemm(
         D_type = tex.DType(int(D_type)) if D_type is not None else None
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         return tex.te_general_grouped_gemm(
-            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
-            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
-            accumulate, use_split_accumulator, math_sm_count
+            A,
+            transa,
+            B,
+            transb,
+            D,
+            D_type,
+            m_splits,
+            bias,
+            bias_type,
+            single_output,
+            pre_gelu_out,
+            grad,
+            workspace,
+            workspaceSizes,
+            accumulate,
+            use_split_accumulator,
+            math_sm_count,
         )
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -545,6 +643,7 @@ def fp8_transpose(
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
         return tex.fp8_transpose(input, dtype, out)
+
     def swap_first_dims(
         self,
         tensor: torch.Tensor,
@@ -552,6 +651,7 @@ def swap_first_dims(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -578,14 +678,31 @@ def get_fused_attn_backend(
         kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         result = tex.get_fused_attn_backend(
-            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
-            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
-            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
-            head_dim_v, window_size_left, window_size_right, return_max_logit
+            is_training,
+            q_dtype,
+            kv_dtype,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            p_dropout,
+            num_attn_heads,
+            num_gqa_groups,
+            max_seqlen_q,
+            max_seqlen_kv,
+            head_dim_qk,
+            head_dim_v,
+            window_size_left,
+            window_size_right,
+            return_max_logit,
         )
         return NVTE_Fused_Attn_Backend(result)
 
@@ -596,6 +713,7 @@ def compute_amax(
     ) -> None:
         tex = self._get_tex()
         return tex.compute_amax(input, amax)
+
     def fused_amax_and_scale_update_after_reduction(
         self,
         amax_reduction_buffer: torch.Tensor,
@@ -608,9 +726,9 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales,
-            amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
         )
+
     def fp8_block_scaling_compute_partial_amax(
         self,
         tensor: torch.Tensor,
@@ -624,6 +742,7 @@ def fp8_block_scaling_compute_partial_amax(
         return tex.fp8_block_scaling_compute_partial_amax(
             tensor, amax, h, w, start_offset, block_len
         )
+
     def fp8_block_scaling_partial_cast(
         self,
         inp: torch.Tensor,
@@ -640,6 +759,7 @@ def fp8_block_scaling_partial_cast(
         return tex.fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -648,9 +768,8 @@ def fused_multi_row_padding(
         padded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_padding(
-            input, output, input_row_list, padded_input_row_list
-        )
+        return tex.fused_multi_row_padding(input, output, input_row_list, padded_input_row_list)
+
     def fused_multi_row_unpadding(
         self,
         input: torch.Tensor,
@@ -659,9 +778,7 @@ def fused_multi_row_unpadding(
         unpadded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(
-            input, output, input_row_list, unpadded_input_row_list
-        )
+        return tex.fused_multi_row_unpadding(input, output, input_row_list, unpadded_input_row_list)
 
     # attention kernels
     def fa_prepare_fwd(
@@ -670,6 +787,7 @@ def fa_prepare_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_fwd(qkvi)
+
     def fa_prepare_bwd(
         self,
         q: torch.Tensor,
@@ -678,6 +796,7 @@ def fa_prepare_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_bwd(q, k, v)
+
     def fused_attn_fwd(
         self,
         max_seqlen_q: int,
@@ -713,8 +832,12 @@ def fused_attn_fwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         return tex.fused_attn_fwd(
             max_seqlen_q,
@@ -744,8 +867,9 @@ def fused_attn_fwd(
             SoftmaxOffset,
             rng_gen,
             rng_elts_per_thread,
-            return_max_logit
+            return_max_logit,
         )
+
     def fused_attn_bwd(
         self,
         max_seqlen_q: int,
@@ -779,8 +903,12 @@ def fused_attn_bwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
         dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
 
         return tex.fused_attn_bwd(
@@ -809,8 +937,9 @@ def fused_attn_bwd(
             cu_seqlens_kv_padded,
             s_quantizer,
             dp_quantizer,
-            dqkv_quantizer
+            dqkv_quantizer,
         )
+
     def copy_to_kv_cache(
         self,
         new_k: torch.Tensor,
@@ -842,8 +971,9 @@ def copy_to_kv_cache(
             max_ctx_len,
             max_seq_len,
             max_pages_per_seq,
-            is_non_paged
+            is_non_paged,
         )
+
     def convert_thd_to_bshd(
         self,
         tensor: torch.Tensor,
@@ -853,6 +983,7 @@ def convert_thd_to_bshd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+
     def convert_bshd_to_thd(
         self,
         tensor: torch.Tensor,
@@ -877,9 +1008,9 @@ def fused_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
@@ -893,9 +1024,9 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_qkv_rope_forward(
         self,
         qkv_input: torch.Tensor,
@@ -911,10 +1042,17 @@ def fused_qkv_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_forward(
-            qkv_input, q_freqs, k_freqs, start_positions,
-            qkv_split_arg_list, qkv_format, interleaved,
-            cp_size, cp_rank
+            qkv_input,
+            q_freqs,
+            k_freqs,
+            start_positions,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
+
     def fused_qkv_rope_backward(
         self,
         q_grad_out: torch.Tensor,
@@ -931,9 +1069,16 @@ def fused_qkv_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_backward(
-            q_grad_out, k_grad_out, v_grad_out,
-            q_freqs, k_freqs, qkv_split_arg_list,
-            qkv_format, interleaved, cp_size, cp_rank
+            q_grad_out,
+            k_grad_out,
+            v_grad_out,
+            q_freqs,
+            k_freqs,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
 
     # fused router
@@ -959,6 +1104,7 @@ def fused_topk_with_score_function_fwd(
             score_function,
             expert_bias,
         )
+
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -983,6 +1129,7 @@ def fused_topk_with_score_function_bwd(
             scaling_factor,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
@@ -995,6 +1142,7 @@ def fused_score_for_moe_aux_loss_fwd(
             topk,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -1013,6 +1161,7 @@ def fused_score_for_moe_aux_loss_bwd(
             topk,
             score_function,
         )
+
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -1035,6 +1184,7 @@ def fused_moe_aux_loss_fwd(
             topk,
             coeff,
         )
+
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -1044,7 +1194,9 @@ def fused_moe_aux_loss_bwd(
         grad_aux_loss: torch.Tensor,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
 
     # Dropout
     def dropout_fwd(
@@ -1055,6 +1207,7 @@ def dropout_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
+
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
@@ -1069,9 +1222,11 @@ def dropout_bwd(
     def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cublasLt_version()
+
     def get_cudnn_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cudnn_version()
+
     def get_num_cublas_streams(self) -> int:
         tex = self._get_tex()
         return tex.get_num_cublas_streams()
@@ -1085,6 +1240,7 @@ def thd_read_half_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+
     def thd_second_half_lse_correction(
         self,
         lse: torch.Tensor,
@@ -1093,9 +1249,8 @@ def thd_second_half_lse_correction(
         lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(
-            lse, lse_per_step, cu_seqlens, lse_packed
-        )
+        return tex.thd_second_half_lse_correction(lse, lse_per_step, cu_seqlens, lse_packed)
+
     def thd_read_second_half_lse(
         self,
         lse: torch.Tensor,
@@ -1104,9 +1259,8 @@ def thd_read_second_half_lse(
         second_half_lse_seqlen: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(
-            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
-        )
+        return tex.thd_read_second_half_lse(lse, cu_seqlens, lse_packed, second_half_lse_seqlen)
+
     def thd_out_correction(
         self,
         out: torch.Tensor,
@@ -1119,9 +1273,9 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step,
-            cu_seqlens, only_second_half, lse_packed
+            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
         )
+
     def thd_grad_correction(
         self,
         grad: torch.Tensor,
@@ -1131,10 +1285,8 @@ def thd_grad_correction(
         second_half: str,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(
-            grad, grad_per_step, cu_seqlens,
-            first_half, second_half
-        )
+        return tex.thd_grad_correction(grad, grad_per_step, cu_seqlens, first_half, second_half)
+
     def thd_get_partitioned_indices(
         self,
         cu_seqlens: torch.Tensor,
@@ -1143,9 +1295,7 @@ def thd_get_partitioned_indices(
         rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(
-            cu_seqlens, total_tokens, world_size, rank
-        )
+        return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, world_size, rank)
 
     # nvshmem functions
     def init_nvshmem_backend(
@@ -1154,6 +1304,7 @@ def init_nvshmem_backend(
     ) -> None:
         tex = self._get_tex()
         return tex.init_nvshmem_backend(process_group)
+
     def create_nvshmem_tensor(
         self,
         shape: List[int],
@@ -1161,6 +1312,7 @@ def create_nvshmem_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.create_nvshmem_tensor(shape, dtype)
+
     def nvshmem_send_on_current_stream(
         self,
         src: torch.Tensor,
@@ -1170,6 +1322,7 @@ def nvshmem_send_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+
     def nvshmem_wait_on_current_stream(
         self,
         signal: torch.Tensor,
@@ -1177,6 +1330,7 @@ def nvshmem_wait_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
+
     def nvshmem_finalize(self) -> None:
         tex = self._get_tex()
         return tex.nvshmem_finalize()
@@ -1191,6 +1345,7 @@ def multi_tensor_scale(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1200,6 +1355,7 @@ def multi_tensor_l2norm(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
@@ -1212,6 +1368,7 @@ def multi_tensor_unscale_l2norm(
         return tex.multi_tensor_unscale_l2norm(
             chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
         )
+
     def multi_tensor_adam(
         self,
         chunk_size: int,
@@ -1228,10 +1385,19 @@ def multi_tensor_adam(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_param_remainder(
         self,
         chunk_size: int,
@@ -1248,10 +1414,19 @@ def multi_tensor_adam_param_remainder(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_param_remainder(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_fp8(
         self,
         chunk_size: int,
@@ -1270,11 +1445,20 @@ def multi_tensor_adam_fp8(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.multi_tensor_adam_fp8(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            fp8_dtype
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            fp8_dtype,
         )
+
     def multi_tensor_adam_capturable(
         self,
         chunk_size: int,
@@ -1292,11 +1476,20 @@ def multi_tensor_adam_capturable(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_adam_capturable_master(
         self,
         chunk_size: int,
@@ -1314,11 +1507,20 @@ def multi_tensor_adam_capturable_master(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable_master(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_sgd(
         self,
         chunk_size: int,
@@ -1335,11 +1537,19 @@ def multi_tensor_sgd(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_sgd(
-            chunk_size, noop_flag, tensor_lists,
-            wd, momentum, dampening,
-            lr, nesterov, first_run,
-            wd_after_momentum, scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            wd,
+            momentum,
+            dampening,
+            lr,
+            nesterov,
+            first_run,
+            wd_after_momentum,
+            scale,
         )
+
     def multi_tensor_compute_scale_and_scale_inv(
         self,
         chunk_size: int,
@@ -1351,8 +1561,7 @@ def multi_tensor_compute_scale_and_scale_inv(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_compute_scale_and_scale_inv(
-            chunk_size, noop_flag, tensor_lists,
-            max_fp8, force_pow_2_scales, epsilon
+            chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
     # Comm+GEMM Overlap
@@ -1363,15 +1572,20 @@ def bulk_overlap_ag_with_external_gemm(
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
-        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+        return tex.bulk_overlap_ag_with_external_gemm(
+            allgather_communicator, send_stream, recv_stream
+        )
 
-############## class func #################################
+    ############## class func #################################
     def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionCUDA
+
         return FlashAttentionCUDA
+
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
+
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
@@ -1379,6 +1593,7 @@ def create_comm_overlap_helper(
     ) -> "CommOverlapHelper":
         tex = self._get_tex()
         return tex.CommOverlapHelper(world_group, intra_node_group)
+
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1397,11 +1612,21 @@ def create_comm_overlap(
     ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
-            buffer_shape, buffer_dtype, helper, tp_size,
-            num_splits, num_max_streams, comm_cga_size,
-            gemm_priority, comm_priority, num_comm_sm,
-            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            num_splits,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            rs_overlap_first_gemm,
         )
+
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1421,7 +1646,18 @@ def create_comm_overlap_p2p(
     ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
-            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
-            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
-            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            comm_type,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            use_ce,
+            aggregate,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
index 95b0aca37c..4137ce1b4c 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
@@ -31,12 +31,12 @@ def __init__(
 
         # Store initialization parameters for lazy loading
         self._init_params = {
-            'softmax_scale': softmax_scale,
-            'attention_dropout': attention_dropout,
-            'attention_dropout_ctx': attention_dropout_ctx or nullcontext,
-            'attention_type': attention_type,
-            'layer_number': layer_number,
-            'deterministic': deterministic,
+            "softmax_scale": softmax_scale,
+            "attention_dropout": attention_dropout,
+            "attention_dropout_ctx": attention_dropout_ctx or nullcontext,
+            "attention_type": attention_type,
+            "layer_number": layer_number,
+            "deterministic": deterministic,
         }
         self._native_flash_attn = None
 
@@ -53,7 +53,9 @@ def _ensure_native_flash_attn(self):
             )
 
             if FlashAttentionNative is None:
-                raise RuntimeError("FlashAttention class is None - flash-attn may not be installed correctly")
+                raise RuntimeError(
+                    "FlashAttention class is None - flash-attn may not be installed correctly"
+                )
 
             self._native_flash_attn = FlashAttentionNative(**self._init_params)
 
@@ -64,8 +66,7 @@ def _ensure_native_flash_attn(self):
             )
         except Exception as e:
             raise RuntimeError(
-                f"Failed to initialize native FlashAttention: {e}. "
-                f"Init params: {self._init_params}"
+                f"Failed to initialize native FlashAttention: {e}. Init params: {self._init_params}"
             )
 
     @property
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
index 3beff6331c..ca65c0d384 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
@@ -17,9 +17,11 @@
 
 def _bind_is_available(fn, is_available_fn):
     """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
         return fn(*args, **kwargs)
+
     wrapper._is_available = is_available_fn
     return wrapper
 
@@ -46,160 +48,908 @@ def register_builtins(registry) -> None:
 
     impls = [
         # Normalization
-        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="layernorm_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="layernorm_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd_add",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # GEMM
-        OpImpl(op_name="generic_gemm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="generic_gemm",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.generic_gemm, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Quantization
-        OpImpl(op_name="quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dequantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="bgrad_quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="split_quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="quantize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.quantize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dequantize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dequantize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_quantize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_quantize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="split_quantize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.split_quantize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Activations - Forward
-        OpImpl(op_name="gelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="geglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="qgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="qgeglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="relu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="reglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="srelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="sreglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="silu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="swiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="clamped_swiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="gelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.gelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="geglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.geglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgeglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgeglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="relu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.relu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="reglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.reglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="srelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.srelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="sreglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.sreglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="silu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.silu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swiglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swiglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_swiglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_swiglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Activations - Backward
-        OpImpl(op_name="dgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dgeglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dqgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dqgeglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="drelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dreglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dsrelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dsreglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dsilu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dswiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="dgelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dgeglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgeglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgeglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgeglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="drelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.drelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dreglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dreglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsrelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsrelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsreglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsreglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsilu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsilu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dswiglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dswiglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_dswiglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_dswiglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Activations - Bias + Backward
-        OpImpl(op_name="dbias_dgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dbias_dsilu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dbias_drelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="dbias_dgelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dgelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsilu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsilu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_drelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_drelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dqgelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dqgelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsrelu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsrelu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Softmax
-        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="scaled_softmax_forward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_forward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_softmax_backward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_backward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_forward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_backward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_forward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_backward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_forward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_backward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # MOE operations
-        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="moe_permute_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_permute_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Fused attention
-        OpImpl(op_name="get_fused_attn_backend", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_attn_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_attn_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="get_fused_attn_backend",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_fused_attn_backend, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # KV cache
-        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="copy_to_kv_cache",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.copy_to_kv_cache, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Tensor format conversions
-        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="convert_thd_to_bshd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_bshd_to_thd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # RoPE (Rotary Position Embedding)
-        OpImpl(op_name="fused_rope_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_rope_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="fused_rope_forward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_forward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_rope_backward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_backward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_forward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_backward",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # TopK and MOE aux loss
-        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="fused_topk_with_score_function_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_topk_with_score_function_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Dropout
-        OpImpl(op_name="dropout_fwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="dropout_bwd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="dropout_fwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_fwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dropout_bwd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_bwd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # FP8 operations
-        OpImpl(op_name="fp8_transpose", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="swap_first_dims", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="compute_amax", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="fp8_transpose",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_transpose, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swap_first_dims",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swap_first_dims, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="compute_amax",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.compute_amax, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_amax_and_scale_update_after_reduction",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_compute_partial_amax",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_partial_cast",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Padding operations
-        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="fused_multi_row_padding",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_padding, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_multi_row_unpadding",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Library version getters
-        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="get_cudnn_version", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="get_cublasLt_version",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cublasLt_version, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_cudnn_version",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cudnn_version, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_num_cublas_streams",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_num_cublas_streams, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # THD (Tensor, Hidden, Dimension) operations
-        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="thd_out_correction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="thd_grad_correction", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="thd_read_half_tensor",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_half_tensor, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_second_half_lse_correction",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_read_second_half_lse",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_out_correction",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_out_correction, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_grad_correction",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_grad_correction, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_get_partitioned_indices",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # NVSHMEM operations
-        OpImpl(op_name="init_nvshmem_backend", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="create_nvshmem_tensor", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="nvshmem_finalize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="init_nvshmem_backend",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.init_nvshmem_backend, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_nvshmem_tensor",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_send_on_current_stream",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_wait_on_current_stream",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_finalize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_finalize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Multi-tensor operations
-        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="multi_tensor_quantize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_quantize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_scale",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_l2norm",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_unscale_l2norm",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_param_remainder",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_fp8",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable_master",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_sgd",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_sgd, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_and_scale_inv",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Communication overlap operations
-        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="create_comm_overlap", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="CUDA", priority=100),
-        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="bulk_overlap_ag_with_external_gemm",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_fp8_tensor_meta",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_helper",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_p2p",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # FlashAttention class getter
-        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="CUDA", priority=100),
-
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Attention backend selection
-        OpImpl(op_name="get_attention_backend", impl_id="vendor.cuda", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="CUDA", priority=100),
+        OpImpl(
+            op_name="get_attention_backend",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_attention_backend, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/__init__.py b/transformer_engine/plugin/core/backends/vendor/hygon/__init__.py
index 331c70c649..a48a5c650f 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/__init__.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/__init__.py
@@ -4,4 +4,4 @@
 
 from .hygon import HygonBackend
 
-__all__ = ["HygonBackend"]
\ No newline at end of file
+__all__ = ["HygonBackend"]
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
index 831a83181c..cad4a13f35 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
@@ -9,6 +9,7 @@
 
 from transformer_engine.plugin.core.ops import FlashAttentionBase
 
+
 class FlashAttentionHYGON(FlashAttentionBase):
     def __init__(
         self,
@@ -30,12 +31,12 @@ def __init__(
 
         # Store initialization parameters for lazy loading
         self._init_params = {
-            'softmax_scale': softmax_scale,
-            'attention_dropout': attention_dropout,
-            'attention_dropout_ctx': attention_dropout_ctx or nullcontext,
-            'attention_type': attention_type,
-            'layer_number': layer_number,
-            'deterministic': deterministic,
+            "softmax_scale": softmax_scale,
+            "attention_dropout": attention_dropout,
+            "attention_dropout_ctx": attention_dropout_ctx or nullcontext,
+            "attention_type": attention_type,
+            "layer_number": layer_number,
+            "deterministic": deterministic,
         }
         self._native_flash_attn = None
 
@@ -52,7 +53,9 @@ def _ensure_native_flash_attn(self):
             )
 
             if FlashAttentionNative is None:
-                raise RuntimeError("FlashAttention class is None - flash-attn may not be installed correctly")
+                raise RuntimeError(
+                    "FlashAttention class is None - flash-attn may not be installed correctly"
+                )
 
             self._native_flash_attn = FlashAttentionNative(**self._init_params)
 
@@ -63,8 +66,7 @@ def _ensure_native_flash_attn(self):
             )
         except Exception as e:
             raise RuntimeError(
-                f"Failed to initialize native FlashAttention: {e}. "
-                f"Init params: {self._init_params}"
+                f"Failed to initialize native FlashAttention: {e}. Init params: {self._init_params}"
             )
 
     @property
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index c87aef8430..2231ad59a4 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -8,15 +8,18 @@
 import torch
 from ....ops import *
 
+
 def _load_hygon_libs():
     import ctypes
     from pathlib import Path
     import importlib
     import platform
+
     common_prefix = "libtransformer_engine"
     csrc_prefix = "transformer_engine_torch_hygon"
     common_files = []
     csrc_files = []
+
     def _get_sys_extension() -> str:
         system = platform.system()
         if system == "Linux":
@@ -26,6 +29,7 @@ def _get_sys_extension() -> str:
         if system == "Windows":
             return ".dll"
         raise RuntimeError(f"Unsupported operating system ({system})")
+
     try:
         if bool(int(os.environ.get("TE_FL_SKIP_HYGON", "0"))):
             return False
@@ -53,29 +57,36 @@ def _get_sys_extension() -> str:
         print(f"[HYGON] Failed to load hygon libs: {e}")
         return False
 
+
 _hygon_libs_loaded = False
 
+
 def _ensure_hygon_libs():
     global _hygon_libs_loaded
     if not _hygon_libs_loaded:
         _hygon_libs_loaded = _load_hygon_libs()
     return _hygon_libs_loaded
 
+
 def _check_hygon_available() -> bool:
     try:
         if not _ensure_hygon_libs():
             return False
         import transformer_engine_torch_hygon
+
         return True
     except (ImportError, OSError) as e:
         print(f"[HYGON] Import failed: {e}")
         return False
 
+
 def _get_tex():
     _ensure_hygon_libs()
     import transformer_engine_torch_hygon
+
     return transformer_engine_torch_hygon
 
+
 class HygonBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -95,6 +106,7 @@ def is_available(self) -> bool:
     def get_attention_backend(self, attention_params=None):
         from packaging.version import Version as PkgVersion
         from ....logger_manager import get_logger
+
         logger = get_logger()
 
         # Read environment variables to determine which backends to enable
@@ -124,7 +136,7 @@ def get_attention_backend(self, attention_params=None):
             available_backends,
         )
 
-##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
+    ##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -178,49 +190,78 @@ def generic_gemm(
         beta: Optional[float] = None,
     ) -> List[Any]:
         tex = self._get_tex()
-        
+
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
         output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
-            A, transA, B, transB, D, quantizer, output_dtype,
-            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
-            accumulate, use_split_accumulator, comm_overlap, comm_type,
-            extra_output, bulk_overlap, alpha, beta
+            A,
+            transA,
+            B,
+            transB,
+            D,
+            quantizer,
+            output_dtype,
+            bias,
+            bias_type,
+            gelu,
+            gelu_in,
+            grad,
+            workspace,
+            workspace_size,
+            accumulate,
+            use_split_accumulator,
+            comm_overlap,
+            comm_type,
+            extra_output,
+            bulk_overlap,
+            alpha,
+            beta,
         )
+
     # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
+
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
+
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
+
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
+
     # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
+
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
+
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
+
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
+
     # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
+
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
+
     def clamped_swiglu(
         self,
         input: torch.Tensor,
@@ -230,39 +271,50 @@ def clamped_swiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
     # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
+
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
+
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
+
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
+
     # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
+
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
+
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
+
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
+
     # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
+
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
+
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -273,23 +325,33 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
     # DBias + DAct fusions #
     def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
     def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
     def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dqgelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dsrelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-    # Permutation functions                                                                                                                                                                                                                                                                                             
+
+    # Permutation functions
     def moe_permute_fwd(
         self,
         input: torch.Tensor,
@@ -301,7 +363,10 @@ def moe_permute_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+        return tex.moe_permute_fwd(
+            input, dtype, indices, num_out_tokens, workspace, max_expanded_token_num
+        )
+
     def moe_permute_bwd(
         self,
         input: torch.Tensor,
@@ -313,7 +378,8 @@ def moe_permute_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_permute_bwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_fwd(
         self,
         input: torch.Tensor,
@@ -325,7 +391,8 @@ def moe_unpermute_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_unpermute_fwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_bwd(
         self,
         input_bwd: torch.Tensor,
@@ -336,7 +403,8 @@ def moe_unpermute_bwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+        return tex.moe_unpermute_bwd(input_bwd, input_fwd, dtype, row_id_map, prob)
+
     # Softmax functions
     def scaled_softmax_forward(
         self,
@@ -345,6 +413,7 @@ def scaled_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_forward(input, scale)
+
     def scaled_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -353,6 +422,7 @@ def scaled_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -361,6 +431,7 @@ def scaled_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+
     def scaled_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -369,6 +440,7 @@ def scaled_masked_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_upper_triang_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -376,6 +448,7 @@ def scaled_upper_triang_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+
     def scaled_upper_triang_masked_softmax_backward(
         self,
         output_grads_: torch.Tensor,
@@ -386,6 +459,7 @@ def scaled_upper_triang_masked_softmax_backward(
         return tex.scaled_upper_triang_masked_softmax_backward(
             output_grads_, softmax_results_, scale_factor
         )
+
     def scaled_aligned_causal_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -393,6 +467,7 @@ def scaled_aligned_causal_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+
     def scaled_aligned_causal_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -403,6 +478,7 @@ def scaled_aligned_causal_masked_softmax_backward(
         return tex.scaled_aligned_causal_masked_softmax_backward(
             output_grad_, softmax_results_, scale_factor
         )
+
     # Other granular functions
     def layernorm_fwd(
         self,
@@ -421,6 +497,7 @@ def layernorm_fwd(
         return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def layernorm_bwd(
         self,
         dz: torch.Tensor,
@@ -432,9 +509,8 @@ def layernorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.layernorm_bwd(
-            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
-        )
+        return tex.layernorm_bwd(dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_fwd(
         self,
         input: Any,
@@ -451,6 +527,7 @@ def rmsnorm_fwd(
         return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def rmsnorm_bwd(
         self,
         dz: torch.Tensor,
@@ -462,6 +539,7 @@ def rmsnorm_bwd(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_bwd_add(
         self,
         dz: torch.Tensor,
@@ -482,6 +560,7 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -490,6 +569,7 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
+
     def te_general_grouped_gemm(
         self,
         A: List[Any],
@@ -514,10 +594,25 @@ def te_general_grouped_gemm(
         D_type = tex.DType(int(D_type)) if D_type is not None else None
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         return tex.te_general_grouped_gemm(
-            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
-            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
-            accumulate, use_split_accumulator, math_sm_count
+            A,
+            transa,
+            B,
+            transb,
+            D,
+            D_type,
+            m_splits,
+            bias,
+            bias_type,
+            single_output,
+            pre_gelu_out,
+            grad,
+            workspace,
+            workspaceSizes,
+            accumulate,
+            use_split_accumulator,
+            math_sm_count,
         )
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -527,6 +622,7 @@ def fp8_transpose(
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
         return tex.fp8_transpose(input, dtype, out)
+
     def swap_first_dims(
         self,
         tensor: torch.Tensor,
@@ -534,6 +630,7 @@ def swap_first_dims(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -560,14 +657,31 @@ def get_fused_attn_backend(
         kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         result = tex.get_fused_attn_backend(
-            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
-            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
-            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
-            head_dim_v, window_size_left, window_size_right, return_max_logit
+            is_training,
+            q_dtype,
+            kv_dtype,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            p_dropout,
+            num_attn_heads,
+            num_gqa_groups,
+            max_seqlen_q,
+            max_seqlen_kv,
+            head_dim_qk,
+            head_dim_v,
+            window_size_left,
+            window_size_right,
+            return_max_logit,
         )
         return NVTE_Fused_Attn_Backend(result)
 
@@ -578,6 +692,7 @@ def compute_amax(
     ) -> None:
         tex = self._get_tex()
         return tex.compute_amax(input, amax)
+
     def fused_amax_and_scale_update_after_reduction(
         self,
         amax_reduction_buffer: torch.Tensor,
@@ -590,9 +705,9 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales,
-            amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
         )
+
     def fp8_block_scaling_compute_partial_amax(
         self,
         tensor: torch.Tensor,
@@ -606,6 +721,7 @@ def fp8_block_scaling_compute_partial_amax(
         return tex.fp8_block_scaling_compute_partial_amax(
             tensor, amax, h, w, start_offset, block_len
         )
+
     def fp8_block_scaling_partial_cast(
         self,
         inp: torch.Tensor,
@@ -622,6 +738,7 @@ def fp8_block_scaling_partial_cast(
         return tex.fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -630,9 +747,8 @@ def fused_multi_row_padding(
         padded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_padding(
-            input, output, input_row_list, padded_input_row_list
-        )
+        return tex.fused_multi_row_padding(input, output, input_row_list, padded_input_row_list)
+
     def fused_multi_row_unpadding(
         self,
         input: torch.Tensor,
@@ -641,9 +757,7 @@ def fused_multi_row_unpadding(
         unpadded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(
-            input, output, input_row_list, unpadded_input_row_list
-        )
+        return tex.fused_multi_row_unpadding(input, output, input_row_list, unpadded_input_row_list)
 
     # attention kernels
     def fa_prepare_fwd(
@@ -652,6 +766,7 @@ def fa_prepare_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_fwd(qkvi)
+
     def fa_prepare_bwd(
         self,
         q: torch.Tensor,
@@ -660,6 +775,7 @@ def fa_prepare_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_bwd(q, k, v)
+
     def fused_attn_fwd(
         self,
         max_seqlen_q: int,
@@ -695,8 +811,12 @@ def fused_attn_fwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         return tex.fused_attn_fwd(
             max_seqlen_q,
@@ -726,8 +846,9 @@ def fused_attn_fwd(
             SoftmaxOffset,
             rng_gen,
             rng_elts_per_thread,
-            return_max_logit
+            return_max_logit,
         )
+
     def fused_attn_bwd(
         self,
         max_seqlen_q: int,
@@ -761,8 +882,12 @@ def fused_attn_bwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
         dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
 
         return tex.fused_attn_bwd(
@@ -791,8 +916,9 @@ def fused_attn_bwd(
             cu_seqlens_kv_padded,
             s_quantizer,
             dp_quantizer,
-            dqkv_quantizer
+            dqkv_quantizer,
         )
+
     def copy_to_kv_cache(
         self,
         new_k: torch.Tensor,
@@ -824,8 +950,9 @@ def copy_to_kv_cache(
             max_ctx_len,
             max_seq_len,
             max_pages_per_seq,
-            is_non_paged
+            is_non_paged,
         )
+
     def convert_thd_to_bshd(
         self,
         tensor: torch.Tensor,
@@ -835,6 +962,7 @@ def convert_thd_to_bshd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+
     def convert_bshd_to_thd(
         self,
         tensor: torch.Tensor,
@@ -859,9 +987,9 @@ def fused_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
@@ -875,9 +1003,9 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_qkv_rope_forward(
         self,
         qkv_input: torch.Tensor,
@@ -893,10 +1021,17 @@ def fused_qkv_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_forward(
-            qkv_input, q_freqs, k_freqs, start_positions,
-            qkv_split_arg_list, qkv_format, interleaved,
-            cp_size, cp_rank
+            qkv_input,
+            q_freqs,
+            k_freqs,
+            start_positions,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
+
     def fused_qkv_rope_backward(
         self,
         q_grad_out: torch.Tensor,
@@ -913,9 +1048,16 @@ def fused_qkv_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_backward(
-            q_grad_out, k_grad_out, v_grad_out,
-            q_freqs, k_freqs, qkv_split_arg_list,
-            qkv_format, interleaved, cp_size, cp_rank
+            q_grad_out,
+            k_grad_out,
+            v_grad_out,
+            q_freqs,
+            k_freqs,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
 
     # fused router
@@ -941,6 +1083,7 @@ def fused_topk_with_score_function_fwd(
             score_function,
             expert_bias,
         )
+
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -965,6 +1108,7 @@ def fused_topk_with_score_function_bwd(
             scaling_factor,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
@@ -977,6 +1121,7 @@ def fused_score_for_moe_aux_loss_fwd(
             topk,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -995,6 +1140,7 @@ def fused_score_for_moe_aux_loss_bwd(
             topk,
             score_function,
         )
+
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -1017,6 +1163,7 @@ def fused_moe_aux_loss_fwd(
             topk,
             coeff,
         )
+
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -1026,7 +1173,9 @@ def fused_moe_aux_loss_bwd(
         grad_aux_loss: torch.Tensor,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
 
     # Dropout
     def dropout_fwd(
@@ -1037,6 +1186,7 @@ def dropout_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
+
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
@@ -1051,9 +1201,11 @@ def dropout_bwd(
     def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cublasLt_version()
+
     def get_cudnn_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cudnn_version()
+
     def get_num_cublas_streams(self) -> int:
         tex = self._get_tex()
         return tex.get_num_cublas_streams()
@@ -1067,6 +1219,7 @@ def thd_read_half_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+
     def thd_second_half_lse_correction(
         self,
         lse: torch.Tensor,
@@ -1075,9 +1228,8 @@ def thd_second_half_lse_correction(
         lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(
-            lse, lse_per_step, cu_seqlens, lse_packed
-        )
+        return tex.thd_second_half_lse_correction(lse, lse_per_step, cu_seqlens, lse_packed)
+
     def thd_read_second_half_lse(
         self,
         lse: torch.Tensor,
@@ -1086,9 +1238,8 @@ def thd_read_second_half_lse(
         second_half_lse_seqlen: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(
-            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
-        )
+        return tex.thd_read_second_half_lse(lse, cu_seqlens, lse_packed, second_half_lse_seqlen)
+
     def thd_out_correction(
         self,
         out: torch.Tensor,
@@ -1101,9 +1252,9 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step,
-            cu_seqlens, only_second_half, lse_packed
+            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
         )
+
     def thd_grad_correction(
         self,
         grad: torch.Tensor,
@@ -1113,10 +1264,8 @@ def thd_grad_correction(
         second_half: str,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(
-            grad, grad_per_step, cu_seqlens,
-            first_half, second_half
-        )
+        return tex.thd_grad_correction(grad, grad_per_step, cu_seqlens, first_half, second_half)
+
     def thd_get_partitioned_indices(
         self,
         cu_seqlens: torch.Tensor,
@@ -1125,9 +1274,7 @@ def thd_get_partitioned_indices(
         rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(
-            cu_seqlens, total_tokens, world_size, rank
-        )
+        return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, world_size, rank)
 
     # nvshmem functions
     def init_nvshmem_backend(
@@ -1136,6 +1283,7 @@ def init_nvshmem_backend(
     ) -> None:
         tex = self._get_tex()
         return tex.init_nvshmem_backend(process_group)
+
     def create_nvshmem_tensor(
         self,
         shape: List[int],
@@ -1143,6 +1291,7 @@ def create_nvshmem_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.create_nvshmem_tensor(shape, dtype)
+
     def nvshmem_send_on_current_stream(
         self,
         src: torch.Tensor,
@@ -1152,6 +1301,7 @@ def nvshmem_send_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+
     def nvshmem_wait_on_current_stream(
         self,
         signal: torch.Tensor,
@@ -1159,6 +1309,7 @@ def nvshmem_wait_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
+
     def nvshmem_finalize(self) -> None:
         tex = self._get_tex()
         return tex.nvshmem_finalize()
@@ -1173,6 +1324,7 @@ def multi_tensor_scale(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1182,6 +1334,7 @@ def multi_tensor_l2norm(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
@@ -1194,6 +1347,7 @@ def multi_tensor_unscale_l2norm(
         return tex.multi_tensor_unscale_l2norm(
             chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
         )
+
     def multi_tensor_adam(
         self,
         chunk_size: int,
@@ -1210,10 +1364,19 @@ def multi_tensor_adam(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_param_remainder(
         self,
         chunk_size: int,
@@ -1230,10 +1393,19 @@ def multi_tensor_adam_param_remainder(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_param_remainder(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_fp8(
         self,
         chunk_size: int,
@@ -1252,11 +1424,20 @@ def multi_tensor_adam_fp8(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.multi_tensor_adam_fp8(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            fp8_dtype
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            fp8_dtype,
         )
+
     def multi_tensor_adam_capturable(
         self,
         chunk_size: int,
@@ -1274,11 +1455,20 @@ def multi_tensor_adam_capturable(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_adam_capturable_master(
         self,
         chunk_size: int,
@@ -1296,11 +1486,20 @@ def multi_tensor_adam_capturable_master(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable_master(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_sgd(
         self,
         chunk_size: int,
@@ -1317,11 +1516,19 @@ def multi_tensor_sgd(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_sgd(
-            chunk_size, noop_flag, tensor_lists,
-            wd, momentum, dampening,
-            lr, nesterov, first_run,
-            wd_after_momentum, scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            wd,
+            momentum,
+            dampening,
+            lr,
+            nesterov,
+            first_run,
+            wd_after_momentum,
+            scale,
         )
+
     def multi_tensor_compute_scale_and_scale_inv(
         self,
         chunk_size: int,
@@ -1333,8 +1540,7 @@ def multi_tensor_compute_scale_and_scale_inv(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_compute_scale_and_scale_inv(
-            chunk_size, noop_flag, tensor_lists,
-            max_fp8, force_pow_2_scales, epsilon
+            chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
     # Comm+GEMM Overlap
@@ -1345,15 +1551,20 @@ def bulk_overlap_ag_with_external_gemm(
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
-        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+        return tex.bulk_overlap_ag_with_external_gemm(
+            allgather_communicator, send_stream, recv_stream
+        )
 
-############## class func #################################
+    ############## class func #################################
     def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionHYGON
+
         return FlashAttentionHYGON
+
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
+
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
@@ -1361,6 +1572,7 @@ def create_comm_overlap_helper(
     ) -> "CommOverlapHelper":
         tex = self._get_tex()
         return tex.CommOverlapHelper(world_group, intra_node_group)
+
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1379,11 +1591,21 @@ def create_comm_overlap(
     ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
-            buffer_shape, buffer_dtype, helper, tp_size,
-            num_splits, num_max_streams, comm_cga_size,
-            gemm_priority, comm_priority, num_comm_sm,
-            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            num_splits,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            rs_overlap_first_gemm,
         )
+
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1403,7 +1625,18 @@ def create_comm_overlap_p2p(
     ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
-            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
-            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
-            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            comm_type,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            use_ce,
+            aggregate,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
index 6000eff69c..8221285219 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
@@ -17,9 +17,11 @@
 
 def _bind_is_available(fn, is_available_fn):
     """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
         return fn(*args, **kwargs)
+
     wrapper._is_available = is_available_fn
     return wrapper
 
@@ -46,152 +48,844 @@ def register_builtins(registry) -> None:
 
     impls = [
         # Normalization
-        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="layernorm_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="layernorm_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd_add",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # GEMM
-        OpImpl(op_name="generic_gemm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="generic_gemm",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.generic_gemm, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Quantization
-        OpImpl(op_name="quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dequantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="bgrad_quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="split_quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="quantize",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.quantize, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dequantize",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dequantize, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_quantize",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_quantize, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="split_quantize",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.split_quantize, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Activations - Forward
-        OpImpl(op_name="gelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="geglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="qgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="qgeglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="relu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="reglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="srelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="sreglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="silu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="swiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="clamped_swiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="gelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.gelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="geglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.geglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgeglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgeglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="relu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.relu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="reglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.reglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="srelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.srelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="sreglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.sreglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="silu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.silu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swiglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swiglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_swiglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_swiglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Activations - Backward
-        OpImpl(op_name="dgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dgeglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dqgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dqgeglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="drelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dreglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dsrelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dsreglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dsilu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dswiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="dgelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dgeglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgeglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgeglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgeglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="drelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.drelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dreglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dreglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsrelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsrelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsreglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsreglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsilu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsilu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dswiglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dswiglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_dswiglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_dswiglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Activations - Bias + Backward
-        OpImpl(op_name="dbias_dgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dbias_dsilu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dbias_drelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="dbias_dgelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dgelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsilu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsilu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_drelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_drelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dqgelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dqgelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsrelu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsrelu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Softmax
-        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="scaled_softmax_forward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_forward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_softmax_backward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_backward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_forward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_backward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_forward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_backward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_forward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_backward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # MOE operations
-        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="moe_permute_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_permute_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Fused attention
-        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="fa_prepare_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # KV cache
-        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="copy_to_kv_cache",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.copy_to_kv_cache, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Tensor format conversions
-        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="convert_thd_to_bshd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_bshd_to_thd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # RoPE (Rotary Position Embedding)
-        OpImpl(op_name="fused_rope_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_rope_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="fused_rope_forward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_forward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_rope_backward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_backward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_forward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_backward",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # TopK and MOE aux loss
-        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="fused_topk_with_score_function_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_topk_with_score_function_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Dropout
-        OpImpl(op_name="dropout_fwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="dropout_bwd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="dropout_fwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_fwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dropout_bwd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_bwd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # FP8 operations
-        OpImpl(op_name="fp8_transpose", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="swap_first_dims", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="compute_amax", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="fp8_transpose",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_transpose, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swap_first_dims",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swap_first_dims, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="compute_amax",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.compute_amax, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_amax_and_scale_update_after_reduction",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_compute_partial_amax",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_partial_cast",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Padding operations
-        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="fused_multi_row_padding",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_padding, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_multi_row_unpadding",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Library version getters
-        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="get_cudnn_version", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="get_cublasLt_version",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cublasLt_version, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_cudnn_version",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cudnn_version, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_num_cublas_streams",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_num_cublas_streams, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # THD (Tensor, Hidden, Dimension) operations
-        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="thd_out_correction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="thd_grad_correction", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="thd_read_half_tensor",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_half_tensor, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_second_half_lse_correction",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_read_second_half_lse",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_out_correction",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_out_correction, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_grad_correction",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_grad_correction, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_get_partitioned_indices",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # NVSHMEM operations
-
         # Multi-tensor operations
-        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="multi_tensor_quantize",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_quantize, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_scale",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_l2norm",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_unscale_l2norm",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_param_remainder",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_fp8",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable_master",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_sgd",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_sgd, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_and_scale_inv",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Communication overlap operations
-        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="create_comm_overlap", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="HYGON", priority=100),
-        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="bulk_overlap_ag_with_external_gemm",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_fp8_tensor_meta",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_helper",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_p2p",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # FlashAttention class getter
-        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="HYGON", priority=100),
-
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Attention backend selection
-        OpImpl(op_name="get_attention_backend", impl_id="vendor.hygon", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="HYGON", priority=100),
+        OpImpl(
+            op_name="get_attention_backend",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_attention_backend, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py
index ebf1092308..740c8d44d6 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/__init__.py
@@ -4,4 +4,4 @@
 
 from .iluvatar import IluvatarBackend
 
-__all__ = ["IluvatarBackend"]
\ No newline at end of file
+__all__ = ["IluvatarBackend"]
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
index 294e79fcb9..40c1719851 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -9,6 +9,7 @@
 
 from ....ops import *
 
+
 def _load_iluvatar_libs():
     import ctypes
     import os
@@ -49,7 +50,7 @@ def try_load_lib(name, search_patterns):
 
         try:
             result = subprocess.check_output(f"ldconfig -p | grep 'lib{name}{ext}'", shell=True)
-            for line in result.decode().split('\n'):
+            for line in result.decode().split("\n"):
                 if f"lib{name}" in line and "=>" in line:
                     so_path = line.split(">")[1].strip()
                     if so_path:
@@ -79,31 +80,39 @@ def try_load_lib(name, search_patterns):
         print(f"[ILUVATAR] Failed to load ILUVATAR libs: {e}")
         return False
 
+
 _iluvatar_libs_loaded = False
 
+
 def _ensure_iluvatar_libs():
     global _iluvatar_libs_loaded
     if not _iluvatar_libs_loaded:
         _iluvatar_libs_loaded = _load_iluvatar_libs()
     return _iluvatar_libs_loaded
 
+
 def _check_iluvatar_available() -> bool:
     if not torch.cuda.is_available():
         return False
     import os
+
     try:
         if not _ensure_iluvatar_libs():
-           return False
+            return False
         import transformer_engine_iluvatar
+
         return True
     except (ImportError, OSError) as e:
         print(f"[ILUVATAR] Import failed: {e}")
         return False
 
+
 def _get_tex():
     import transformer_engine_iluvatar.pytorch.ixte_torch
+
     return transformer_engine_iluvatar.pytorch.ixte_torch
 
+
 class IluvatarBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -123,6 +132,7 @@ def is_available(self) -> bool:
     def get_attention_backend(self, attention_params=None):
         from packaging.version import Version as PkgVersion
         from ....logger_manager import get_logger
+
         logger = get_logger()
 
         # Read environment variables to determine which backends to enable
@@ -152,7 +162,7 @@ def get_attention_backend(self, attention_params=None):
             available_backends,
         )
 
-##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
+    ##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -206,49 +216,78 @@ def generic_gemm(
         beta: Optional[float] = None,
     ) -> List[Any]:
         tex = self._get_tex()
-        
+
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
         output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
-            A, transA, B, transB, D, quantizer, output_dtype,
-            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
-            accumulate, use_split_accumulator, comm_overlap, comm_type,
-            extra_output, bulk_overlap, alpha, beta
+            A,
+            transA,
+            B,
+            transB,
+            D,
+            quantizer,
+            output_dtype,
+            bias,
+            bias_type,
+            gelu,
+            gelu_in,
+            grad,
+            workspace,
+            workspace_size,
+            accumulate,
+            use_split_accumulator,
+            comm_overlap,
+            comm_type,
+            extra_output,
+            bulk_overlap,
+            alpha,
+            beta,
         )
+
     # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
+
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
+
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
+
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
+
     # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
+
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
+
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
+
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
+
     # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
+
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
+
     def clamped_swiglu(
         self,
         input: torch.Tensor,
@@ -258,39 +297,50 @@ def clamped_swiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
     # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
+
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
+
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
+
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
+
     # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
+
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
+
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
+
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
+
     # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
+
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
+
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -301,23 +351,33 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
     # DBias + DAct fusions #
     def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
     def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
     def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dqgelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dsrelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-    # Permutation functions                                                                                                                                                                                                                                                                                             
+
+    # Permutation functions
     def moe_permute_fwd(
         self,
         input: torch.Tensor,
@@ -329,7 +389,10 @@ def moe_permute_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+        return tex.moe_permute_fwd(
+            input, dtype, indices, num_out_tokens, workspace, max_expanded_token_num
+        )
+
     def moe_permute_bwd(
         self,
         input: torch.Tensor,
@@ -341,7 +404,8 @@ def moe_permute_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_permute_bwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_fwd(
         self,
         input: torch.Tensor,
@@ -353,7 +417,8 @@ def moe_unpermute_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_unpermute_fwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_bwd(
         self,
         input_bwd: torch.Tensor,
@@ -364,7 +429,8 @@ def moe_unpermute_bwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+        return tex.moe_unpermute_bwd(input_bwd, input_fwd, dtype, row_id_map, prob)
+
     # Softmax functions
     def scaled_softmax_forward(
         self,
@@ -373,6 +439,7 @@ def scaled_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_forward(input, scale)
+
     def scaled_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -381,6 +448,7 @@ def scaled_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -389,6 +457,7 @@ def scaled_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+
     def scaled_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -397,6 +466,7 @@ def scaled_masked_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_upper_triang_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -404,6 +474,7 @@ def scaled_upper_triang_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+
     def scaled_upper_triang_masked_softmax_backward(
         self,
         output_grads_: torch.Tensor,
@@ -414,6 +485,7 @@ def scaled_upper_triang_masked_softmax_backward(
         return tex.scaled_upper_triang_masked_softmax_backward(
             output_grads_, softmax_results_, scale_factor
         )
+
     def scaled_aligned_causal_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -421,6 +493,7 @@ def scaled_aligned_causal_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+
     def scaled_aligned_causal_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -431,6 +504,7 @@ def scaled_aligned_causal_masked_softmax_backward(
         return tex.scaled_aligned_causal_masked_softmax_backward(
             output_grad_, softmax_results_, scale_factor
         )
+
     # Other granular functions
     def layernorm_fwd(
         self,
@@ -449,6 +523,7 @@ def layernorm_fwd(
         return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def layernorm_bwd(
         self,
         dz: torch.Tensor,
@@ -460,9 +535,8 @@ def layernorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.layernorm_bwd(
-            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
-        )
+        return tex.layernorm_bwd(dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_fwd(
         self,
         input: Any,
@@ -479,6 +553,7 @@ def rmsnorm_fwd(
         return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def rmsnorm_bwd(
         self,
         dz: torch.Tensor,
@@ -490,6 +565,7 @@ def rmsnorm_bwd(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_bwd_add(
         self,
         dz: torch.Tensor,
@@ -510,6 +586,7 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -518,6 +595,7 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
+
     def te_general_grouped_gemm(
         self,
         A: List[Any],
@@ -542,10 +620,25 @@ def te_general_grouped_gemm(
         D_type = tex.DType(int(D_type)) if D_type is not None else None
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         return tex.te_general_grouped_gemm(
-            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
-            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
-            accumulate, use_split_accumulator, math_sm_count
+            A,
+            transa,
+            B,
+            transb,
+            D,
+            D_type,
+            m_splits,
+            bias,
+            bias_type,
+            single_output,
+            pre_gelu_out,
+            grad,
+            workspace,
+            workspaceSizes,
+            accumulate,
+            use_split_accumulator,
+            math_sm_count,
         )
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -555,6 +648,7 @@ def fp8_transpose(
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
         return tex.fp8_transpose(input, dtype, out)
+
     def swap_first_dims(
         self,
         tensor: torch.Tensor,
@@ -562,6 +656,7 @@ def swap_first_dims(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -588,14 +683,31 @@ def get_fused_attn_backend(
         kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         result = tex.get_fused_attn_backend(
-            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
-            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
-            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
-            head_dim_v, window_size_left, window_size_right, return_max_logit
+            is_training,
+            q_dtype,
+            kv_dtype,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            p_dropout,
+            num_attn_heads,
+            num_gqa_groups,
+            max_seqlen_q,
+            max_seqlen_kv,
+            head_dim_qk,
+            head_dim_v,
+            window_size_left,
+            window_size_right,
+            return_max_logit,
         )
         return NVTE_Fused_Attn_Backend(result)
 
@@ -606,6 +718,7 @@ def compute_amax(
     ) -> None:
         tex = self._get_tex()
         return tex.compute_amax(input, amax)
+
     def fused_amax_and_scale_update_after_reduction(
         self,
         amax_reduction_buffer: torch.Tensor,
@@ -618,9 +731,9 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales,
-            amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
         )
+
     def fp8_block_scaling_compute_partial_amax(
         self,
         tensor: torch.Tensor,
@@ -634,6 +747,7 @@ def fp8_block_scaling_compute_partial_amax(
         return tex.fp8_block_scaling_compute_partial_amax(
             tensor, amax, h, w, start_offset, block_len
         )
+
     def fp8_block_scaling_partial_cast(
         self,
         inp: torch.Tensor,
@@ -650,6 +764,7 @@ def fp8_block_scaling_partial_cast(
         return tex.fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -658,9 +773,8 @@ def fused_multi_row_padding(
         padded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_padding(
-            input, output, input_row_list, padded_input_row_list
-        )
+        return tex.fused_multi_row_padding(input, output, input_row_list, padded_input_row_list)
+
     def fused_multi_row_unpadding(
         self,
         input: torch.Tensor,
@@ -669,9 +783,7 @@ def fused_multi_row_unpadding(
         unpadded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(
-            input, output, input_row_list, unpadded_input_row_list
-        )
+        return tex.fused_multi_row_unpadding(input, output, input_row_list, unpadded_input_row_list)
 
     # attention kernels
     def fa_prepare_fwd(
@@ -680,6 +792,7 @@ def fa_prepare_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_fwd(qkvi)
+
     def fa_prepare_bwd(
         self,
         q: torch.Tensor,
@@ -688,6 +801,7 @@ def fa_prepare_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_bwd(q, k, v)
+
     def fused_attn_fwd(
         self,
         max_seqlen_q: int,
@@ -723,8 +837,12 @@ def fused_attn_fwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         return tex.fused_attn_fwd(
             max_seqlen_q,
@@ -754,8 +872,9 @@ def fused_attn_fwd(
             SoftmaxOffset,
             rng_gen,
             rng_elts_per_thread,
-            return_max_logit
+            return_max_logit,
         )
+
     def fused_attn_bwd(
         self,
         max_seqlen_q: int,
@@ -789,8 +908,12 @@ def fused_attn_bwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
         dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
 
         return tex.fused_attn_bwd(
@@ -819,8 +942,9 @@ def fused_attn_bwd(
             cu_seqlens_kv_padded,
             s_quantizer,
             dp_quantizer,
-            dqkv_quantizer
+            dqkv_quantizer,
         )
+
     def copy_to_kv_cache(
         self,
         new_k: torch.Tensor,
@@ -852,8 +976,9 @@ def copy_to_kv_cache(
             max_ctx_len,
             max_seq_len,
             max_pages_per_seq,
-            is_non_paged
+            is_non_paged,
         )
+
     def convert_thd_to_bshd(
         self,
         tensor: torch.Tensor,
@@ -863,6 +988,7 @@ def convert_thd_to_bshd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+
     def convert_bshd_to_thd(
         self,
         tensor: torch.Tensor,
@@ -887,9 +1013,9 @@ def fused_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
@@ -903,9 +1029,9 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_qkv_rope_forward(
         self,
         qkv_input: torch.Tensor,
@@ -921,10 +1047,17 @@ def fused_qkv_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_forward(
-            qkv_input, q_freqs, k_freqs, start_positions,
-            qkv_split_arg_list, qkv_format, interleaved,
-            cp_size, cp_rank
+            qkv_input,
+            q_freqs,
+            k_freqs,
+            start_positions,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
+
     def fused_qkv_rope_backward(
         self,
         q_grad_out: torch.Tensor,
@@ -941,9 +1074,16 @@ def fused_qkv_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_backward(
-            q_grad_out, k_grad_out, v_grad_out,
-            q_freqs, k_freqs, qkv_split_arg_list,
-            qkv_format, interleaved, cp_size, cp_rank
+            q_grad_out,
+            k_grad_out,
+            v_grad_out,
+            q_freqs,
+            k_freqs,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
 
     # fused router
@@ -969,6 +1109,7 @@ def fused_topk_with_score_function_fwd(
             score_function,
             expert_bias,
         )
+
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -993,6 +1134,7 @@ def fused_topk_with_score_function_bwd(
             scaling_factor,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
@@ -1005,6 +1147,7 @@ def fused_score_for_moe_aux_loss_fwd(
             topk,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -1023,6 +1166,7 @@ def fused_score_for_moe_aux_loss_bwd(
             topk,
             score_function,
         )
+
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -1045,6 +1189,7 @@ def fused_moe_aux_loss_fwd(
             topk,
             coeff,
         )
+
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -1054,7 +1199,9 @@ def fused_moe_aux_loss_bwd(
         grad_aux_loss: torch.Tensor,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
 
     # Dropout
     def dropout_fwd(
@@ -1065,6 +1212,7 @@ def dropout_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
+
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
@@ -1079,9 +1227,11 @@ def dropout_bwd(
     def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cublasLt_version()
+
     def get_cudnn_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cudnn_version()
+
     def get_num_cublas_streams(self) -> int:
         tex = self._get_tex()
         return tex.get_num_cublas_streams()
@@ -1095,6 +1245,7 @@ def thd_read_half_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+
     def thd_second_half_lse_correction(
         self,
         lse: torch.Tensor,
@@ -1103,9 +1254,8 @@ def thd_second_half_lse_correction(
         lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(
-            lse, lse_per_step, cu_seqlens, lse_packed
-        )
+        return tex.thd_second_half_lse_correction(lse, lse_per_step, cu_seqlens, lse_packed)
+
     def thd_read_second_half_lse(
         self,
         lse: torch.Tensor,
@@ -1114,9 +1264,8 @@ def thd_read_second_half_lse(
         second_half_lse_seqlen: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(
-            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
-        )
+        return tex.thd_read_second_half_lse(lse, cu_seqlens, lse_packed, second_half_lse_seqlen)
+
     def thd_out_correction(
         self,
         out: torch.Tensor,
@@ -1129,9 +1278,9 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step,
-            cu_seqlens, only_second_half, lse_packed
+            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
         )
+
     def thd_grad_correction(
         self,
         grad: torch.Tensor,
@@ -1141,10 +1290,8 @@ def thd_grad_correction(
         second_half: str,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(
-            grad, grad_per_step, cu_seqlens,
-            first_half, second_half
-        )
+        return tex.thd_grad_correction(grad, grad_per_step, cu_seqlens, first_half, second_half)
+
     def thd_get_partitioned_indices(
         self,
         cu_seqlens: torch.Tensor,
@@ -1153,9 +1300,7 @@ def thd_get_partitioned_indices(
         rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(
-            cu_seqlens, total_tokens, world_size, rank
-        )
+        return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, world_size, rank)
 
     # nvshmem functions
     def init_nvshmem_backend(
@@ -1164,6 +1309,7 @@ def init_nvshmem_backend(
     ) -> None:
         tex = self._get_tex()
         return tex.init_nvshmem_backend(process_group)
+
     def create_nvshmem_tensor(
         self,
         shape: List[int],
@@ -1171,6 +1317,7 @@ def create_nvshmem_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.create_nvshmem_tensor(shape, dtype)
+
     def nvshmem_send_on_current_stream(
         self,
         src: torch.Tensor,
@@ -1180,6 +1327,7 @@ def nvshmem_send_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+
     def nvshmem_wait_on_current_stream(
         self,
         signal: torch.Tensor,
@@ -1187,6 +1335,7 @@ def nvshmem_wait_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
+
     def nvshmem_finalize(self) -> None:
         tex = self._get_tex()
         return tex.nvshmem_finalize()
@@ -1201,6 +1350,7 @@ def multi_tensor_scale(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1210,6 +1360,7 @@ def multi_tensor_l2norm(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
@@ -1222,6 +1373,7 @@ def multi_tensor_unscale_l2norm(
         return tex.multi_tensor_unscale_l2norm(
             chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
         )
+
     def multi_tensor_adam(
         self,
         chunk_size: int,
@@ -1238,10 +1390,19 @@ def multi_tensor_adam(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_param_remainder(
         self,
         chunk_size: int,
@@ -1258,10 +1419,19 @@ def multi_tensor_adam_param_remainder(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_param_remainder(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_fp8(
         self,
         chunk_size: int,
@@ -1280,11 +1450,20 @@ def multi_tensor_adam_fp8(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.multi_tensor_adam_fp8(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            fp8_dtype
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            fp8_dtype,
         )
+
     def multi_tensor_adam_capturable(
         self,
         chunk_size: int,
@@ -1302,11 +1481,20 @@ def multi_tensor_adam_capturable(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_adam_capturable_master(
         self,
         chunk_size: int,
@@ -1324,11 +1512,20 @@ def multi_tensor_adam_capturable_master(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable_master(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_sgd(
         self,
         chunk_size: int,
@@ -1345,11 +1542,19 @@ def multi_tensor_sgd(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_sgd(
-            chunk_size, noop_flag, tensor_lists,
-            wd, momentum, dampening,
-            lr, nesterov, first_run,
-            wd_after_momentum, scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            wd,
+            momentum,
+            dampening,
+            lr,
+            nesterov,
+            first_run,
+            wd_after_momentum,
+            scale,
         )
+
     def multi_tensor_compute_scale_and_scale_inv(
         self,
         chunk_size: int,
@@ -1361,8 +1566,7 @@ def multi_tensor_compute_scale_and_scale_inv(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_compute_scale_and_scale_inv(
-            chunk_size, noop_flag, tensor_lists,
-            max_fp8, force_pow_2_scales, epsilon
+            chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
     # Comm+GEMM Overlap
@@ -1373,14 +1577,18 @@ def bulk_overlap_ag_with_external_gemm(
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
-        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+        return tex.bulk_overlap_ag_with_external_gemm(
+            allgather_communicator, send_stream, recv_stream
+        )
 
-############## class func #################################
+    ############## class func #################################
     def get_flash_attention_class(self):
         raise NotImplementedError("get_flash_attention_class - not implemented in iluvatar backend")
+
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
+
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
@@ -1388,6 +1596,7 @@ def create_comm_overlap_helper(
     ) -> "CommOverlapHelper":
         tex = self._get_tex()
         return tex.CommOverlapHelper(world_group, intra_node_group)
+
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1406,11 +1615,21 @@ def create_comm_overlap(
     ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
-            buffer_shape, buffer_dtype, helper, tp_size,
-            num_splits, num_max_streams, comm_cga_size,
-            gemm_priority, comm_priority, num_comm_sm,
-            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            num_splits,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            rs_overlap_first_gemm,
         )
+
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1430,7 +1649,18 @@ def create_comm_overlap_p2p(
     ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
-            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
-            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
-            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            comm_type,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            use_ce,
+            aggregate,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
index b136be2a51..f41724e3e2 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
@@ -17,9 +17,11 @@
 
 def _bind_is_available(fn, is_available_fn):
     """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
         return fn(*args, **kwargs)
+
     wrapper._is_available = is_available_fn
     return wrapper
 
@@ -46,160 +48,908 @@ def register_builtins(registry) -> None:
 
     impls = [
         # Normalization
-        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="layernorm_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="layernorm_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd_add",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # GEMM
-        OpImpl(op_name="generic_gemm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="generic_gemm",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.generic_gemm, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Quantization
-        OpImpl(op_name="quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dequantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="bgrad_quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="split_quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="quantize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.quantize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dequantize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dequantize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_quantize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_quantize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="split_quantize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.split_quantize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Activations - Forward
-        OpImpl(op_name="gelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="geglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="qgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="qgeglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="relu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="reglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="srelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="sreglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="silu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="swiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="clamped_swiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="gelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.gelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="geglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.geglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgeglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgeglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="relu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.relu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="reglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.reglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="srelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.srelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="sreglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.sreglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="silu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.silu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swiglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swiglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_swiglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_swiglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Activations - Backward
-        OpImpl(op_name="dgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dgeglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dqgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dqgeglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="drelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dreglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dsrelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dsreglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dsilu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dswiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="dgelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dgeglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgeglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgeglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgeglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="drelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.drelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dreglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dreglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsrelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsrelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsreglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsreglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsilu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsilu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dswiglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dswiglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_dswiglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_dswiglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Activations - Bias + Backward
-        OpImpl(op_name="dbias_dgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dbias_dsilu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dbias_drelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="dbias_dgelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dgelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsilu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsilu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_drelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_drelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dqgelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dqgelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsrelu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsrelu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Softmax
-        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="scaled_softmax_forward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_forward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_softmax_backward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_backward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_forward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_backward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_forward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_backward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_forward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_backward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # MOE operations
-        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="moe_permute_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_permute_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Fused attention
-        OpImpl(op_name="get_fused_attn_backend", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_attn_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_attn_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="get_fused_attn_backend",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_fused_attn_backend, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # KV cache
-        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="copy_to_kv_cache",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.copy_to_kv_cache, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Tensor format conversions
-        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="convert_thd_to_bshd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_bshd_to_thd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # RoPE (Rotary Position Embedding)
-        OpImpl(op_name="fused_rope_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_rope_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="fused_rope_forward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_forward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_rope_backward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_backward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_forward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_backward",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # TopK and MOE aux loss
-        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="fused_topk_with_score_function_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_topk_with_score_function_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Dropout
-        OpImpl(op_name="dropout_fwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="dropout_bwd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="dropout_fwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_fwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dropout_bwd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_bwd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # FP8 operations
-        OpImpl(op_name="fp8_transpose", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="swap_first_dims", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="compute_amax", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="fp8_transpose",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_transpose, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swap_first_dims",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swap_first_dims, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="compute_amax",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.compute_amax, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_amax_and_scale_update_after_reduction",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_compute_partial_amax",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_partial_cast",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Padding operations
-        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="fused_multi_row_padding",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_padding, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_multi_row_unpadding",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Library version getters
-        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="get_cudnn_version", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="get_cublasLt_version",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cublasLt_version, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_cudnn_version",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cudnn_version, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_num_cublas_streams",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_num_cublas_streams, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # THD (Tensor, Hidden, Dimension) operations
-        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="thd_out_correction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="thd_grad_correction", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="thd_read_half_tensor",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_half_tensor, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_second_half_lse_correction",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_read_second_half_lse",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_out_correction",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_out_correction, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_grad_correction",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_grad_correction, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_get_partitioned_indices",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # NVSHMEM operations
-        OpImpl(op_name="init_nvshmem_backend", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="create_nvshmem_tensor", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="nvshmem_finalize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="init_nvshmem_backend",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.init_nvshmem_backend, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_nvshmem_tensor",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_send_on_current_stream",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_wait_on_current_stream",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_finalize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_finalize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Multi-tensor operations
-        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="multi_tensor_quantize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_quantize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_scale",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_l2norm",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_unscale_l2norm",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_param_remainder",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_fp8",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable_master",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_sgd",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_sgd, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_and_scale_inv",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Communication overlap operations
-        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="create_comm_overlap", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="Iluvatar", priority=100),
-        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="bulk_overlap_ag_with_external_gemm",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_fp8_tensor_meta",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_helper",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_p2p",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # FlashAttention class getter
-        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="Iluvatar", priority=100),
-
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Attention backend selection
-        OpImpl(op_name="get_attention_backend", impl_id="vendor.iluvatar", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="Iluvatar", priority=100),
+        OpImpl(
+            op_name="get_attention_backend",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_attention_backend, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
index 7603553e42..7135566e95 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
@@ -107,7 +107,7 @@ def _create_sliding_window_mask(
             mask_bool = mask_bool | (kv_idx > q_idx + right_window)
 
         mask = torch.zeros(seq_len_q, seq_len_kv, dtype=dtype, device=device)
-        mask.masked_fill_(mask_bool, float('-inf'))
+        mask.masked_fill_(mask_bool, float("-inf"))
 
         return mask
 
@@ -128,7 +128,7 @@ def _unpack_tensor(
             else:
                 raise ValueError(
                     f"Unexpected 4D tensor shape {original_shape}. "
-                    f"Expected [total_tokens, 1, num_heads, head_dim]"
+                    "Expected [total_tokens, 1, num_heads, head_dim]"
                 )
 
         if tensor.dim() != 3:
@@ -145,8 +145,7 @@ def _unpack_tensor(
             )
 
         padded_tensor = torch.zeros(
-            batch_size, num_heads, max_seqlen, head_dim,
-            dtype=tensor.dtype, device=device
+            batch_size, num_heads, max_seqlen, head_dim, dtype=tensor.dtype, device=device
         )
 
         padding_mask = torch.ones(batch_size, max_seqlen, dtype=torch.bool, device=device)
@@ -175,8 +174,7 @@ def _pack_tensor(
         device = tensor.device
 
         packed_tensor = torch.zeros(
-            total_tokens, num_heads, head_dim,
-            dtype=tensor.dtype, device=device
+            total_tokens, num_heads, head_dim, dtype=tensor.dtype, device=device
         )
 
         for i in range(batch_size):
@@ -218,7 +216,9 @@ def _forward_impl(
         if fp8:
             raise NotImplementedError("FP8 is not supported in PyTorch SDPA backend")
         if cp_group is not None:
-            raise NotImplementedError("Context parallelism is not supported in PyTorch SDPA backend")
+            raise NotImplementedError(
+                "Context parallelism is not supported in PyTorch SDPA backend"
+            )
         if alibi_slopes is not None:
             raise NotImplementedError("ALiBi slopes are not supported in PyTorch SDPA backend")
 
@@ -245,12 +245,16 @@ def _forward_impl(
 
             if use_packed_format:
                 if cu_seqlens_q is not None:
-                    query, padding_mask_q = self._unpack_tensor(query_layer, cu_seqlens_q, max_seqlen_q)
+                    query, padding_mask_q = self._unpack_tensor(
+                        query_layer, cu_seqlens_q, max_seqlen_q
+                    )
                 else:
                     query = self._convert_layout_to_bhsd(query_layer, qkv_layout)
 
                 if cu_seqlens_kv is not None:
-                    key, padding_mask_kv = self._unpack_tensor(key_layer, cu_seqlens_kv, max_seqlen_kv)
+                    key, padding_mask_kv = self._unpack_tensor(
+                        key_layer, cu_seqlens_kv, max_seqlen_kv
+                    )
                     value, _ = self._unpack_tensor(value_layer, cu_seqlens_kv, max_seqlen_kv)
                 else:
                     key = self._convert_layout_to_bhsd(key_layer, qkv_layout)
@@ -268,7 +272,8 @@ def _forward_impl(
             num_groups = num_heads_q // num_heads_kv
             if num_heads_q % num_heads_kv != 0:
                 raise ValueError(
-                    f"num_heads_q ({num_heads_q}) must be divisible by num_heads_kv ({num_heads_kv})"
+                    f"num_heads_q ({num_heads_q}) must be divisible by num_heads_kv"
+                    f" ({num_heads_kv})"
                 )
             key = key.repeat_interleave(num_groups, dim=1)
             value = value.repeat_interleave(num_groups, dim=1)
@@ -278,11 +283,10 @@ def _forward_impl(
 
         if use_packed_format and padding_mask_kv is not None:
             attn_mask = torch.zeros(
-                batch_size, seq_len_q, seq_len_kv,
-                dtype=query.dtype, device=query.device
+                batch_size, seq_len_q, seq_len_kv, dtype=query.dtype, device=query.device
             )
             padding_broadcast = padding_mask_kv.unsqueeze(1)
-            attn_mask.masked_fill_(padding_broadcast, float('-inf'))
+            attn_mask.masked_fill_(padding_broadcast, float("-inf"))
 
         if attn_mask_type == "causal":
             is_causal = True
@@ -329,7 +333,7 @@ def _forward_impl(
 
             if explicit_mask.dtype == torch.bool:
                 float_mask = torch.zeros_like(explicit_mask, dtype=query.dtype)
-                float_mask.masked_fill_(~explicit_mask, float('-inf'))
+                float_mask.masked_fill_(~explicit_mask, float("-inf"))
                 explicit_mask = float_mask
 
             if explicit_mask.dim() == 2:
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
index 9d9bb164fa..6dbab926b2 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/kunlunxin.py
@@ -10,22 +10,18 @@
 
 _kunlunxin_available = False
 
+
 def _ensure_kunlunxin_available():
     global _kunlunxin_available
     if not _kunlunxin_available:
         try:
-            result = subprocess.run(
-                ["xpu-smi"],
-                capture_output=True,
-                timeout=10,
-                text=True
-            )
-            
+            result = subprocess.run(["xpu-smi"], capture_output=True, timeout=10, text=True)
+
             if result.returncode == 0:
                 _kunlunxin_available = True
             else:
                 _kunlunxin_available = False
-                
+
         except subprocess.TimeoutExpired:
             _kunlunxin_available = False
         except FileNotFoundError:
@@ -34,7 +30,7 @@ def _ensure_kunlunxin_available():
             _kunlunxin_available = False
         except Exception as e:
             _kunlunxin_available = False
-    
+
     return _kunlunxin_available
 
 
@@ -56,4 +52,5 @@ def is_available(self) -> bool:
 
     def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionTorch
+
         return FlashAttentionTorch
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
index 1585d0cf9d..fa014833b1 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/register_ops.py
@@ -17,9 +17,11 @@
 
 def _bind_is_available(fn, is_available_fn):
     """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
         return fn(*args, **kwargs)
+
     wrapper._is_available = is_available_fn
     return wrapper
 
@@ -35,7 +37,7 @@ def register_builtins(registry) -> None:
 
     # Create a backend instance to access the methods
     backend = KunLunXinBackend()
-    
+
     if not backend.is_available():
         return
 
@@ -44,8 +46,14 @@ def register_builtins(registry) -> None:
 
     impls = [
         # FlashAttention class getter
-        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.kunlunxin", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="KUNLUNXIN", priority=100),
-
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="vendor.kunlunxin",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor="KUNLUNXIN",
+            priority=100,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/__init__.py b/transformer_engine/plugin/core/backends/vendor/metax/__init__.py
index f4e55f62e0..b663a97695 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/__init__.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/__init__.py
@@ -4,4 +4,4 @@
 
 from .metax import MetaxBackend
 
-__all__ = ["MetaxBackend"]
\ No newline at end of file
+__all__ = ["MetaxBackend"]
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
index 14044cef6a..49fdf56dde 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
@@ -31,12 +31,12 @@ def __init__(
 
         # Store initialization parameters for lazy loading
         self._init_params = {
-            'softmax_scale': softmax_scale,
-            'attention_dropout': attention_dropout,
-            'attention_dropout_ctx': attention_dropout_ctx or nullcontext,
-            'attention_type': attention_type,
-            'layer_number': layer_number,
-            'deterministic': deterministic,
+            "softmax_scale": softmax_scale,
+            "attention_dropout": attention_dropout,
+            "attention_dropout_ctx": attention_dropout_ctx or nullcontext,
+            "attention_type": attention_type,
+            "layer_number": layer_number,
+            "deterministic": deterministic,
         }
         self._metax_flash_attn = None
 
@@ -53,7 +53,9 @@ def _ensure_metax_flash_attn(self):
             )
 
             if FlashAttentionMetax is None:
-                raise RuntimeError("FlashAttention class is None - flash-attn may not be installed correctly")
+                raise RuntimeError(
+                    "FlashAttention class is None - flash-attn may not be installed correctly"
+                )
 
             self._metax_flash_attn = FlashAttentionMetax(**self._init_params)
 
@@ -64,8 +66,7 @@ def _ensure_metax_flash_attn(self):
             )
         except Exception as e:
             raise RuntimeError(
-                f"Failed to initialize metax FlashAttention: {e}. "
-                f"Init params: {self._init_params}"
+                f"Failed to initialize metax FlashAttention: {e}. Init params: {self._init_params}"
             )
 
     @property
@@ -124,4 +125,3 @@ def _forward_impl(
             flash_attention_backend=flash_attention_backend,
             fp8_output=fp8_output,
         )
-
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index 6b33369c75..460ff76db4 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -16,6 +16,7 @@
 
 from ....ops import *
 
+
 def _load_metax_libs():
 
     def get_ext():
@@ -26,6 +27,7 @@ def get_ext():
 
     try:
         import transformer_engine_metax
+
         te_path = Path(importlib.util.find_spec("transformer_engine_metax").origin).parent.parent
         for search_dir in [te_path, te_path / "transformer_engine_metax"]:
             if search_dir.exists():
@@ -38,20 +40,24 @@ def get_ext():
         print(f"[Metax] Failed to load Metax libs: {e}")
         return False
 
+
 _metax_libs_loaded = False
 
+
 def _ensure_metax_libs():
     global _metax_libs_loaded
     if not _metax_libs_loaded:
         _metax_libs_loaded = _load_metax_libs()
     return _metax_libs_loaded
 
+
 def _check_metax_available() -> bool:
     if not torch.cuda.is_available():
         return False
 
     try:
         from ...._build_config import SKIP_METAX_BUILD
+
         if SKIP_METAX_BUILD:
             print("[Metax] Disabled: Metax was skipped at build time")
             return False
@@ -64,16 +70,20 @@ def _check_metax_available() -> bool:
         if not _ensure_metax_libs():
             return False
         import transformer_engine_torch_metax
+
         return True
     except (ImportError, OSError) as e:
         print(f"[Metax] Import failed: {e}")
         return False
 
+
 def _get_tex():
     _ensure_metax_libs()
     import transformer_engine_torch_metax
+
     return transformer_engine_torch_metax
 
+
 class MetaxBackend(TEFLBackendBase):
     @staticmethod
     def check_available() -> bool:
@@ -94,6 +104,7 @@ def get_attention_backend(self, attention_params=None):
         # Import the metax get_attention_backend function
         try:
             from transformer_engine_metax.pytorch.attention.dot_product_attention import utils
+
             return utils.get_attention_backend(attention_params)
 
         except ImportError as e:
@@ -103,11 +114,10 @@ def get_attention_backend(self, attention_params=None):
             )
         except Exception as e:
             raise RuntimeError(
-                f"Failed to get_attention_backend: {e}. "
-                f"Attention_params: {self.attention_params}"
+                f"Failed to get_attention_backend: {e}. Attention_params: {self.attention_params}"
             )
 
-##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
+    ##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -161,49 +171,78 @@ def generic_gemm(
         beta: Optional[float] = None,
     ) -> List[Any]:
         tex = self._get_tex()
-        
+
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
         output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
         return tex.generic_gemm(
-            A, transA, B, transB, D, quantizer, output_dtype,
-            bias, bias_type, gelu, gelu_in, grad, workspace, workspace_size,
-            accumulate, use_split_accumulator, comm_overlap, comm_type,
-            extra_output, bulk_overlap, alpha, beta
+            A,
+            transA,
+            B,
+            transB,
+            D,
+            quantizer,
+            output_dtype,
+            bias,
+            bias_type,
+            gelu,
+            gelu_in,
+            grad,
+            workspace,
+            workspace_size,
+            accumulate,
+            use_split_accumulator,
+            comm_overlap,
+            comm_type,
+            extra_output,
+            bulk_overlap,
+            alpha,
+            beta,
         )
+
     # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
+
     def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.geglu(input, quantizer)
+
     def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgelu(input, quantizer)
+
     def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.qgeglu(input, quantizer)
+
     # ReLU and variants #
     def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.relu(input, quantizer)
+
     def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.reglu(input, quantizer)
+
     def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.srelu(input, quantizer)
+
     def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.sreglu(input, quantizer)
+
     # SwiGLU and variants #
     def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.silu(input, quantizer)
+
     def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.swiglu(input, quantizer)
+
     def clamped_swiglu(
         self,
         input: torch.Tensor,
@@ -213,39 +252,50 @@ def clamped_swiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
     # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
+
     def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgeglu(grad, fwd_input, quantizer)
+
     def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgelu(grad, fwd_input, quantizer)
+
     def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dqgeglu(grad, fwd_input, quantizer)
+
     # Backward of ReLU and variants #
     def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.drelu(grad, fwd_input, quantizer)
+
     def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dreglu(grad, fwd_input, quantizer)
+
     def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsrelu(grad, fwd_input, quantizer)
+
     def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsreglu(grad, fwd_input, quantizer)
+
     # Backward of SiLU and variants #
     def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dsilu(grad, fwd_input, quantizer)
+
     def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dswiglu(grad, fwd_input, quantizer)
+
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -256,23 +306,33 @@ def clamped_dswiglu(
     ) -> Any:
         tex = self._get_tex()
         return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
     # DBias + DAct fusions #
     def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
     def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
     def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_drelu(grad, fwd_input, quantizer)
-    def dbias_dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dqgelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dqgelu(grad, fwd_input, quantizer)
-    def dbias_dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+
+    def dbias_dsrelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
         tex = self._get_tex()
         return tex.dbias_dsrelu(grad, fwd_input, quantizer)
-    # Permutation functions                                                                                                                                                                                                                                                                                             
+
+    # Permutation functions
     def moe_permute_fwd(
         self,
         input: torch.Tensor,
@@ -284,7 +344,10 @@ def moe_permute_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_fwd(input, dtype,indices,num_out_tokens,workspace,max_expanded_token_num)
+        return tex.moe_permute_fwd(
+            input, dtype, indices, num_out_tokens, workspace, max_expanded_token_num
+        )
+
     def moe_permute_bwd(
         self,
         input: torch.Tensor,
@@ -296,7 +359,8 @@ def moe_permute_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_permute_bwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_permute_bwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_fwd(
         self,
         input: torch.Tensor,
@@ -308,7 +372,8 @@ def moe_unpermute_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_fwd(input,dtype,row_id_map,prob,num_tokens,topK)
+        return tex.moe_unpermute_fwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
     def moe_unpermute_bwd(
         self,
         input_bwd: torch.Tensor,
@@ -319,7 +384,8 @@ def moe_unpermute_bwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
-        return tex.moe_unpermute_bwd(input_bwd,input_fwd,dtype,row_id_map,prob)
+        return tex.moe_unpermute_bwd(input_bwd, input_fwd, dtype, row_id_map, prob)
+
     # Softmax functions
     def scaled_softmax_forward(
         self,
@@ -328,6 +394,7 @@ def scaled_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_forward(input, scale)
+
     def scaled_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -336,6 +403,7 @@ def scaled_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -344,6 +412,7 @@ def scaled_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+
     def scaled_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -352,6 +421,7 @@ def scaled_masked_softmax_backward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
     def scaled_upper_triang_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -359,6 +429,7 @@ def scaled_upper_triang_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+
     def scaled_upper_triang_masked_softmax_backward(
         self,
         output_grads_: torch.Tensor,
@@ -369,6 +440,7 @@ def scaled_upper_triang_masked_softmax_backward(
         return tex.scaled_upper_triang_masked_softmax_backward(
             output_grads_, softmax_results_, scale_factor
         )
+
     def scaled_aligned_causal_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -376,6 +448,7 @@ def scaled_aligned_causal_masked_softmax_forward(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+
     def scaled_aligned_causal_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -386,6 +459,7 @@ def scaled_aligned_causal_masked_softmax_backward(
         return tex.scaled_aligned_causal_masked_softmax_backward(
             output_grad_, softmax_results_, scale_factor
         )
+
     # Other granular functions
     def layernorm_fwd(
         self,
@@ -404,6 +478,7 @@ def layernorm_fwd(
         return tex.layernorm_fwd(
             input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def layernorm_bwd(
         self,
         dz: torch.Tensor,
@@ -415,9 +490,8 @@ def layernorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.layernorm_bwd(
-            dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma
-        )
+        return tex.layernorm_bwd(dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_fwd(
         self,
         input: Any,
@@ -434,6 +508,7 @@ def rmsnorm_fwd(
         return tex.rmsnorm_fwd(
             input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
         )
+
     def rmsnorm_bwd(
         self,
         dz: torch.Tensor,
@@ -445,6 +520,7 @@ def rmsnorm_bwd(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
     def rmsnorm_bwd_add(
         self,
         dz: torch.Tensor,
@@ -465,6 +541,7 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -473,6 +550,7 @@ def split_quantize(
     ) -> List[Any]:
         tex = self._get_tex()
         return tex.split_quantize(tensor, split_sections, quantizer_list)
+
     def te_general_grouped_gemm(
         self,
         A: List[Any],
@@ -497,10 +575,25 @@ def te_general_grouped_gemm(
         D_type = tex.DType(int(D_type)) if D_type is not None else None
         bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
         return tex.te_general_grouped_gemm(
-            A, transa, B, transb, D, D_type, m_splits, bias, bias_type,
-            single_output, pre_gelu_out, grad, workspace, workspaceSizes,
-            accumulate, use_split_accumulator, math_sm_count
+            A,
+            transa,
+            B,
+            transb,
+            D,
+            D_type,
+            m_splits,
+            bias,
+            bias_type,
+            single_output,
+            pre_gelu_out,
+            grad,
+            workspace,
+            workspaceSizes,
+            accumulate,
+            use_split_accumulator,
+            math_sm_count,
         )
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -510,6 +603,7 @@ def fp8_transpose(
         tex = self._get_tex()
         dtype = tex.DType(int(dtype)) if dtype is not None else None
         return tex.fp8_transpose(input, dtype, out)
+
     def swap_first_dims(
         self,
         tensor: torch.Tensor,
@@ -517,6 +611,7 @@ def swap_first_dims(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -543,14 +638,31 @@ def get_fused_attn_backend(
         kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         result = tex.get_fused_attn_backend(
-            is_training, q_dtype, kv_dtype, qkv_layout, bias_type,
-            attn_mask_type, softmax_type, p_dropout, num_attn_heads,
-            num_gqa_groups, max_seqlen_q, max_seqlen_kv, head_dim_qk,
-            head_dim_v, window_size_left, window_size_right, return_max_logit
+            is_training,
+            q_dtype,
+            kv_dtype,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            p_dropout,
+            num_attn_heads,
+            num_gqa_groups,
+            max_seqlen_q,
+            max_seqlen_kv,
+            head_dim_qk,
+            head_dim_v,
+            window_size_left,
+            window_size_right,
+            return_max_logit,
         )
         return NVTE_Fused_Attn_Backend(result)
 
@@ -561,6 +673,7 @@ def compute_amax(
     ) -> None:
         tex = self._get_tex()
         return tex.compute_amax(input, amax)
+
     def fused_amax_and_scale_update_after_reduction(
         self,
         amax_reduction_buffer: torch.Tensor,
@@ -573,9 +686,9 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales,
-            amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
         )
+
     def fp8_block_scaling_compute_partial_amax(
         self,
         tensor: torch.Tensor,
@@ -589,6 +702,7 @@ def fp8_block_scaling_compute_partial_amax(
         return tex.fp8_block_scaling_compute_partial_amax(
             tensor, amax, h, w, start_offset, block_len
         )
+
     def fp8_block_scaling_partial_cast(
         self,
         inp: torch.Tensor,
@@ -605,6 +719,7 @@ def fp8_block_scaling_partial_cast(
         return tex.fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -613,9 +728,8 @@ def fused_multi_row_padding(
         padded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_padding(
-            input, output, input_row_list, padded_input_row_list
-        )
+        return tex.fused_multi_row_padding(input, output, input_row_list, padded_input_row_list)
+
     def fused_multi_row_unpadding(
         self,
         input: torch.Tensor,
@@ -624,9 +738,7 @@ def fused_multi_row_unpadding(
         unpadded_input_row_list: List[int],
     ) -> None:
         tex = self._get_tex()
-        return tex.fused_multi_row_unpadding(
-            input, output, input_row_list, unpadded_input_row_list
-        )
+        return tex.fused_multi_row_unpadding(input, output, input_row_list, unpadded_input_row_list)
 
     # attention kernels
     def fa_prepare_fwd(
@@ -635,6 +747,7 @@ def fa_prepare_fwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_fwd(qkvi)
+
     def fa_prepare_bwd(
         self,
         q: torch.Tensor,
@@ -643,6 +756,7 @@ def fa_prepare_bwd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.fa_prepare_bwd(q, k, v)
+
     def fused_attn_fwd(
         self,
         max_seqlen_q: int,
@@ -678,8 +792,12 @@ def fused_attn_fwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
 
         return tex.fused_attn_fwd(
             max_seqlen_q,
@@ -709,8 +827,9 @@ def fused_attn_fwd(
             SoftmaxOffset,
             rng_gen,
             rng_elts_per_thread,
-            return_max_logit
+            return_max_logit,
         )
+
     def fused_attn_bwd(
         self,
         max_seqlen_q: int,
@@ -744,8 +863,12 @@ def fused_attn_bwd(
 
         qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
         bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
-        attn_mask_type = tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
-        softmax_type = tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
         dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
 
         return tex.fused_attn_bwd(
@@ -774,8 +897,9 @@ def fused_attn_bwd(
             cu_seqlens_kv_padded,
             s_quantizer,
             dp_quantizer,
-            dqkv_quantizer
+            dqkv_quantizer,
         )
+
     def copy_to_kv_cache(
         self,
         new_k: torch.Tensor,
@@ -807,8 +931,9 @@ def copy_to_kv_cache(
             max_ctx_len,
             max_seq_len,
             max_pages_per_seq,
-            is_non_paged
+            is_non_paged,
         )
+
     def convert_thd_to_bshd(
         self,
         tensor: torch.Tensor,
@@ -818,6 +943,7 @@ def convert_thd_to_bshd(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+
     def convert_bshd_to_thd(
         self,
         tensor: torch.Tensor,
@@ -842,9 +968,9 @@ def fused_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
@@ -858,9 +984,9 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format,
-            interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
         )
+
     def fused_qkv_rope_forward(
         self,
         qkv_input: torch.Tensor,
@@ -876,10 +1002,17 @@ def fused_qkv_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_forward(
-            qkv_input, q_freqs, k_freqs, start_positions,
-            qkv_split_arg_list, qkv_format, interleaved,
-            cp_size, cp_rank
+            qkv_input,
+            q_freqs,
+            k_freqs,
+            start_positions,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
+
     def fused_qkv_rope_backward(
         self,
         q_grad_out: torch.Tensor,
@@ -896,9 +1029,16 @@ def fused_qkv_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_qkv_rope_backward(
-            q_grad_out, k_grad_out, v_grad_out,
-            q_freqs, k_freqs, qkv_split_arg_list,
-            qkv_format, interleaved, cp_size, cp_rank
+            q_grad_out,
+            k_grad_out,
+            v_grad_out,
+            q_freqs,
+            k_freqs,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
         )
 
     # fused router
@@ -924,6 +1064,7 @@ def fused_topk_with_score_function_fwd(
             score_function,
             expert_bias,
         )
+
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -948,6 +1089,7 @@ def fused_topk_with_score_function_bwd(
             scaling_factor,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
@@ -960,6 +1102,7 @@ def fused_score_for_moe_aux_loss_fwd(
             topk,
             score_function,
         )
+
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -978,6 +1121,7 @@ def fused_score_for_moe_aux_loss_bwd(
             topk,
             score_function,
         )
+
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -1000,6 +1144,7 @@ def fused_moe_aux_loss_fwd(
             topk,
             coeff,
         )
+
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -1009,7 +1154,9 @@ def fused_moe_aux_loss_bwd(
         grad_aux_loss: torch.Tensor,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.fused_moe_aux_loss_bwd(Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss)
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
 
     # Dropout
     def dropout_fwd(
@@ -1020,6 +1167,7 @@ def dropout_fwd(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.dropout_fwd(input, dropout_probability, out)
+
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
@@ -1034,9 +1182,11 @@ def dropout_bwd(
     def get_cublasLt_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cublasLt_version()
+
     def get_cudnn_version(self) -> int:
         tex = self._get_tex()
         return tex.get_cudnn_version()
+
     def get_num_cublas_streams(self) -> int:
         tex = self._get_tex()
         return tex.get_num_cublas_streams()
@@ -1050,6 +1200,7 @@ def thd_read_half_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+
     def thd_second_half_lse_correction(
         self,
         lse: torch.Tensor,
@@ -1058,9 +1209,8 @@ def thd_second_half_lse_correction(
         lse_packed: bool,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_second_half_lse_correction(
-            lse, lse_per_step, cu_seqlens, lse_packed
-        )
+        return tex.thd_second_half_lse_correction(lse, lse_per_step, cu_seqlens, lse_packed)
+
     def thd_read_second_half_lse(
         self,
         lse: torch.Tensor,
@@ -1069,9 +1219,8 @@ def thd_read_second_half_lse(
         second_half_lse_seqlen: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_read_second_half_lse(
-            lse, cu_seqlens, lse_packed, second_half_lse_seqlen
-        )
+        return tex.thd_read_second_half_lse(lse, cu_seqlens, lse_packed, second_half_lse_seqlen)
+
     def thd_out_correction(
         self,
         out: torch.Tensor,
@@ -1084,9 +1233,9 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step,
-            cu_seqlens, only_second_half, lse_packed
+            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
         )
+
     def thd_grad_correction(
         self,
         grad: torch.Tensor,
@@ -1096,10 +1245,8 @@ def thd_grad_correction(
         second_half: str,
     ) -> None:
         tex = self._get_tex()
-        return tex.thd_grad_correction(
-            grad, grad_per_step, cu_seqlens,
-            first_half, second_half
-        )
+        return tex.thd_grad_correction(grad, grad_per_step, cu_seqlens, first_half, second_half)
+
     def thd_get_partitioned_indices(
         self,
         cu_seqlens: torch.Tensor,
@@ -1108,9 +1255,7 @@ def thd_get_partitioned_indices(
         rank: int,
     ) -> torch.Tensor:
         tex = self._get_tex()
-        return tex.thd_get_partitioned_indices(
-            cu_seqlens, total_tokens, world_size, rank
-        )
+        return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, world_size, rank)
 
     # nvshmem functions
     def init_nvshmem_backend(
@@ -1119,6 +1264,7 @@ def init_nvshmem_backend(
     ) -> None:
         tex = self._get_tex()
         return tex.init_nvshmem_backend(process_group)
+
     def create_nvshmem_tensor(
         self,
         shape: List[int],
@@ -1126,6 +1272,7 @@ def create_nvshmem_tensor(
     ) -> torch.Tensor:
         tex = self._get_tex()
         return tex.create_nvshmem_tensor(shape, dtype)
+
     def nvshmem_send_on_current_stream(
         self,
         src: torch.Tensor,
@@ -1135,6 +1282,7 @@ def nvshmem_send_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+
     def nvshmem_wait_on_current_stream(
         self,
         signal: torch.Tensor,
@@ -1142,6 +1290,7 @@ def nvshmem_wait_on_current_stream(
     ) -> None:
         tex = self._get_tex()
         return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
+
     def nvshmem_finalize(self) -> None:
         tex = self._get_tex()
         return tex.nvshmem_finalize()
@@ -1156,6 +1305,7 @@ def multi_tensor_scale(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1165,6 +1315,7 @@ def multi_tensor_l2norm(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         tex = self._get_tex()
         return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
@@ -1177,6 +1328,7 @@ def multi_tensor_unscale_l2norm(
         return tex.multi_tensor_unscale_l2norm(
             chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
         )
+
     def multi_tensor_adam(
         self,
         chunk_size: int,
@@ -1193,10 +1345,19 @@ def multi_tensor_adam(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_param_remainder(
         self,
         chunk_size: int,
@@ -1213,10 +1374,19 @@ def multi_tensor_adam_param_remainder(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_param_remainder(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
         )
+
     def multi_tensor_adam_fp8(
         self,
         chunk_size: int,
@@ -1235,11 +1405,20 @@ def multi_tensor_adam_fp8(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.multi_tensor_adam_fp8(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            fp8_dtype
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            fp8_dtype,
         )
+
     def multi_tensor_adam_capturable(
         self,
         chunk_size: int,
@@ -1257,11 +1436,20 @@ def multi_tensor_adam_capturable(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_adam_capturable_master(
         self,
         chunk_size: int,
@@ -1279,11 +1467,20 @@ def multi_tensor_adam_capturable_master(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_adam_capturable_master(
-            chunk_size, noop_flag, tensor_lists,
-            lr, beta1, beta2, epsilon,
-            step, mode, bias_correction, weight_decay,
-            inv_scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
         )
+
     def multi_tensor_sgd(
         self,
         chunk_size: int,
@@ -1300,11 +1497,19 @@ def multi_tensor_sgd(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_sgd(
-            chunk_size, noop_flag, tensor_lists,
-            wd, momentum, dampening,
-            lr, nesterov, first_run,
-            wd_after_momentum, scale
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            wd,
+            momentum,
+            dampening,
+            lr,
+            nesterov,
+            first_run,
+            wd_after_momentum,
+            scale,
         )
+
     def multi_tensor_compute_scale_and_scale_inv(
         self,
         chunk_size: int,
@@ -1316,8 +1521,7 @@ def multi_tensor_compute_scale_and_scale_inv(
     ) -> None:
         tex = self._get_tex()
         return tex.multi_tensor_compute_scale_and_scale_inv(
-            chunk_size, noop_flag, tensor_lists,
-            max_fp8, force_pow_2_scales, epsilon
+            chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
     # Comm+GEMM Overlap
@@ -1328,15 +1532,20 @@ def bulk_overlap_ag_with_external_gemm(
         recv_stream: Any,
     ) -> Any:
         tex = self._get_tex()
-        return tex.bulk_overlap_ag_with_external_gemm(allgather_communicator, send_stream, recv_stream)
+        return tex.bulk_overlap_ag_with_external_gemm(
+            allgather_communicator, send_stream, recv_stream
+        )
 
-############## class func #################################
+    ############## class func #################################
     def get_flash_attention_class(self):
         from .flash_attention import FlashAttentionMETAX
+
         return FlashAttentionMETAX
+
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         tex = self._get_tex()
         return tex.FP8TensorMeta()
+
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
@@ -1344,6 +1553,7 @@ def create_comm_overlap_helper(
     ) -> "CommOverlapHelper":
         tex = self._get_tex()
         return tex.CommOverlapHelper(world_group, intra_node_group)
+
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1362,11 +1572,21 @@ def create_comm_overlap(
     ) -> "CommOverlap":
         tex = self._get_tex()
         return tex.CommOverlap(
-            buffer_shape, buffer_dtype, helper, tp_size,
-            num_splits, num_max_streams, comm_cga_size,
-            gemm_priority, comm_priority, num_comm_sm,
-            set_sm_margin, atomic_gemm, rs_overlap_first_gemm
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            num_splits,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            rs_overlap_first_gemm,
         )
+
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1386,7 +1606,18 @@ def create_comm_overlap_p2p(
     ) -> "CommOverlapP2P":
         tex = self._get_tex()
         return tex.CommOverlapP2P(
-            buffer_shape, buffer_dtype, helper, tp_size, comm_type,
-            num_max_streams, comm_cga_size, gemm_priority, comm_priority,
-            num_comm_sm, set_sm_margin, atomic_gemm, use_ce, aggregate
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            comm_type,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            use_ce,
+            aggregate,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
index a404bbbdc7..fd6c0cdafd 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
@@ -17,9 +17,11 @@
 
 def _bind_is_available(fn, is_available_fn):
     """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
     @functools.wraps(fn)
     def wrapper(*args, **kwargs):
         return fn(*args, **kwargs)
+
     wrapper._is_available = is_available_fn
     return wrapper
 
@@ -46,159 +48,908 @@ def register_builtins(registry) -> None:
 
     impls = [
         # Normalization
-        OpImpl(op_name="rmsnorm_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="rmsnorm_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="rmsnorm_bwd_add", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="layernorm_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="layernorm_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.layernorm_bwd, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd_add",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # GEMM
-        OpImpl(op_name="generic_gemm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.generic_gemm, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="te_general_grouped_gemm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="generic_gemm",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.generic_gemm, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Quantization
-        OpImpl(op_name="quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.quantize, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dequantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dequantize, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="bgrad_quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bgrad_quantize, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="split_quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.split_quantize, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="quantize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.quantize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dequantize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dequantize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_quantize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_quantize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="split_quantize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.split_quantize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Activations - Forward
-        OpImpl(op_name="gelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.gelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="geglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.geglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="qgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="qgeglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.qgeglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="relu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.relu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="reglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.reglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="srelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.srelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="sreglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.sreglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="silu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.silu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="swiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swiglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="clamped_swiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_swiglu, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="gelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.gelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="geglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.geglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgeglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgeglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="relu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.relu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="reglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.reglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="srelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.srelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="sreglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.sreglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="silu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.silu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swiglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swiglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_swiglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_swiglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Activations - Backward
-        OpImpl(op_name="dgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dgeglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dgeglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dqgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dqgeglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dqgeglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="drelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.drelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dreglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dreglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dsrelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsrelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dsreglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsreglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dsilu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dsilu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dswiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dswiglu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="clamped_dswiglu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.clamped_dswiglu, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="dgelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dgeglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgeglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgeglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgeglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="drelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.drelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dreglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dreglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsrelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsrelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsreglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsreglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsilu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsilu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dswiglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dswiglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_dswiglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_dswiglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Activations - Bias + Backward
-        OpImpl(op_name="dbias_dgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dgelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dbias_dsilu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsilu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dbias_drelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_drelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dbias_dqgelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dqgelu, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dbias_dsrelu", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dbias_dsrelu, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="dbias_dgelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dgelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsilu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsilu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_drelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_drelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dqgelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dqgelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsrelu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsrelu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Softmax
-        OpImpl(op_name="scaled_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_forward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="scaled_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_softmax_backward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="scaled_masked_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="scaled_upper_triang_masked_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="scaled_aligned_causal_masked_softmax_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="scaled_softmax_forward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_forward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_softmax_backward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_backward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_forward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_backward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_forward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_backward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_forward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_backward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # MOE operations
-        OpImpl(op_name="moe_permute_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="moe_permute_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_permute_bwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="moe_unpermute_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="moe_unpermute_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="moe_permute_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_permute_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Fused attention
-        OpImpl(op_name="get_fused_attn_backend", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_fused_attn_backend, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_attn_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_attn_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_attn_bwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fa_prepare_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fa_prepare_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fa_prepare_bwd, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="get_fused_attn_backend",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_fused_attn_backend, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # KV cache
-        OpImpl(op_name="copy_to_kv_cache", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.copy_to_kv_cache, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="copy_to_kv_cache",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.copy_to_kv_cache, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Tensor format conversions
-        OpImpl(op_name="convert_thd_to_bshd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="convert_bshd_to_thd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="convert_thd_to_bshd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_bshd_to_thd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # RoPE (Rotary Position Embedding)
-        OpImpl(op_name="fused_rope_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_forward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_rope_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_rope_backward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_qkv_rope_forward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_qkv_rope_backward", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="fused_rope_forward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_forward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_rope_backward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_backward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_forward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_backward",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # TopK and MOE aux loss
-        OpImpl(op_name="fused_topk_with_score_function_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_topk_with_score_function_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_score_for_moe_aux_loss_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_moe_aux_loss_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="fused_topk_with_score_function_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_topk_with_score_function_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Dropout
-        OpImpl(op_name="dropout_fwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_fwd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="dropout_bwd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.dropout_bwd, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="dropout_fwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_fwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dropout_bwd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_bwd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # FP8 operations
-        OpImpl(op_name="fp8_transpose", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_transpose, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="swap_first_dims", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.swap_first_dims, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="compute_amax", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.compute_amax, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_amax_and_scale_update_after_reduction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fp8_block_scaling_compute_partial_amax", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fp8_block_scaling_partial_cast", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="fp8_transpose",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_transpose, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swap_first_dims",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swap_first_dims, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="compute_amax",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.compute_amax, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_amax_and_scale_update_after_reduction",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_compute_partial_amax",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_partial_cast",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Padding operations
-        OpImpl(op_name="fused_multi_row_padding", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_padding, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="fused_multi_row_unpadding", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="fused_multi_row_padding",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_padding, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_multi_row_unpadding",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Library version getters
-        OpImpl(op_name="get_cublasLt_version", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cublasLt_version, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="get_cudnn_version", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_cudnn_version, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="get_num_cublas_streams", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_num_cublas_streams, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="get_cublasLt_version",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cublasLt_version, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_cudnn_version",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cudnn_version, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_num_cublas_streams",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_num_cublas_streams, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # THD (Tensor, Hidden, Dimension) operations
-        OpImpl(op_name="thd_read_half_tensor", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_half_tensor, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="thd_second_half_lse_correction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="thd_read_second_half_lse", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="thd_out_correction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_out_correction, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="thd_grad_correction", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_grad_correction, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="thd_get_partitioned_indices", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="thd_read_half_tensor",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_half_tensor, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_second_half_lse_correction",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_read_second_half_lse",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_out_correction",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_out_correction, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_grad_correction",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_grad_correction, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_get_partitioned_indices",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # NVSHMEM operations
-        OpImpl(op_name="init_nvshmem_backend", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.init_nvshmem_backend, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="create_nvshmem_tensor", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="nvshmem_send_on_current_stream", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="nvshmem_wait_on_current_stream", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="nvshmem_finalize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.nvshmem_finalize, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="init_nvshmem_backend",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.init_nvshmem_backend, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_nvshmem_tensor",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_send_on_current_stream",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_wait_on_current_stream",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_finalize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_finalize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Multi-tensor operations
-        OpImpl(op_name="multi_tensor_quantize", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_quantize, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_scale", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_scale, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_l2norm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_unscale_l2norm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_adam", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_adam_param_remainder", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_adam_fp8", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_adam_capturable_master", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_sgd", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_sgd, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="multi_tensor_compute_scale_and_scale_inv", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="multi_tensor_quantize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_quantize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_scale",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_l2norm",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_unscale_l2norm",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_param_remainder",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_fp8",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable_master",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_sgd",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_sgd, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_and_scale_inv",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Communication overlap operations
-        OpImpl(op_name="bulk_overlap_ag_with_external_gemm", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="create_fp8_tensor_meta", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="create_comm_overlap_helper", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="create_comm_overlap", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap, is_avail), vendor="METAX", priority=100),
-        OpImpl(op_name="create_comm_overlap_p2p", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail), vendor="METAX", priority=100),
-
+        OpImpl(
+            op_name="bulk_overlap_ag_with_external_gemm",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_fp8_tensor_meta",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_helper",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_p2p",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # FlashAttention class getter
-        OpImpl(op_name="get_flash_attention_class", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_flash_attention_class, is_avail), vendor="METAX", priority=100),
-                # Attention backend selection
-        OpImpl(op_name="get_attention_backend", impl_id="vendor.metax", kind=BackendImplKind.VENDOR, fn=_bind_is_available(backend.get_attention_backend, is_avail), vendor="METAX", priority=100),
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        # Attention backend selection
+        OpImpl(
+            op_name="get_attention_backend",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_attention_backend, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/builtin_ops.py b/transformer_engine/plugin/core/builtin_ops.py
index 0937a3649e..c194a543f3 100644
--- a/transformer_engine/plugin/core/builtin_ops.py
+++ b/transformer_engine/plugin/core/builtin_ops.py
@@ -29,20 +29,23 @@ def register_builtins(registry: OpRegistry) -> None:
     # Register FlagOS (DEFAULT) implementations
     try:
         from .backends.flagos.register_ops import register_builtins as register_flagos
+
         register_flagos(registry)
     except Exception as e:
         print(f"[WARNING] Failed to register FlagOS operators: {e}")
-    
+
     # Register PyTorch (REFERENCE) implementations
     try:
         from .backends.reference.register_ops import register_builtins as register_reference
+
         register_reference(registry)
     except Exception as e:
         print(f"[WARNING] Failed to register Reference operators: {e}")
-    
+
     # Register CUDA (VENDOR) implementations
     try:
         from .backends.vendor.cuda.register_ops import register_builtins as register_cuda
+
         register_cuda(registry)
     except Exception as e:
         # CUDA may not be available, this is expected
@@ -51,6 +54,7 @@ def register_builtins(registry: OpRegistry) -> None:
     # Register HYGON (VENDOR) implementations
     try:
         from .backends.vendor.hygon.register_ops import register_builtins as register_hygon
+
         register_hygon(registry)
     except Exception as e:
         # HYGON may not be available, this is expected
@@ -59,6 +63,7 @@ def register_builtins(registry: OpRegistry) -> None:
     # Register Metax (VENDOR) implementations
     try:
         from .backends.vendor.metax.register_ops import register_builtins as register_metax
+
         register_metax(registry)
     except Exception as e:
         # Metax may not be available, this is expected
@@ -67,15 +72,17 @@ def register_builtins(registry: OpRegistry) -> None:
     # Register KUNLUNXIN (VENDOR) implementations
     try:
         from .backends.vendor.kunlunxin.register_ops import register_builtins as register_kunlunxin
+
         register_kunlunxin(registry)
     except Exception as e:
         # KunLunXin may not be available, this is expected
         pass
-    
+
     # Register Iluvatar (VENDOR) implementations
     try:
         from .backends.vendor.iluvatar.register_ops import register_builtins as register_iluvatar
+
         register_iluvatar(registry)
     except Exception as e:
         # Iluvatar may not be available, this is expected
-        pass
\ No newline at end of file
+        pass
diff --git a/transformer_engine/plugin/core/discovery.py b/transformer_engine/plugin/core/discovery.py
index cc6280eda7..cfde3f4774 100644
--- a/transformer_engine/plugin/core/discovery.py
+++ b/transformer_engine/plugin/core/discovery.py
@@ -19,18 +19,23 @@
 
 _discovered_plugin: List[Tuple[str, str, bool]] = []
 
+
 def _log_debug(msg: str) -> None:
     logger.debug(msg)
 
+
 def _log_info(msg: str) -> None:
     logger.info(msg)
 
+
 def _log_warning(msg: str) -> None:
     logger.warning(msg)
 
+
 def _log_error(msg: str) -> None:
     logger.error(msg)
 
+
 def _get_entry_points():
     try:
         from importlib.metadata import entry_points
@@ -59,6 +64,7 @@ def _get_entry_points():
         _log_warning(f"Error accessing entry points: {e}")
         return []
 
+
 def _call_register_function(
     obj: Any,
     registry_module: Any,
@@ -87,6 +93,7 @@ def _call_register_function(
     _log_debug(f"No register function found in {source_name}")
     return False
 
+
 def discover_from_entry_points(registry_module: Any) -> int:
     loaded = 0
     entry_points_list = _get_entry_points()
@@ -115,6 +122,7 @@ def discover_from_entry_points(registry_module: Any) -> int:
 
     return loaded
 
+
 def discover_from_env_modules(registry_module: Any) -> int:
     modules_str = os.environ.get(PLUGIN_MODULES_ENV, "").strip()
 
@@ -146,6 +154,7 @@ def discover_from_env_modules(registry_module: Any) -> int:
 
     return loaded
 
+
 def discover_plugin(registry_module: Any) -> int:
     """
     Main plugin discovery function.
@@ -176,15 +185,16 @@ def discover_plugin(registry_module: Any) -> int:
 
     return total
 
+
 # Alias for compatibility with different naming conventions
 discover_op_plugin = discover_plugin
 
+
 def get_discovered_plugin() -> List[Tuple[str, str, bool]]:
     """Get list of discovered plugin (name, source, success)"""
     return _discovered_plugin.copy()
 
+
 def clear_discovered_plugin() -> None:
     """Clear the discovered plugin list (for testing)"""
     _discovered_plugin.clear()
-
-
diff --git a/transformer_engine/plugin/core/logger_manager.py b/transformer_engine/plugin/core/logger_manager.py
index 682122c346..899d067e3e 100644
--- a/transformer_engine/plugin/core/logger_manager.py
+++ b/transformer_engine/plugin/core/logger_manager.py
@@ -7,6 +7,7 @@
 import os
 import threading
 
+
 class Logger:
     def __init__(self, name, level=logging.INFO):
         self.logger = logging.getLogger(name)
@@ -60,12 +61,13 @@ def debug_once(self, message):
             self._printed_once.add(message)
             self.logger.debug(message, stacklevel=2)
 
+
 class LoggerManager:
     _instance = None
     _lock = threading.Lock()
 
     def __init__(self):
-        if hasattr(self, '_global_logger'):
+        if hasattr(self, "_global_logger"):
             return
 
         self._global_logger = None
@@ -114,11 +116,14 @@ def reset(self):
                 self._global_logger = None
                 self._global_printed_once.clear()
 
+
 def get_logger():
     return LoggerManager.get_instance().get_logger()
 
+
 def print_once(message):
     LoggerManager.get_instance().print_once(message)
 
+
 def debug_print_once(func_name: str, backend_name: str = "Backend", *args, **kwargs):
-    LoggerManager.get_instance().debug_print_once(func_name, backend_name, *args, **kwargs)
\ No newline at end of file
+    LoggerManager.get_instance().debug_print_once(func_name, backend_name, *args, **kwargs)
diff --git a/transformer_engine/plugin/core/manager.py b/transformer_engine/plugin/core/manager.py
index 66a9ad8d9b..0a53c11f31 100644
--- a/transformer_engine/plugin/core/manager.py
+++ b/transformer_engine/plugin/core/manager.py
@@ -21,6 +21,7 @@
 @dataclass
 class _OpManagerState:
     """Internal state for OpManager"""
+
     init_pid: int = -1
     initialized: bool = False
     policy_epoch: int = 0
@@ -103,6 +104,7 @@ def ensure_initialized(self) -> None:
 
             # Register built-in operators
             from . import builtin_ops
+
             builtin_ops.register_builtins(self._registry)
 
             # Discover and register plugin
@@ -117,21 +119,39 @@ def ensure_initialized(self) -> None:
             total_ops = len(snap.impls_by_op)
             total_impls = sum(len(impls) for impls in snap.impls_by_op.values())
 
-            logger.info(f"OpManager initialized: {total_ops} ops with {total_impls} implementations")
+            logger.info(
+                f"OpManager initialized: {total_ops} ops with {total_impls} implementations"
+            )
 
             # Group implementations by kind for summary
-            vendor_count = sum(1 for impls in snap.impls_by_op.values()
-                             for impl in impls if impl.kind == BackendImplKind.VENDOR)
-            reference_count = sum(1 for impls in snap.impls_by_op.values()
-                                for impl in impls if impl.kind == BackendImplKind.REFERENCE)
-            default_count = sum(1 for impls in snap.impls_by_op.values()
-                              for impl in impls if impl.kind == BackendImplKind.DEFAULT)
+            vendor_count = sum(
+                1
+                for impls in snap.impls_by_op.values()
+                for impl in impls
+                if impl.kind == BackendImplKind.VENDOR
+            )
+            reference_count = sum(
+                1
+                for impls in snap.impls_by_op.values()
+                for impl in impls
+                if impl.kind == BackendImplKind.REFERENCE
+            )
+            default_count = sum(
+                1
+                for impls in snap.impls_by_op.values()
+                for impl in impls
+                if impl.kind == BackendImplKind.DEFAULT
+            )
 
-            logger.debug(f"  Vendor: {vendor_count}, Default: {default_count}, Reference: {reference_count}")
+            logger.debug(
+                f"  Vendor: {vendor_count}, Default: {default_count}, Reference: {reference_count}"
+            )
 
             # List all registered impl_ids
             if logger.logger.isEnabledFor(logger.logger.level):
-                impl_ids = sorted(set(impl.impl_id for impls in snap.impls_by_op.values() for impl in impls))
+                impl_ids = sorted(
+                    set(impl.impl_id for impls in snap.impls_by_op.values() for impl in impls)
+                )
                 logger.info(f"Registered impl_ids: {impl_ids}")
 
     def _matches_vendor_filters(self, impl: OpImpl, policy: SelectionPolicy) -> bool:
@@ -374,7 +394,8 @@ def call(self, op_name: str, *args, **kwargs):
             except Exception as e:
                 if enable_fallback:
                     logger.warning_once(
-                        f"Cached implementation '{cached_impl.impl_id}' failed for op '{op_name}': {e}"
+                        f"Cached implementation '{cached_impl.impl_id}' failed for op"
+                        f" '{op_name}': {e}"
                     )
                     self._invalidate_cache(op_name)
                 else:
@@ -397,8 +418,9 @@ def call(self, op_name: str, *args, **kwargs):
                         )
                     elif last_impl_id != candidate.impl_id:
                         logger.info_once(
-                            f"Op '{op_name}' switched from '{last_impl_id}' to '{candidate.impl_id}' "
-                            f"(kind={candidate.kind.value}, vendor={candidate.vendor})"
+                            f"Op '{op_name}' switched from '{last_impl_id}' to"
+                            f" '{candidate.impl_id}' (kind={candidate.kind.value},"
+                            f" vendor={candidate.vendor})"
                         )
                     break
 
@@ -477,7 +499,8 @@ def call_with_custom_impl(
             except Exception as e:
                 if enable_fallback:
                     logger.warning_once(
-                        f"Cached implementation '{cached_impl.impl_id}' failed for op '{op_name}': {e}"
+                        f"Cached implementation '{cached_impl.impl_id}' failed for op"
+                        f" '{op_name}': {e}"
                     )
                     self._invalidate_cache(op_name)
                 else:
@@ -502,8 +525,8 @@ def call_with_custom_impl(
                             )
                         elif last_impl_id != impl.impl_id:
                             logger.info_once(
-                                f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}' "
-                                f"(kind={impl.kind.value}, vendor={impl.vendor})"
+                                f"Op '{op_name}' switched from '{last_impl_id}' to '{impl.impl_id}'"
+                                f" (kind={impl.kind.value}, vendor={impl.vendor})"
                             )
                         return result
                 except Exception:
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 74357394e8..7e39bef7a3 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -9,8 +9,10 @@
 import torch
 
 from .logger_manager import get_logger
+
 logger = get_logger()
 
+
 ################### Enums ###################
 class DType(IntEnum):
     kByte = 0
@@ -26,10 +28,12 @@ class DType(IntEnum):
     kFloat4E2M1 = 10
     kNumTypes = 11
 
+
 class Float8BlockScaleTensorFormat(IntEnum):
     GEMM_READY = 0
     COMPACT = 1
 
+
 class NVTE_Activation_Type(IntEnum):
     GELU = 0
     GEGLU = 1
@@ -43,15 +47,18 @@ class NVTE_Activation_Type(IntEnum):
     SREGLU = 9
     CLAMPED_SWIGLU = 10
 
+
 class NVTE_Softmax_Type(IntEnum):
     NVTE_VANILLA_SOFTMAX = 0
     NVTE_OFF_BY_ONE_SOFTMAX = 1
     NVTE_LEARNABLE_SOFTMAX = 2
 
+
 class CommGemmOverlapRole(IntEnum):
     INPUT = 0
     OUTPUT = 1
 
+
 class FP8FwdTensors(IntEnum):
     GEMM1_INPUT = 0
     GEMM1_WEIGHT = 1
@@ -63,6 +70,7 @@ class FP8FwdTensors(IntEnum):
     GEMM3_WEIGHT = 7
     GEMM3_OUTPUT = 8
 
+
 class FP8BwdTensors(IntEnum):
     GRAD_OUTPUT1 = 0
     GRAD_INPUT1 = 1
@@ -71,12 +79,14 @@ class FP8BwdTensors(IntEnum):
     GRAD_OUTPUT3 = 4
     GRAD_INPUT3 = 5
 
+
 class NVTE_Bias_Type(IntEnum):
     NVTE_NO_BIAS = 0
     NVTE_PRE_SCALE_BIAS = 1
     NVTE_POST_SCALE_BIAS = 2
     NVTE_ALIBI = 3
 
+
 class NVTE_Mask_Type(IntEnum):
     NVTE_NO_MASK = 0
     NVTE_PADDING_MASK = 1
@@ -85,12 +95,14 @@ class NVTE_Mask_Type(IntEnum):
     NVTE_CAUSAL_BOTTOM_RIGHT_MASK = 4
     NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK = 5
 
+
 class NVTE_Fused_Attn_Backend(IntEnum):
     NVTE_No_Backend = -1
     NVTE_F16_max512_seqlen = 0
     NVTE_F16_arbitrary_seqlen = 1
     NVTE_FP8 = 2
 
+
 class NVTE_QKV_Format(IntEnum):
     NVTE_SBHD = 0
     NVTE_BSHD = 1
@@ -100,6 +112,7 @@ class NVTE_QKV_Format(IntEnum):
     NVTE_THD_2BSHD = 5
     NVTE_THD_2SBHD = 6
 
+
 class NVTE_QKV_Layout(IntEnum):
     NVTE_SB3HD = 0
     NVTE_SBH3D = 1
@@ -127,10 +140,12 @@ class NVTE_QKV_Layout(IntEnum):
     NVTE_Paged_KV_THD_BSHD_BSHD = 23
     NVTE_Paged_KV_THD_SBHD_SBHD = 24
 
+
 class CommOverlapType(IntEnum):
     RS = 0
     AG = 1
 
+
 class CommOverlapAlgo(IntEnum):
     BULK_OVERLAP_AG = 0
     BULK_OVERLAP_RS = 1
@@ -142,40 +157,54 @@ class CommOverlapAlgo(IntEnum):
     ATOMIC_GEMM_RS_P2P = 7
     EXTERNAL_BULK_OVERLAP_AG = 8
 
+
 ############ Class #################
 
+
 class FP8TensorMeta:
     """
     FP8TensorMeta wrapper that routes to the appropriate backend implementation.
     """
+
     def __new__(cls, *args, **kwargs):
         from .manager import get_default_manager
+
         return get_default_manager().call("create_fp8_tensor_meta", *args, **kwargs)
 
+
 class CommOverlapHelper:
     """
     CommOverlapHelper wrapper that routes to the appropriate backend implementation.
     """
+
     def __new__(cls, *args, **kwargs):
         from .manager import get_default_manager
+
         return get_default_manager().call("create_comm_overlap_helper", *args, **kwargs)
 
+
 class CommOverlap:
     """
     CommOverlap wrapper that routes to the appropriate backend implementation.
     """
+
     def __new__(cls, *args, **kwargs):
         from .manager import get_default_manager
+
         return get_default_manager().call("create_comm_overlap", *args, **kwargs)
 
+
 class CommOverlapP2P:
     """
     CommOverlapP2P wrapper that routes to the appropriate backend implementation.
     """
+
     def __new__(cls, *args, **kwargs):
         from .manager import get_default_manager
+
         return get_default_manager().call("create_comm_overlap_p2p", *args, **kwargs)
 
+
 class FlashAttentionBase(torch.nn.Module, ABC):
     def __init__(
         self,
@@ -352,6 +381,7 @@ def call_impl_fn(impl_class):
     def backend_name(self) -> str:
         return self.__class__.__name__
 
+
 ############ Base  ###################
 class TEFLBackendBase(ABC):
     @abstractmethod
@@ -361,7 +391,7 @@ def is_available(self) -> bool:
     def get_attention_backend(self, attention_params=None):
         raise NotImplementedError
 
-##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
+    ##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
     def quantize(
         self,
         tensor: torch.Tensor,
@@ -419,24 +449,28 @@ def gelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def geglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def qgelu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def qgeglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     # ReLU and variants #
     def relu(
         self,
@@ -444,24 +478,28 @@ def relu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def reglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def srelu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def sreglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     # SwiGLU and variants #
     def silu(
         self,
@@ -469,12 +507,14 @@ def silu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def swiglu(
         self,
         input: torch.Tensor,
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def clamped_swiglu(
         self,
         input: torch.Tensor,
@@ -483,6 +523,7 @@ def clamped_swiglu(
         alpha: float = 1.702,
     ) -> Any:
         raise NotImplementedError
+
     # Backward of GELU and variants #
     def dgelu(
         self,
@@ -491,6 +532,7 @@ def dgelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def dgeglu(
         self,
         grad: torch.Tensor,
@@ -498,6 +540,7 @@ def dgeglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def dqgelu(
         self,
         grad: torch.Tensor,
@@ -505,6 +548,7 @@ def dqgelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def dqgeglu(
         self,
         grad: torch.Tensor,
@@ -512,6 +556,7 @@ def dqgeglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     # Backward of ReLU and variants #
     def drelu(
         self,
@@ -520,6 +565,7 @@ def drelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def dreglu(
         self,
         grad: torch.Tensor,
@@ -527,6 +573,7 @@ def dreglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def dsrelu(
         self,
         grad: torch.Tensor,
@@ -534,6 +581,7 @@ def dsrelu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def dsreglu(
         self,
         grad: torch.Tensor,
@@ -541,6 +589,7 @@ def dsreglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     # Backward of SiLU and variants #
     def dsilu(
         self,
@@ -549,6 +598,7 @@ def dsilu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def dswiglu(
         self,
         grad: torch.Tensor,
@@ -556,6 +606,7 @@ def dswiglu(
         quantizer: Any,
     ) -> Any:
         raise NotImplementedError
+
     def clamped_dswiglu(
         self,
         grad: torch.Tensor,
@@ -565,6 +616,7 @@ def clamped_dswiglu(
         alpha: float = 1.702,
     ) -> Any:
         raise NotImplementedError
+
     # DBias + DAct fusions #
     def dbias_dgelu(
         self,
@@ -573,6 +625,7 @@ def dbias_dgelu(
         quantizer: Any,
     ) -> List[Any]:
         raise NotImplementedError
+
     def dbias_dsilu(
         self,
         grad: torch.Tensor,
@@ -580,6 +633,7 @@ def dbias_dsilu(
         quantizer: Any,
     ) -> List[Any]:
         raise NotImplementedError
+
     def dbias_drelu(
         self,
         grad: torch.Tensor,
@@ -587,6 +641,7 @@ def dbias_drelu(
         quantizer: Any,
     ) -> List[Any]:
         raise NotImplementedError
+
     def dbias_dqgelu(
         self,
         grad: torch.Tensor,
@@ -594,6 +649,7 @@ def dbias_dqgelu(
         quantizer: Any,
     ) -> List[Any]:
         raise NotImplementedError
+
     def dbias_dsrelu(
         self,
         grad: torch.Tensor,
@@ -601,7 +657,8 @@ def dbias_dsrelu(
         quantizer: Any,
     ) -> List[Any]:
         raise NotImplementedError
-    # Permutation functions                                                                                                                                                                                                                                                                                             
+
+    # Permutation functions
     def moe_permute_fwd(
         self,
         input: torch.Tensor,
@@ -612,6 +669,7 @@ def moe_permute_fwd(
         max_expanded_token_num: int,
     ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
         raise NotImplementedError
+
     def moe_permute_bwd(
         self,
         input: torch.Tensor,
@@ -622,6 +680,7 @@ def moe_permute_bwd(
         topK: int,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def moe_unpermute_fwd(
         self,
         input: torch.Tensor,
@@ -632,6 +691,7 @@ def moe_unpermute_fwd(
         topK: int,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def moe_unpermute_bwd(
         self,
         input_bwd: torch.Tensor,
@@ -641,6 +701,7 @@ def moe_unpermute_bwd(
         prob: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     # Softmax functions
     def scaled_softmax_forward(
         self,
@@ -648,6 +709,7 @@ def scaled_softmax_forward(
         scale: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def scaled_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -655,6 +717,7 @@ def scaled_softmax_backward(
         scale_factor: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def scaled_masked_softmax_forward(
         self,
         input: torch.Tensor,
@@ -662,6 +725,7 @@ def scaled_masked_softmax_forward(
         scale_factor: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def scaled_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -669,12 +733,14 @@ def scaled_masked_softmax_backward(
         scale_factor: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def scaled_upper_triang_masked_softmax_forward(
         self,
         input: torch.Tensor,
         scale_factor: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def scaled_upper_triang_masked_softmax_backward(
         self,
         output_grads_: torch.Tensor,
@@ -682,12 +748,14 @@ def scaled_upper_triang_masked_softmax_backward(
         scale_factor: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def scaled_aligned_causal_masked_softmax_forward(
         self,
         input: torch.Tensor,
         scale_factor: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def scaled_aligned_causal_masked_softmax_backward(
         self,
         output_grad_: torch.Tensor,
@@ -695,6 +763,7 @@ def scaled_aligned_causal_masked_softmax_backward(
         scale_factor: float,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     # Other granular functions
     def layernorm_fwd(
         self,
@@ -709,6 +778,7 @@ def layernorm_fwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         raise NotImplementedError
+
     def layernorm_bwd(
         self,
         dz: torch.Tensor,
@@ -720,6 +790,7 @@ def layernorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         raise NotImplementedError
+
     def rmsnorm_fwd(
         self,
         input: Any,
@@ -732,6 +803,7 @@ def rmsnorm_fwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         raise NotImplementedError
+
     def rmsnorm_bwd(
         self,
         dz: torch.Tensor,
@@ -742,6 +814,7 @@ def rmsnorm_bwd(
         zero_centered_gamma: bool,
     ) -> List[Any]:
         raise NotImplementedError
+
     def rmsnorm_bwd_add(
         self,
         dz: torch.Tensor,
@@ -760,6 +833,7 @@ def multi_tensor_quantize(
         quantizer_list: List[Any],
     ) -> List[Any]:
         raise NotImplementedError
+
     def split_quantize(
         self,
         tensor: torch.Tensor,
@@ -767,6 +841,7 @@ def split_quantize(
         quantizer_list: List[Any],
     ) -> List[Any]:
         raise NotImplementedError
+
     def te_general_grouped_gemm(
         self,
         A: List[Any],
@@ -788,6 +863,7 @@ def te_general_grouped_gemm(
         math_sm_count: int,
     ) -> Optional[List[torch.Tensor]]:
         raise NotImplementedError
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -795,12 +871,14 @@ def fp8_transpose(
         out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def swap_first_dims(
         self,
         tensor: torch.Tensor,
         out: Optional[torch.Tensor],
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -829,6 +907,7 @@ def compute_amax(
         amax: torch.Tensor,
     ) -> None:
         raise NotImplementedError
+
     def fused_amax_and_scale_update_after_reduction(
         self,
         amax_reduction_buffer: torch.Tensor,
@@ -839,6 +918,7 @@ def fused_amax_and_scale_update_after_reduction(
         margin: float,
     ) -> None:
         raise NotImplementedError
+
     def fp8_block_scaling_compute_partial_amax(
         self,
         tensor: torch.Tensor,
@@ -849,6 +929,7 @@ def fp8_block_scaling_compute_partial_amax(
         block_len: int,
     ) -> None:
         raise NotImplementedError
+
     def fp8_block_scaling_partial_cast(
         self,
         inp: torch.Tensor,
@@ -861,6 +942,7 @@ def fp8_block_scaling_partial_cast(
         out_dtype: DType,
     ) -> None:
         raise NotImplementedError
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -869,6 +951,7 @@ def fused_multi_row_padding(
         padded_input_row_list: List[int],
     ) -> None:
         raise NotImplementedError
+
     def fused_multi_row_unpadding(
         self,
         input: torch.Tensor,
@@ -884,6 +967,7 @@ def fa_prepare_fwd(
         qkvi: torch.Tensor,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def fa_prepare_bwd(
         self,
         q: torch.Tensor,
@@ -891,6 +975,7 @@ def fa_prepare_bwd(
         v: torch.Tensor,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def fused_attn_fwd(
         self,
         max_seqlen_q: int,
@@ -923,6 +1008,7 @@ def fused_attn_fwd(
         return_max_logit: bool,
     ) -> List[Any]:
         raise NotImplementedError
+
     def fused_attn_bwd(
         self,
         max_seqlen_q: int,
@@ -953,6 +1039,7 @@ def fused_attn_bwd(
         dqkv_quantizer: Any,
     ) -> List[Any]:
         raise NotImplementedError
+
     def copy_to_kv_cache(
         self,
         new_k: torch.Tensor,
@@ -970,6 +1057,7 @@ def copy_to_kv_cache(
         is_non_paged: bool,
     ) -> None:
         raise NotImplementedError
+
     def convert_thd_to_bshd(
         self,
         tensor: torch.Tensor,
@@ -978,6 +1066,7 @@ def convert_thd_to_bshd(
         max_seq_len: int,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def convert_bshd_to_thd(
         self,
         tensor: torch.Tensor,
@@ -999,6 +1088,7 @@ def fused_rope_forward(
         cp_rank: int,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
@@ -1010,6 +1100,7 @@ def fused_rope_backward(
         cp_rank: int,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def fused_qkv_rope_forward(
         self,
         qkv_input: torch.Tensor,
@@ -1023,6 +1114,7 @@ def fused_qkv_rope_forward(
         cp_rank: int,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     def fused_qkv_rope_backward(
         self,
         q_grad_out: torch.Tensor,
@@ -1051,6 +1143,7 @@ def fused_topk_with_score_function_fwd(
         expert_bias: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     def fused_topk_with_score_function_bwd(
         self,
         num_tokens: int,
@@ -1064,6 +1157,7 @@ def fused_topk_with_score_function_bwd(
         score_function: str,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def fused_score_for_moe_aux_loss_fwd(
         self,
         logits: torch.Tensor,
@@ -1071,6 +1165,7 @@ def fused_score_for_moe_aux_loss_fwd(
         score_function: str,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     def fused_score_for_moe_aux_loss_bwd(
         self,
         num_tokens: int,
@@ -1081,6 +1176,7 @@ def fused_score_for_moe_aux_loss_bwd(
         score_function: str,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def fused_moe_aux_loss_fwd(
         self,
         probs: torch.Tensor,
@@ -1093,6 +1189,7 @@ def fused_moe_aux_loss_fwd(
         coeff: float,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     def fused_moe_aux_loss_bwd(
         self,
         Const_buf: torch.Tensor,
@@ -1111,6 +1208,7 @@ def dropout_fwd(
         out: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     def dropout_bwd(
         self,
         grad_output: torch.Tensor,
@@ -1123,8 +1221,10 @@ def dropout_bwd(
     # Misc
     def get_cublasLt_version(self) -> int:
         raise NotImplementedError
+
     def get_cudnn_version(self) -> int:
         raise NotImplementedError
+
     def get_num_cublas_streams(self) -> int:
         raise NotImplementedError
 
@@ -1136,6 +1236,7 @@ def thd_read_half_tensor(
         half_idx: int,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def thd_second_half_lse_correction(
         self,
         lse: torch.Tensor,
@@ -1144,6 +1245,7 @@ def thd_second_half_lse_correction(
         lse_packed: bool,
     ) -> None:
         raise NotImplementedError
+
     def thd_read_second_half_lse(
         self,
         lse: torch.Tensor,
@@ -1152,6 +1254,7 @@ def thd_read_second_half_lse(
         second_half_lse_seqlen: int,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def thd_out_correction(
         self,
         out: torch.Tensor,
@@ -1163,6 +1266,7 @@ def thd_out_correction(
         lse_packed: bool,
     ) -> None:
         raise NotImplementedError
+
     def thd_grad_correction(
         self,
         grad: torch.Tensor,
@@ -1172,6 +1276,7 @@ def thd_grad_correction(
         second_half: str,
     ) -> None:
         raise NotImplementedError
+
     def thd_get_partitioned_indices(
         self,
         cu_seqlens: torch.Tensor,
@@ -1187,12 +1292,14 @@ def init_nvshmem_backend(
         process_group: Any,
     ) -> None:
         raise NotImplementedError
+
     def create_nvshmem_tensor(
         self,
         shape: List[int],
         dtype: torch.dtype,
     ) -> torch.Tensor:
         raise NotImplementedError
+
     def nvshmem_send_on_current_stream(
         self,
         src: torch.Tensor,
@@ -1201,12 +1308,14 @@ def nvshmem_send_on_current_stream(
         signal: torch.Tensor,
     ) -> None:
         raise NotImplementedError
+
     def nvshmem_wait_on_current_stream(
         self,
         signal: torch.Tensor,
         wait_kind: str,
     ) -> None:
         raise NotImplementedError
+
     def nvshmem_finalize(self) -> None:
         raise NotImplementedError
 
@@ -1219,6 +1328,7 @@ def multi_tensor_scale(
         scale: float,
     ) -> None:
         raise NotImplementedError
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1227,6 +1337,7 @@ def multi_tensor_l2norm(
         per_tensor: Optional[bool] = False,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     def multi_tensor_unscale_l2norm(
         self,
         chunk_size: int,
@@ -1236,6 +1347,7 @@ def multi_tensor_unscale_l2norm(
         per_tensor: Optional[bool] = False,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
+
     def multi_tensor_adam(
         self,
         chunk_size: int,
@@ -1251,6 +1363,7 @@ def multi_tensor_adam(
         weight_decay: float,
     ) -> None:
         raise NotImplementedError
+
     def multi_tensor_adam_param_remainder(
         self,
         chunk_size: int,
@@ -1266,6 +1379,7 @@ def multi_tensor_adam_param_remainder(
         weight_decay: float,
     ) -> None:
         raise NotImplementedError
+
     def multi_tensor_adam_fp8(
         self,
         chunk_size: int,
@@ -1282,6 +1396,7 @@ def multi_tensor_adam_fp8(
         fp8_dtype: DType,
     ) -> None:
         raise NotImplementedError
+
     def multi_tensor_adam_capturable(
         self,
         chunk_size: int,
@@ -1298,6 +1413,7 @@ def multi_tensor_adam_capturable(
         inv_scale: torch.Tensor,
     ) -> None:
         raise NotImplementedError
+
     def multi_tensor_adam_capturable_master(
         self,
         chunk_size: int,
@@ -1314,6 +1430,7 @@ def multi_tensor_adam_capturable_master(
         inv_scale: torch.Tensor,
     ) -> None:
         raise NotImplementedError
+
     def multi_tensor_sgd(
         self,
         chunk_size: int,
@@ -1329,6 +1446,7 @@ def multi_tensor_sgd(
         scale: float,
     ) -> None:
         raise NotImplementedError
+
     def multi_tensor_compute_scale_and_scale_inv(
         self,
         chunk_size: int,
@@ -1349,10 +1467,11 @@ def bulk_overlap_ag_with_external_gemm(
     ) -> Any:
         raise NotImplementedError
 
-############## class func #################################
+    ############## class func #################################
     def create_fp8_tensor_meta(self) -> FP8TensorMeta:
         """Create FP8TensorMeta instance."""
         raise NotImplementedError
+
     def create_comm_overlap_helper(
         self,
         world_group: Optional[Any] = None,
@@ -1363,6 +1482,7 @@ def create_comm_overlap_helper(
         Users should use CommOverlapHelper(...) directly.
         """
         raise NotImplementedError
+
     def create_comm_overlap(
         self,
         buffer_shape: List[int],
@@ -1384,6 +1504,7 @@ def create_comm_overlap(
         Users should use CommOverlap(...) directly.
         """
         raise NotImplementedError
+
     def create_comm_overlap_p2p(
         self,
         buffer_shape: List[int],
@@ -1406,9 +1527,11 @@ def create_comm_overlap_p2p(
         Users should use CommOverlapP2P(...) directly.
         """
         raise NotImplementedError
+
     def get_flash_attention_class(self) -> Type["FlashAttentionBase"]:
         raise NotImplementedError
 
+
 ############ Wapper  #################
 class TEFLModule:
     def __init__(self, manager=None):
@@ -1421,6 +1544,7 @@ def __init__(self, manager=None):
         """
         # Import here to avoid circular dependency
         from .manager import get_default_manager
+
         self._manager = manager if manager is not None else get_default_manager()
         # emum
         self.DType = DType
@@ -1447,7 +1571,7 @@ def __getattr__(self, name: str) -> Any:
         """
         Dynamically resolve operators through OpManager.
         """
-        if name.startswith('_'):
+        if name.startswith("_"):
             raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
 
         # Verify the operator exists before returning the bound call method
@@ -1456,26 +1580,37 @@ def __getattr__(self, name: str) -> Any:
             available_ops = self._manager.registry.list_operators()
             if name not in available_ops:
                 raise AttributeError(
-                    f"Operator '{name}' not found. "
-                    f"Available operators: {available_ops}"
+                    f"Operator '{name}' not found. Available operators: {available_ops}"
                 )
         except RuntimeError as e:
             # Re-raise as AttributeError for better error messages
-            raise AttributeError(
-                f"Error accessing operator '{name}': {e}"
-            ) from e
+            raise AttributeError(f"Error accessing operator '{name}': {e}") from e
 
         # Return a bound call method for this operator
         import functools
+
         return functools.partial(self._manager.call, name)
 
     def __dir__(self):
         module_attrs = [
-            'DType', 'Float8BlockScaleTensorFormat', 'FP8FwdTensors', 'FP8BwdTensors',
-            'FP8TensorMeta', 'NVTE_Activation_Type', 'NVTE_Bias_Type', 'NVTE_Mask_Type',
-            'NVTE_Softmax_Type', 'NVTE_Fused_Attn_Backend', 'NVTE_QKV_Format', 'NVTE_QKV_Layout',
-            'CommOverlapType', 'CommOverlapAlgo', 'CommGemmOverlapRole',
-            'CommOverlapHelper', 'CommOverlap', 'CommOverlapP2P', 
+            "DType",
+            "Float8BlockScaleTensorFormat",
+            "FP8FwdTensors",
+            "FP8BwdTensors",
+            "FP8TensorMeta",
+            "NVTE_Activation_Type",
+            "NVTE_Bias_Type",
+            "NVTE_Mask_Type",
+            "NVTE_Softmax_Type",
+            "NVTE_Fused_Attn_Backend",
+            "NVTE_QKV_Format",
+            "NVTE_QKV_Layout",
+            "CommOverlapType",
+            "CommOverlapAlgo",
+            "CommGemmOverlapRole",
+            "CommOverlapHelper",
+            "CommOverlap",
+            "CommOverlapP2P",
         ]
 
         # Add operator names from OpManager's registry
@@ -1508,12 +1643,12 @@ def flash_attention(
 
         # Prepare initialization parameters
         init_params = {
-            'softmax_scale': softmax_scale,
-            'attention_dropout': attention_dropout,
-            'attention_dropout_ctx': attention_dropout_ctx,
-            'attention_type': attention_type,
-            'layer_number': layer_number,
-            'deterministic': deterministic,
+            "softmax_scale": softmax_scale,
+            "attention_dropout": attention_dropout,
+            "attention_dropout_ctx": attention_dropout_ctx,
+            "attention_type": attention_type,
+            "layer_number": layer_number,
+            "deterministic": deterministic,
         }
 
         # Instantiate the FlashAttention
@@ -1529,10 +1664,12 @@ def __repr__(self) -> str:
         op_count = len(self._manager.registry.list_operators())
         return f"TEFLModule(operators={op_count}, manager={self._manager.__class__.__name__})"
 
+
 # Global singleton instance
 _global_tefl_module: Optional[TEFLModule] = None
 _tefl_module_lock = None
 
+
 def get_tefl_module() -> TEFLModule:
     """
     Get or create the global TEFLModule instance.
@@ -1565,6 +1702,7 @@ def get_tefl_module() -> TEFLModule:
 
     return _global_tefl_module
 
+
 def reset_tefl_module() -> None:
     """
     Reset the global TEFLModule instance.
@@ -1580,11 +1718,13 @@ def reset_tefl_module() -> None:
 
     if _tefl_module_lock is None:
         import threading
+
         _tefl_module_lock = threading.RLock()
 
     with _tefl_module_lock:
         _global_tefl_module = None
 
+
 # Backward compatibility functions
 def get_registry():
     """
@@ -1604,8 +1744,10 @@ def get_registry():
         >>> ops = registry.list_operators()
     """
     from .manager import get_default_manager
+
     return get_default_manager().registry
 
+
 def get_manager():
     """
     Get the global OpManager instance.
@@ -1621,8 +1763,10 @@ def get_manager():
         >>> impl_fn = manager.resolve("rmsnorm_fwd")
     """
     from .manager import get_default_manager
+
     return get_default_manager()
 
+
 def reset_registry() -> None:
     """
     Reset the global OpManager and OpRegistry.
@@ -1632,6 +1776,7 @@ def reset_registry() -> None:
     This function is kept for backward compatibility.
     """
     from .manager import reset_default_manager
+
     reset_default_manager()
     # Also reset the TEFLModule singleton since it depends on OpManager
     reset_tefl_module()
diff --git a/transformer_engine/plugin/core/policy.py b/transformer_engine/plugin/core/policy.py
index 9e4a196c3b..ce1ac9d7e0 100644
--- a/transformer_engine/plugin/core/policy.py
+++ b/transformer_engine/plugin/core/policy.py
@@ -36,6 +36,7 @@ class SelectionPolicy:
         deny_vendors: Set of vendor names to deny
         allow_vendors: Set of vendor names to allow (whitelist)
     """
+
     prefer: str = PREFER_DEFAULT
     strict: bool = False
     per_op_order: Tuple[Tuple[str, Tuple[str, ...]], ...] = field(default_factory=tuple)
@@ -61,9 +62,7 @@ def from_dict(
     ) -> "SelectionPolicy":
         per_op_tuple = tuple()
         if per_op_order:
-            per_op_tuple = tuple(
-                (k, tuple(v)) for k, v in sorted(per_op_order.items())
-            )
+            per_op_tuple = tuple((k, tuple(v)) for k, v in sorted(per_op_order.items()))
 
         return cls(
             prefer=prefer.lower(),
@@ -114,21 +113,21 @@ def fingerprint(self) -> str:
             parts.append(f"deny={','.join(sorted(self.deny_vendors))}")
 
         if self.per_op_order:
-            per_op_str = ";".join(
-                f"{k}={'|'.join(v)}" for k, v in self.per_op_order
-            )
+            per_op_str = ";".join(f"{k}={'|'.join(v)}" for k, v in self.per_op_order)
             parts.append(f"per={per_op_str}")
 
         return ";".join(parts)
 
     def __hash__(self) -> int:
-        return hash((
-            self.prefer,
-            self.strict,
-            self.per_op_order,
-            self.deny_vendors,
-            self.allow_vendors,
-        ))
+        return hash(
+            (
+                self.prefer,
+                self.strict,
+                self.per_op_order,
+                self.deny_vendors,
+                self.allow_vendors,
+            )
+        )
 
 
 class PolicyManager:
@@ -136,7 +135,7 @@ class PolicyManager:
     _lock = threading.Lock()
 
     def __init__(self):
-        if hasattr(self, '_policy_epoch'):
+        if hasattr(self, "_policy_epoch"):
             return
 
         self._policy_epoch = 0
@@ -234,8 +233,10 @@ def _policy_from_env(self) -> SelectionPolicy:
             if te_fl_prefer in VALID_PREFER_VALUES:
                 prefer_str = te_fl_prefer
             else:
-                print(f"[WARNING] Invalid TE_FL_PREFER value: '{te_fl_prefer}'. "
-                      f"Valid values: {', '.join(sorted(VALID_PREFER_VALUES))}")
+                print(
+                    f"[WARNING] Invalid TE_FL_PREFER value: '{te_fl_prefer}'. "
+                    f"Valid values: {', '.join(sorted(VALID_PREFER_VALUES))}"
+                )
 
         # 2. Fall back to TE_FL_PREFER_VENDOR (legacy)
         if prefer_str is None:
diff --git a/transformer_engine/plugin/core/registry.py b/transformer_engine/plugin/core/registry.py
index bd08241b3b..1a4099936d 100644
--- a/transformer_engine/plugin/core/registry.py
+++ b/transformer_engine/plugin/core/registry.py
@@ -14,6 +14,7 @@
 @dataclass
 class OpRegistrySnapshot:
     """Immutable snapshot of operator registry state"""
+
     impls_by_op: Dict[str, List[OpImpl]]
 
 
@@ -67,10 +68,7 @@ def snapshot(self) -> OpRegistrySnapshot:
             OpRegistrySnapshot with all registered implementations
         """
         with self._lock:
-            impls_by_op = {
-                op: list(by_id.values())
-                for op, by_id in self._impls_by_op.items()
-            }
+            impls_by_op = {op: list(by_id.values()) for op, by_id in self._impls_by_op.items()}
         return OpRegistrySnapshot(impls_by_op=impls_by_op)
 
     def get_implementations(self, op_name: str) -> List[OpImpl]:
diff --git a/transformer_engine/plugin/examples/example_intree.py b/transformer_engine/plugin/examples/example_intree.py
index 5c2052bb00..c4badb0ccc 100644
--- a/transformer_engine/plugin/examples/example_intree.py
+++ b/transformer_engine/plugin/examples/example_intree.py
@@ -44,14 +44,16 @@ def my_rmsnorm_fwd(input, weight, eps=1e-5, **kwargs):
 # ============================================================
 registry = OpRegistry()
 
-registry.register_impl(OpImpl(
-    op_name="rmsnorm_fwd",           # Operator name
-    impl_id="vendor.mybackend",      # Implementation ID (unique identifier)
-    kind=BackendImplKind.VENDOR,     # Type: VENDOR / DEFAULT / REFERENCE
-    vendor="mybackend",              # Vendor name
-    fn=my_rmsnorm_fwd,               # Implementation function
-    priority=200,                    # Priority (higher = preferred)
-))
+registry.register_impl(
+    OpImpl(
+        op_name="rmsnorm_fwd",  # Operator name
+        impl_id="vendor.mybackend",  # Implementation ID (unique identifier)
+        kind=BackendImplKind.VENDOR,  # Type: VENDOR / DEFAULT / REFERENCE
+        vendor="mybackend",  # Vendor name
+        fn=my_rmsnorm_fwd,  # Implementation function
+        priority=200,  # Priority (higher = preferred)
+    )
+)
 
 
 # ============================================================
diff --git a/transformer_engine/plugin/examples/example_outtree.py b/transformer_engine/plugin/examples/example_outtree.py
index 92eea892a6..e85339307f 100644
--- a/transformer_engine/plugin/examples/example_outtree.py
+++ b/transformer_engine/plugin/examples/example_outtree.py
@@ -62,14 +62,16 @@ def register(registry):
 
         print("[MyVendorPlugin] Registering operator implementations...")
 
-        registry.register_impl(OpImpl(
-            op_name="rmsnorm_fwd",
-            impl_id="vendor.myvendor",
-            kind=BackendImplKind.VENDOR,
-            vendor="myvendor",
-            fn=my_rmsnorm_fwd,
-            priority=200,
-        ))
+        registry.register_impl(
+            OpImpl(
+                op_name="rmsnorm_fwd",
+                impl_id="vendor.myvendor",
+                kind=BackendImplKind.VENDOR,
+                vendor="myvendor",
+                fn=my_rmsnorm_fwd,
+                priority=200,
+            )
+        )
 
         print("[MyVendorPlugin] Registration complete!")
 
@@ -90,6 +92,7 @@ def register(registry):
 # Step 3: Set environment variables for TE-FL auto-discovery
 # ============================================================
 import os
+
 os.environ["TE_FL_PLUGIN_MODULES"] = "my_vendor_plugin"
 os.environ["TE_FL_PREFER"] = "vendor"  # Prefer vendor backend
 
diff --git a/transformer_engine/plugin/test_utils.py b/transformer_engine/plugin/test_utils.py
index 8ce836e41e..c1462c84d2 100644
--- a/transformer_engine/plugin/test_utils.py
+++ b/transformer_engine/plugin/test_utils.py
@@ -25,7 +25,7 @@ def get_available_backends() -> List[str]:
         impl_ids = set()
         for impl in all_impls:
             # impl_id format: "kind.name" (e.g., "default.flagos", "vendor.cuda")
-            parts = impl.impl_id.split('.', 1)
+            parts = impl.impl_id.split(".", 1)
             if len(parts) == 2:
                 impl_ids.add(parts[1])  # Get the "name" part
             else:
@@ -35,6 +35,7 @@ def get_available_backends() -> List[str]:
     except Exception as e:
         print(f"Warning: Could not load backends: {e}")
         import traceback
+
         traceback.print_exc()
         return []
 
@@ -70,7 +71,10 @@ def _find_impl(self, op_name: str):
             # Try to find implementation matching backend_name
             # Match against impl_id suffix (e.g., "vendor.cuda" matches "cuda")
             for impl in impls:
-                if impl.impl_id.endswith(f".{self.backend_name}") or impl.impl_id == self.backend_name:
+                if (
+                    impl.impl_id.endswith(f".{self.backend_name}")
+                    or impl.impl_id == self.backend_name
+                ):
                     if impl.is_available():
                         return impl
                     else:
@@ -152,7 +156,9 @@ def report(self):
         if self.description:
             print(f"Description: {self.description}")
         print(f"{'='*60}")
-        print(f"Total: {total}, Passed: {self.passed}, Failed: {self.failed}, Skipped: {self.skipped}")
+        print(
+            f"Total: {total}, Passed: {self.passed}, Failed: {self.failed}, Skipped: {self.skipped}"
+        )
         if self.errors:
             print(f"\nErrors:")
             for i, error in enumerate(self.errors, 1):
diff --git a/transformer_engine/plugin/tests/run_all_tests.py b/transformer_engine/plugin/tests/run_all_tests.py
index 07b8f5032e..bfc2dee59d 100644
--- a/transformer_engine/plugin/tests/run_all_tests.py
+++ b/transformer_engine/plugin/tests/run_all_tests.py
@@ -15,9 +15,9 @@
 def main():
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
-    print("\n" + "="*70)
-    print(" "*15 + "TEX Interface Backend Tests")
-    print("="*70)
+    print("\n" + "=" * 70)
+    print(" " * 15 + "TEX Interface Backend Tests")
+    print("=" * 70)
     print(f"Using device: {device}\n")
 
     test_suites = [
@@ -34,9 +34,9 @@ def main():
         success = suite.run_all_tests()
         results.append((suite.name, success))
 
-    print("\n" + "="*70)
-    print(" "*25 + "Test Summary")
-    print("="*70)
+    print("\n" + "=" * 70)
+    print(" " * 25 + "Test Summary")
+    print("=" * 70)
 
     total_passed = sum(1 for _, success in results if success)
     total_tests = len(results)
@@ -45,9 +45,9 @@ def main():
         status = "✓ PASSED" if success else "✗ FAILED"
         print(f"  {name:40s} {status}")
 
-    print("="*70)
+    print("=" * 70)
     print(f"Total: {total_passed}/{total_tests} test suites passed")
-    print("="*70)
+    print("=" * 70)
 
     return 0 if all(success for _, success in results) else 1
 
diff --git a/transformer_engine/plugin/tests/test_activations.py b/transformer_engine/plugin/tests/test_activations.py
index 6bf573b7cc..e73851ac50 100644
--- a/transformer_engine/plugin/tests/test_activations.py
+++ b/transformer_engine/plugin/tests/test_activations.py
@@ -19,8 +19,7 @@
 class ActivationTests(TestCase):
     def __init__(self, device="cpu"):
         super().__init__(
-            "Activation Functions",
-            "Test correctness of all activation functions across backends"
+            "Activation Functions", "Test correctness of all activation functions across backends"
         )
         self.backends = get_available_backends()
         self.reference_backend = "reference"
@@ -28,11 +27,11 @@ def __init__(self, device="cpu"):
 
     # ==================== Reference implementations ====================
     def _get_reference_gelu(self, x):
-        return F.gelu(x, approximate='tanh')
+        return F.gelu(x, approximate="tanh")
 
     def _get_reference_geglu(self, x):
         a, b = x.chunk(2, dim=-1)
-        return F.gelu(a, approximate='tanh') * b
+        return F.gelu(a, approximate="tanh") * b
 
     def _get_reference_qgelu(self, x):
         return x * torch.sigmoid(1.702 * x)
@@ -147,8 +146,11 @@ def test_clamped_swiglu_forward(self, shape=(4, 16)):
             try:
                 output = backend.clamped_swiglu(x, None, 7.0, 1.702)
                 self.assert_close(
-                    output, reference, rtol=1e-4, atol=1e-6,
-                    msg=f"clamped_swiglu forward mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"clamped_swiglu forward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -165,8 +167,11 @@ def _test_activation_forward(self, op_name, x, reference, rtol=1e-4, atol=1e-6):
                 op_fn = getattr(backend, op_name)
                 output = op_fn(x, None)
                 self.assert_close(
-                    output, reference, rtol=rtol, atol=atol,
-                    msg=f"{op_name} forward mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=rtol,
+                    atol=atol,
+                    msg=f"{op_name} forward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -179,7 +184,9 @@ def _test_activation_forward(self, op_name, x, reference, rtol=1e-4, atol=1e-6):
     # ==================== Backward tests ====================
     def test_gelu_backward(self, shape=(4, 8)):
         print(f"\n  Testing GELU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
         y = self._get_reference_gelu(x)
         y.backward(grad_output)
@@ -189,9 +196,14 @@ def test_gelu_backward(self, shape=(4, 8)):
 
     def test_geglu_backward(self, shape=(4, 16)):
         print(f"\n  Testing GEGLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
-        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
-                                              dtype=torch.float32, device=self.device)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
+        grad_output = generate_random_tensor(
+            (shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+            dtype=torch.float32,
+            device=self.device,
+        )
         y = self._get_reference_geglu(x)
         y.backward(grad_output)
         reference_grad = x.grad.clone()
@@ -200,7 +212,9 @@ def test_geglu_backward(self, shape=(4, 16)):
 
     def test_qgelu_backward(self, shape=(4, 8)):
         print(f"\n  Testing QGELU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
         y = self._get_reference_qgelu(x)
         y.backward(grad_output)
@@ -210,9 +224,14 @@ def test_qgelu_backward(self, shape=(4, 8)):
 
     def test_qgeglu_backward(self, shape=(4, 16)):
         print(f"\n  Testing QGEGLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
-        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
-                                              dtype=torch.float32, device=self.device)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
+        grad_output = generate_random_tensor(
+            (shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+            dtype=torch.float32,
+            device=self.device,
+        )
         y = self._get_reference_qgeglu(x)
         y.backward(grad_output)
         reference_grad = x.grad.clone()
@@ -221,7 +240,9 @@ def test_qgeglu_backward(self, shape=(4, 16)):
 
     def test_relu_backward(self, shape=(4, 8)):
         print(f"\n  Testing ReLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
         y = self._get_reference_relu(x)
         y.backward(grad_output)
@@ -231,9 +252,14 @@ def test_relu_backward(self, shape=(4, 8)):
 
     def test_reglu_backward(self, shape=(4, 16)):
         print(f"\n  Testing ReGLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
-        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
-                                              dtype=torch.float32, device=self.device)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
+        grad_output = generate_random_tensor(
+            (shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+            dtype=torch.float32,
+            device=self.device,
+        )
         y = self._get_reference_reglu(x)
         y.backward(grad_output)
         reference_grad = x.grad.clone()
@@ -242,7 +268,9 @@ def test_reglu_backward(self, shape=(4, 16)):
 
     def test_srelu_backward(self, shape=(4, 8)):
         print(f"\n  Testing SReLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
         y = self._get_reference_srelu(x)
         y.backward(grad_output)
@@ -252,9 +280,14 @@ def test_srelu_backward(self, shape=(4, 8)):
 
     def test_sreglu_backward(self, shape=(4, 16)):
         print(f"\n  Testing SReGLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
-        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
-                                              dtype=torch.float32, device=self.device)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
+        grad_output = generate_random_tensor(
+            (shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+            dtype=torch.float32,
+            device=self.device,
+        )
         y = self._get_reference_sreglu(x)
         y.backward(grad_output)
         reference_grad = x.grad.clone()
@@ -263,7 +296,9 @@ def test_sreglu_backward(self, shape=(4, 16)):
 
     def test_silu_backward(self, shape=(4, 8)):
         print(f"\n  Testing SiLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
         y = self._get_reference_silu(x)
         y.backward(grad_output)
@@ -273,24 +308,34 @@ def test_silu_backward(self, shape=(4, 8)):
 
     def test_swiglu_backward(self, shape=(4, 16)):
         print(f"\n  Testing SwiGLU backward with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
-        grad_output = generate_random_tensor((shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
-                                              dtype=torch.float32, device=self.device)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
+        grad_output = generate_random_tensor(
+            (shape[0], shape[1] // 2) if len(shape) == 2 else (*shape[:-1], shape[-1] // 2),
+            dtype=torch.float32,
+            device=self.device,
+        )
         y = self._get_reference_swiglu(x)
         y.backward(grad_output)
         reference_grad = x.grad.clone()
         x.grad = None
         self._test_activation_backward("dswiglu", x, grad_output, reference_grad)
 
-    def _test_activation_backward(self, op_name, x, grad_output, reference_grad, rtol=1e-4, atol=1e-6):
+    def _test_activation_backward(
+        self, op_name, x, grad_output, reference_grad, rtol=1e-4, atol=1e-6
+    ):
         for backend_name in self.backends:
             backend = get_backend(backend_name)
             try:
                 op_fn = getattr(backend, op_name)
                 grad_input = op_fn(grad_output, x.detach(), None)
                 self.assert_close(
-                    grad_input, reference_grad, rtol=rtol, atol=atol,
-                    msg=f"{op_name} backward mismatch for {backend_name}"
+                    grad_input,
+                    reference_grad,
+                    rtol=rtol,
+                    atol=atol,
+                    msg=f"{op_name} backward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -303,7 +348,9 @@ def _test_activation_backward(self, op_name, x, grad_output, reference_grad, rto
     # ==================== Bias + backward tests ====================
     def test_dbias_dgelu(self, shape=(4, 8)):
         print(f"\n  Testing dbias_dgelu with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
 
         # Reference: compute dgelu and sum for bias grad
@@ -318,12 +365,18 @@ def test_dbias_dgelu(self, shape=(4, 8)):
             try:
                 grad_input, grad_bias = backend.dbias_dgelu(grad_output, x.detach(), None)
                 self.assert_close(
-                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dgelu grad_input mismatch for {backend_name}"
+                    grad_input,
+                    ref_grad_input,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dgelu grad_input mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dgelu grad_bias mismatch for {backend_name}"
+                    grad_bias,
+                    ref_grad_bias,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dgelu grad_bias mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -343,7 +396,9 @@ def test_dbias_dgelu(self, shape=(4, 8)):
 
     def test_dbias_dsilu(self, shape=(4, 8)):
         print(f"\n  Testing dbias_dsilu with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
 
         y = self._get_reference_silu(x)
@@ -357,12 +412,18 @@ def test_dbias_dsilu(self, shape=(4, 8)):
             try:
                 grad_input, grad_bias = backend.dbias_dsilu(grad_output, x.detach(), None)
                 self.assert_close(
-                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dsilu grad_input mismatch for {backend_name}"
+                    grad_input,
+                    ref_grad_input,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dsilu grad_input mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dsilu grad_bias mismatch for {backend_name}"
+                    grad_bias,
+                    ref_grad_bias,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dsilu grad_bias mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -382,7 +443,9 @@ def test_dbias_dsilu(self, shape=(4, 8)):
 
     def test_dbias_drelu(self, shape=(4, 8)):
         print(f"\n  Testing dbias_drelu with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
 
         y = self._get_reference_relu(x)
@@ -396,12 +459,18 @@ def test_dbias_drelu(self, shape=(4, 8)):
             try:
                 grad_input, grad_bias = backend.dbias_drelu(grad_output, x.detach(), None)
                 self.assert_close(
-                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_drelu grad_input mismatch for {backend_name}"
+                    grad_input,
+                    ref_grad_input,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_drelu grad_input mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_drelu grad_bias mismatch for {backend_name}"
+                    grad_bias,
+                    ref_grad_bias,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_drelu grad_bias mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -421,7 +490,9 @@ def test_dbias_drelu(self, shape=(4, 8)):
 
     def test_dbias_dqgelu(self, shape=(4, 8)):
         print(f"\n  Testing dbias_dqgelu with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
 
         y = self._get_reference_qgelu(x)
@@ -435,12 +506,18 @@ def test_dbias_dqgelu(self, shape=(4, 8)):
             try:
                 grad_input, grad_bias = backend.dbias_dqgelu(grad_output, x.detach(), None)
                 self.assert_close(
-                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dqgelu grad_input mismatch for {backend_name}"
+                    grad_input,
+                    ref_grad_input,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dqgelu grad_input mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dqgelu grad_bias mismatch for {backend_name}"
+                    grad_bias,
+                    ref_grad_bias,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dqgelu grad_bias mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -460,7 +537,9 @@ def test_dbias_dqgelu(self, shape=(4, 8)):
 
     def test_dbias_dsrelu(self, shape=(4, 8)):
         print(f"\n  Testing dbias_dsrelu with shape {shape}")
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
 
         y = self._get_reference_srelu(x)
@@ -474,12 +553,18 @@ def test_dbias_dsrelu(self, shape=(4, 8)):
             try:
                 grad_input, grad_bias = backend.dbias_dsrelu(grad_output, x.detach(), None)
                 self.assert_close(
-                    grad_input, ref_grad_input, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dsrelu grad_input mismatch for {backend_name}"
+                    grad_input,
+                    ref_grad_input,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dsrelu grad_input mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-6,
-                    msg=f"dbias_dsrelu grad_bias mismatch for {backend_name}"
+                    grad_bias,
+                    ref_grad_bias,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"dbias_dsrelu grad_bias mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -498,9 +583,9 @@ def test_dbias_dsrelu(self, shape=(4, 8)):
                 print(f"    ✗ {backend_name}: {e}")
 
     def run_all_tests(self):
-        print("\n" + "="*60)
+        print("\n" + "=" * 60)
         print("Testing Activation Functions")
-        print("="*60)
+        print("=" * 60)
         print(f"Available backends: {', '.join(self.backends)}")
 
         shapes = [(4, 8), (8, 16), (2, 4, 8)]
diff --git a/transformer_engine/plugin/tests/test_flash_attention.py b/transformer_engine/plugin/tests/test_flash_attention.py
index 4dcb83d36b..3a3f3be24f 100644
--- a/transformer_engine/plugin/tests/test_flash_attention.py
+++ b/transformer_engine/plugin/tests/test_flash_attention.py
@@ -17,8 +17,7 @@
 class FlashAttentionTests(TestCase):
     def __init__(self, device="cpu"):
         super().__init__(
-            "Flash Attention",
-            "Test correctness of Flash Attention implementation across backends"
+            "Flash Attention", "Test correctness of Flash Attention implementation across backends"
         )
         self.backends = get_available_backends()
         self.device = device
@@ -51,8 +50,7 @@ def _reference_attention(
 
         if is_causal:
             causal_mask = torch.triu(
-                torch.full((L, S), float('-inf'), dtype=q.dtype, device=q.device),
-                diagonal=1
+                torch.full((L, S), float("-inf"), dtype=q.dtype, device=q.device), diagonal=1
             )
             attn_weight = attn_weight + causal_mask
 
@@ -68,30 +66,31 @@ def _reference_attention(
         # Convert bhsd back to sbhd
         return out.permute(2, 0, 1, 3)  # [seq, batch, heads, dim]
 
-    def test_flash_attention_forward_basic(self, seq_len=16, batch_size=2, num_heads=4, head_dim=32):
+    def test_flash_attention_forward_basic(
+        self, seq_len=16, batch_size=2, num_heads=4, head_dim=32
+    ):
         """Test basic flash attention forward pass with sbhd layout and bf16"""
-        print(f"\n  Testing Flash Attention forward sbhd bf16 (seq={seq_len}, batch={batch_size}, heads={num_heads}, dim={head_dim})")
+        print(
+            f"\n  Testing Flash Attention forward sbhd bf16 (seq={seq_len}, batch={batch_size},"
+            f" heads={num_heads}, dim={head_dim})"
+        )
 
         # Shape: (seq_len, batch, num_heads, head_dim) - sbhd layout
         query = generate_random_tensor(
-            (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device
+            (seq_len, batch_size, num_heads, head_dim), dtype=torch.bfloat16, device=self.device
         )
         key = generate_random_tensor(
-            (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device
+            (seq_len, batch_size, num_heads, head_dim), dtype=torch.bfloat16, device=self.device
         )
         value = generate_random_tensor(
-            (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device
+            (seq_len, batch_size, num_heads, head_dim), dtype=torch.bfloat16, device=self.device
         )
 
         scale = 1.0 / math.sqrt(head_dim)
 
         # Reference attention (compute in float32 for accuracy)
         reference = self._reference_attention(
-            query.float(), key.float(), value.float(),
-            scale=scale, is_causal=False
+            query.float(), key.float(), value.float(), scale=scale, is_causal=False
         ).to(torch.bfloat16)
 
         for backend_name in self.backends:
@@ -122,14 +121,20 @@ def test_flash_attention_forward_basic(self, seq_len=16, batch_size=2, num_heads
                     # Try to reshape reference for comparison
                     reference_flat = reference.contiguous().reshape(seq_len, batch_size, -1)
                     self.assert_close(
-                        output.float(), reference_flat.float(), rtol=1e-2, atol=1e-2,
-                        msg=f"Flash Attention forward mismatch for {backend_name}"
+                        output.float(),
+                        reference_flat.float(),
+                        rtol=1e-2,
+                        atol=1e-2,
+                        msg=f"Flash Attention forward mismatch for {backend_name}",
                     )
                 else:
                     reference_flat = reference.contiguous().reshape(seq_len, batch_size, -1)
                     self.assert_close(
-                        output.float(), reference_flat.float(), rtol=1e-2, atol=1e-2,
-                        msg=f"Flash Attention forward mismatch for {backend_name}"
+                        output.float(),
+                        reference_flat.float(),
+                        rtol=1e-2,
+                        atol=1e-2,
+                        msg=f"Flash Attention forward mismatch for {backend_name}",
                     )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -139,31 +144,33 @@ def test_flash_attention_forward_basic(self, seq_len=16, batch_size=2, num_heads
                 self.failed += 1
                 print(f"    ✗ {backend_name}: {e}")
                 import traceback
+
                 traceback.print_exc()
 
-    def test_flash_attention_forward_causal(self, seq_len=16, batch_size=2, num_heads=4, head_dim=32):
+    def test_flash_attention_forward_causal(
+        self, seq_len=16, batch_size=2, num_heads=4, head_dim=32
+    ):
         """Test flash attention forward pass with causal mask"""
-        print(f"\n  Testing Flash Attention forward causal sbhd bf16 (seq={seq_len}, batch={batch_size}, heads={num_heads}, dim={head_dim})")
+        print(
+            f"\n  Testing Flash Attention forward causal sbhd bf16 (seq={seq_len},"
+            f" batch={batch_size}, heads={num_heads}, dim={head_dim})"
+        )
 
         query = generate_random_tensor(
-            (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device
+            (seq_len, batch_size, num_heads, head_dim), dtype=torch.bfloat16, device=self.device
         )
         key = generate_random_tensor(
-            (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device
+            (seq_len, batch_size, num_heads, head_dim), dtype=torch.bfloat16, device=self.device
         )
         value = generate_random_tensor(
-            (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device
+            (seq_len, batch_size, num_heads, head_dim), dtype=torch.bfloat16, device=self.device
         )
 
         scale = 1.0 / math.sqrt(head_dim)
 
         # Reference attention with causal mask
         reference = self._reference_attention(
-            query.float(), key.float(), value.float(),
-            scale=scale, is_causal=True
+            query.float(), key.float(), value.float(), scale=scale, is_causal=True
         ).to(torch.bfloat16)
 
         for backend_name in self.backends:
@@ -189,8 +196,11 @@ def test_flash_attention_forward_causal(self, seq_len=16, batch_size=2, num_head
 
                 reference_flat = reference.contiguous().reshape(seq_len, batch_size, -1)
                 self.assert_close(
-                    output.float(), reference_flat.float(), rtol=1e-2, atol=1e-2,
-                    msg=f"Flash Attention forward causal mismatch for {backend_name}"
+                    output.float(),
+                    reference_flat.float(),
+                    rtol=1e-2,
+                    atol=1e-2,
+                    msg=f"Flash Attention forward causal mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -200,6 +210,7 @@ def test_flash_attention_forward_causal(self, seq_len=16, batch_size=2, num_head
                 self.failed += 1
                 print(f"    ✗ {backend_name}: {e}")
                 import traceback
+
                 traceback.print_exc()
 
     def test_flash_attention_backward(self, seq_len=16, batch_size=2, num_heads=4, head_dim=32):
@@ -207,24 +218,32 @@ def test_flash_attention_backward(self, seq_len=16, batch_size=2, num_heads=4, h
 
         Note: FlagGems backward currently only supports causal attention.
         """
-        print(f"\n  Testing Flash Attention backward causal sbhd bf16 (seq={seq_len}, batch={batch_size}, heads={num_heads}, dim={head_dim})")
+        print(
+            f"\n  Testing Flash Attention backward causal sbhd bf16 (seq={seq_len},"
+            f" batch={batch_size}, heads={num_heads}, dim={head_dim})"
+        )
 
         query = generate_random_tensor(
             (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device, requires_grad=True
+            dtype=torch.bfloat16,
+            device=self.device,
+            requires_grad=True,
         )
         key = generate_random_tensor(
             (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device, requires_grad=True
+            dtype=torch.bfloat16,
+            device=self.device,
+            requires_grad=True,
         )
         value = generate_random_tensor(
             (seq_len, batch_size, num_heads, head_dim),
-            dtype=torch.bfloat16, device=self.device, requires_grad=True
+            dtype=torch.bfloat16,
+            device=self.device,
+            requires_grad=True,
         )
         # grad_output shape matches output: sb(h*d)
         grad_output = generate_random_tensor(
-            (seq_len, batch_size, num_heads * head_dim),
-            dtype=torch.bfloat16, device=self.device
+            (seq_len, batch_size, num_heads * head_dim), dtype=torch.bfloat16, device=self.device
         )
 
         scale = 1.0 / math.sqrt(head_dim)
@@ -235,7 +254,9 @@ def test_flash_attention_backward(self, seq_len=16, batch_size=2, num_heads=4, h
         key_f32 = key.float().detach().requires_grad_(True)
         value_f32 = value.float().detach().requires_grad_(True)
 
-        ref_output = self._reference_attention(query_f32, key_f32, value_f32, scale=scale, is_causal=True)
+        ref_output = self._reference_attention(
+            query_f32, key_f32, value_f32, scale=scale, is_causal=True
+        )
         ref_output_flat = ref_output.contiguous().reshape(seq_len, batch_size, -1)
         ref_output_flat.backward(grad_output.float())
         ref_grad_q = query_f32.grad.clone().to(torch.bfloat16)
@@ -273,16 +294,25 @@ def test_flash_attention_backward(self, seq_len=16, batch_size=2, num_heads=4, h
 
                 # bf16 backward has higher numerical error due to accumulated precision loss
                 self.assert_close(
-                    q_copy.grad.float(), ref_grad_q.float(), rtol=2e-2, atol=2e-2,
-                    msg=f"Flash Attention backward grad_q mismatch for {backend_name}"
+                    q_copy.grad.float(),
+                    ref_grad_q.float(),
+                    rtol=2e-2,
+                    atol=2e-2,
+                    msg=f"Flash Attention backward grad_q mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    k_copy.grad.float(), ref_grad_k.float(), rtol=2e-2, atol=2e-2,
-                    msg=f"Flash Attention backward grad_k mismatch for {backend_name}"
+                    k_copy.grad.float(),
+                    ref_grad_k.float(),
+                    rtol=2e-2,
+                    atol=2e-2,
+                    msg=f"Flash Attention backward grad_k mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    v_copy.grad.float(), ref_grad_v.float(), rtol=2e-2, atol=2e-2,
-                    msg=f"Flash Attention backward grad_v mismatch for {backend_name}"
+                    v_copy.grad.float(),
+                    ref_grad_v.float(),
+                    rtol=2e-2,
+                    atol=2e-2,
+                    msg=f"Flash Attention backward grad_v mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -292,12 +322,13 @@ def test_flash_attention_backward(self, seq_len=16, batch_size=2, num_heads=4, h
                 self.failed += 1
                 print(f"    ✗ {backend_name}: {e}")
                 import traceback
+
                 traceback.print_exc()
 
     def run_all_tests(self):
-        print("\n" + "="*60)
+        print("\n" + "=" * 60)
         print("Testing Flash Attention")
-        print("="*60)
+        print("=" * 60)
         print(f"Available backends: {', '.join(self.backends)}")
 
         # Basic forward tests with sbhd layout and bf16
diff --git a/transformer_engine/plugin/tests/test_normalization.py b/transformer_engine/plugin/tests/test_normalization.py
index 1083c8b02c..eb2dea35cc 100644
--- a/transformer_engine/plugin/tests/test_normalization.py
+++ b/transformer_engine/plugin/tests/test_normalization.py
@@ -19,8 +19,7 @@
 class NormalizationTests(TestCase):
     def __init__(self, device="cpu"):
         super().__init__(
-            "Normalization Functions",
-            "Test correctness of LayerNorm and RMSNorm across backends"
+            "Normalization Functions", "Test correctness of LayerNorm and RMSNorm across backends"
         )
         self.backends = get_available_backends()
         self.eps = 1e-5
@@ -35,7 +34,7 @@ def _reference_layernorm_forward(self, x, weight, bias, eps):
         return output, mean.squeeze(-1), rsigma.squeeze(-1)
 
     def _reference_rmsnorm_forward(self, x, weight, eps):
-        var = (x ** 2).mean(dim=-1, keepdim=True)
+        var = (x**2).mean(dim=-1, keepdim=True)
         rsigma = torch.rsqrt(var + eps)
         normalized = x * rsigma
         output = normalized * weight
@@ -57,20 +56,28 @@ def test_layernorm_forward(self, shape=(2, 4, 8)):
             backend = get_backend(backend_name)
             try:
                 output, mean, rsigma = backend.layernorm_fwd(
-                    x, weight, bias, self.eps,
-                    None, None, DType.kFloat32, 0, False
+                    x, weight, bias, self.eps, None, None, DType.kFloat32, 0, False
                 )
                 self.assert_close(
-                    output, ref_output, rtol=1e-5, atol=1e-7,
-                    msg=f"LayerNorm forward output mismatch for {backend_name}"
+                    output,
+                    ref_output,
+                    rtol=1e-5,
+                    atol=1e-7,
+                    msg=f"LayerNorm forward output mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    mean, ref_mean, rtol=1e-5, atol=1e-7,
-                    msg=f"LayerNorm forward mean mismatch for {backend_name}"
+                    mean,
+                    ref_mean,
+                    rtol=1e-5,
+                    atol=1e-7,
+                    msg=f"LayerNorm forward mean mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    rsigma, ref_rsigma, rtol=1e-4, atol=1e-6,
-                    msg=f"LayerNorm forward rsigma mismatch for {backend_name}"
+                    rsigma,
+                    ref_rsigma,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"LayerNorm forward rsigma mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -84,8 +91,12 @@ def test_layernorm_backward(self, shape=(2, 4, 8)):
         print(f"\n  Testing LayerNorm backward with shape {shape}")
 
         hidden_size = shape[-1]
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
-        weight = torch.ones(hidden_size, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
+        weight = torch.ones(
+            hidden_size, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         bias = torch.zeros(hidden_size, dtype=torch.float32, device=self.device, requires_grad=True)
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
 
@@ -106,21 +117,29 @@ def test_layernorm_backward(self, shape=(2, 4, 8)):
                 weight_copy = weight.detach()
 
                 grad_x, grad_weight, grad_bias = backend.layernorm_bwd(
-                    grad_output, x_copy, mean.detach(), rsigma.detach(),
-                    weight_copy, 0, False
+                    grad_output, x_copy, mean.detach(), rsigma.detach(), weight_copy, 0, False
                 )
 
                 self.assert_close(
-                    grad_x, ref_grad_x, rtol=1e-4, atol=1e-6,
-                    msg=f"LayerNorm backward grad_x mismatch for {backend_name}"
+                    grad_x,
+                    ref_grad_x,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"LayerNorm backward grad_x mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_weight, ref_grad_weight, rtol=1e-4, atol=1e-6,
-                    msg=f"LayerNorm backward grad_weight mismatch for {backend_name}"
+                    grad_weight,
+                    ref_grad_weight,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"LayerNorm backward grad_weight mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_bias, ref_grad_bias, rtol=1e-4, atol=1e-5,
-                    msg=f"LayerNorm backward grad_bias mismatch for {backend_name}"
+                    grad_bias,
+                    ref_grad_bias,
+                    rtol=1e-4,
+                    atol=1e-5,
+                    msg=f"LayerNorm backward grad_bias mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -143,16 +162,21 @@ def test_rmsnorm_forward(self, shape=(2, 4, 8)):
             backend = get_backend(backend_name)
             try:
                 output, _, rsigma = backend.rmsnorm_fwd(
-                    x, weight, self.eps,
-                    None, None, DType.kFloat32, 0, False
+                    x, weight, self.eps, None, None, DType.kFloat32, 0, False
                 )
                 self.assert_close(
-                    output, ref_output, rtol=1e-5, atol=1e-7,
-                    msg=f"RMSNorm forward output mismatch for {backend_name}"
+                    output,
+                    ref_output,
+                    rtol=1e-5,
+                    atol=1e-7,
+                    msg=f"RMSNorm forward output mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    rsigma, ref_rsigma, rtol=1e-4, atol=1e-6,
-                    msg=f"RMSNorm forward rsigma mismatch for {backend_name}"
+                    rsigma,
+                    ref_rsigma,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"RMSNorm forward rsigma mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -166,8 +190,12 @@ def test_rmsnorm_backward(self, shape=(2, 4, 8)):
         print(f"\n  Testing RMSNorm backward with shape {shape}")
 
         hidden_size = shape[-1]
-        x = generate_random_tensor(shape, dtype=torch.float32, device=self.device, requires_grad=True)
-        weight = torch.ones(hidden_size, dtype=torch.float32, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.float32, device=self.device, requires_grad=True
+        )
+        weight = torch.ones(
+            hidden_size, dtype=torch.float32, device=self.device, requires_grad=True
+        )
         grad_output = generate_random_tensor(shape, dtype=torch.float32, device=self.device)
 
         output, _, rsigma = self._reference_rmsnorm_forward(x, weight, self.eps)
@@ -185,17 +213,22 @@ def test_rmsnorm_backward(self, shape=(2, 4, 8)):
                 weight_copy = weight.detach()
 
                 grad_x, grad_weight = backend.rmsnorm_bwd(
-                    grad_output, x_copy, rsigma.detach(),
-                    weight_copy, 0, False
+                    grad_output, x_copy, rsigma.detach(), weight_copy, 0, False
                 )
 
                 self.assert_close(
-                    grad_x, ref_grad_x, rtol=1e-4, atol=1e-6,
-                    msg=f"RMSNorm backward grad_x mismatch for {backend_name}"
+                    grad_x,
+                    ref_grad_x,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"RMSNorm backward grad_x mismatch for {backend_name}",
                 )
                 self.assert_close(
-                    grad_weight, ref_grad_weight, rtol=1e-4, atol=1e-6,
-                    msg=f"RMSNorm backward grad_weight mismatch for {backend_name}"
+                    grad_weight,
+                    ref_grad_weight,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"RMSNorm backward grad_weight mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -206,9 +239,9 @@ def test_rmsnorm_backward(self, shape=(2, 4, 8)):
                 print(f"    ✗ {backend_name}: {e}")
 
     def run_all_tests(self):
-        print("\n" + "="*60)
+        print("\n" + "=" * 60)
         print("Testing Normalization Functions")
-        print("="*60)
+        print("=" * 60)
         print(f"Available backends: {', '.join(self.backends)}")
 
         shapes = [
diff --git a/transformer_engine/plugin/tests/test_operations.py b/transformer_engine/plugin/tests/test_operations.py
index 0ebe470e91..1e03dc4692 100644
--- a/transformer_engine/plugin/tests/test_operations.py
+++ b/transformer_engine/plugin/tests/test_operations.py
@@ -20,7 +20,7 @@ class OperationsTests(TestCase):
     def __init__(self, device="cpu"):
         super().__init__(
             "Operations (GEMM, Softmax, Dropout)",
-            "Test correctness of GEMM, Softmax, and Dropout operations"
+            "Test correctness of GEMM, Softmax, and Dropout operations",
         )
         self.backends = get_available_backends()
         self.device = device
@@ -39,15 +39,30 @@ def test_gemm_basic(self, M=32, N=64, K=48):
                 workspace = torch.empty(1024, dtype=torch.uint8, device=self.device)
 
                 output, _, _, _ = backend.generic_gemm(
-                    A, False, B, False, D,
-                    None, DType.kFloat32, None, DType.kFloat32,
-                    False, None, False,
-                    workspace, 1024, False, False
+                    A,
+                    False,
+                    B,
+                    False,
+                    D,
+                    None,
+                    DType.kFloat32,
+                    None,
+                    DType.kFloat32,
+                    False,
+                    None,
+                    False,
+                    workspace,
+                    1024,
+                    False,
+                    False,
                 )
 
                 self.assert_close(
-                    output, reference, rtol=5e-2, atol=1e-2,
-                    msg=f"GEMM output mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=5e-2,
+                    atol=1e-2,
+                    msg=f"GEMM output mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -71,15 +86,30 @@ def test_gemm_transpose_a(self, M=32, N=64, K=48):
                 workspace = torch.empty(1024, dtype=torch.uint8, device=self.device)
 
                 output, _, _, _ = backend.generic_gemm(
-                    A, True, B, False, D,
-                    None, DType.kFloat32, None, DType.kFloat32,
-                    False, None, False,
-                    workspace, 1024, False, False
+                    A,
+                    True,
+                    B,
+                    False,
+                    D,
+                    None,
+                    DType.kFloat32,
+                    None,
+                    DType.kFloat32,
+                    False,
+                    None,
+                    False,
+                    workspace,
+                    1024,
+                    False,
+                    False,
                 )
 
                 self.assert_close(
-                    output, reference, rtol=5e-2, atol=1e-2,
-                    msg=f"GEMM transpose A mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=5e-2,
+                    atol=1e-2,
+                    msg=f"GEMM transpose A mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -103,15 +133,30 @@ def test_gemm_3d(self, B=2, M=16, N=32, K=24):
                 workspace = torch.empty(1024, dtype=torch.uint8, device=self.device)
 
                 output, _, _, _ = backend.generic_gemm(
-                    B_mat, False, A, False, D,
-                    None, DType.kFloat32, None, DType.kFloat32,
-                    False, None, False,
-                    workspace, 1024, False, False
+                    B_mat,
+                    False,
+                    A,
+                    False,
+                    D,
+                    None,
+                    DType.kFloat32,
+                    None,
+                    DType.kFloat32,
+                    False,
+                    None,
+                    False,
+                    workspace,
+                    1024,
+                    False,
+                    False,
                 )
 
                 self.assert_close(
-                    output, reference, rtol=5e-2, atol=1e-2,
-                    msg=f"3D GEMM mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=5e-2,
+                    atol=1e-2,
+                    msg=f"3D GEMM mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -133,8 +178,11 @@ def test_scaled_softmax(self, shape=(2, 4, 8, 16)):
             try:
                 output = backend.scaled_softmax_forward(x, scale)
                 self.assert_close(
-                    output, reference, rtol=1e-2, atol=1e-3,
-                    msg=f"Scaled softmax mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=1e-2,
+                    atol=1e-3,
+                    msg=f"Scaled softmax mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -152,8 +200,8 @@ def test_causal_masked_softmax(self, shape=(8, 16, 16)):
         seq_len = shape[-1]
 
         causal_mask = torch.triu(
-            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=self.device),
-            diagonal=1
+            torch.full((seq_len, seq_len), float("-inf"), dtype=x.dtype, device=self.device),
+            diagonal=1,
         )
         reference = F.softmax(x.float() * scale + causal_mask.float(), dim=-1).to(x.dtype)
 
@@ -162,8 +210,11 @@ def test_causal_masked_softmax(self, shape=(8, 16, 16)):
             try:
                 output = backend.scaled_upper_triang_masked_softmax_forward(x, scale)
                 self.assert_close(
-                    output, reference, rtol=1e-2, atol=1e-3,
-                    msg=f"Causal masked softmax mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=1e-2,
+                    atol=1e-3,
+                    msg=f"Causal masked softmax mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -189,11 +240,14 @@ def test_dropout(self, shape=(4, 8, 16)):
                 nonzero_ratio = num_nonzero / total_elements
                 expected_ratio = 1.0 - dropout_prob
 
-                assert abs(nonzero_ratio - expected_ratio) < 0.2, \
-                    f"Dropout ratio mismatch for {backend_name}: {nonzero_ratio:.3f} vs {expected_ratio:.3f}"
+                assert abs(nonzero_ratio - expected_ratio) < 0.2, (
+                    f"Dropout ratio mismatch for {backend_name}: {nonzero_ratio:.3f} vs"
+                    f" {expected_ratio:.3f}"
+                )
 
-                assert torch.all(output[output == 0] == 0), \
-                    f"Dropped elements should be zero for {backend_name}"
+                assert torch.all(
+                    output[output == 0] == 0
+                ), f"Dropped elements should be zero for {backend_name}"
 
                 expected_scale = 1.0 / (1.0 - dropout_prob)
                 non_zero_output = output[output != 0]
@@ -201,18 +255,23 @@ def test_dropout(self, shape=(4, 8, 16)):
 
                 if len(non_zero_output) > 0:
                     self.assert_close(
-                        non_zero_output, non_zero_input * expected_scale,
-                        rtol=1e-2, atol=1e-3,
-                        msg=f"Dropout scaling mismatch for {backend_name}"
+                        non_zero_output,
+                        non_zero_input * expected_scale,
+                        rtol=1e-2,
+                        atol=1e-3,
+                        msg=f"Dropout scaling mismatch for {backend_name}",
                     )
 
-                grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+                grad_output = generate_random_tensor(
+                    shape, dtype=torch.bfloat16, device=self.device
+                )
                 grad_input = backend.dropout_bwd(grad_output, mask, dropout_prob, None)
 
-                grad_nonzero_mask = (grad_input != 0)
-                output_nonzero_mask = (output != 0)
-                assert torch.all(grad_nonzero_mask == output_nonzero_mask), \
-                    f"Dropout backward sparsity mismatch for {backend_name}"
+                grad_nonzero_mask = grad_input != 0
+                output_nonzero_mask = output != 0
+                assert torch.all(
+                    grad_nonzero_mask == output_nonzero_mask
+                ), f"Dropout backward sparsity mismatch for {backend_name}"
 
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -223,9 +282,9 @@ def test_dropout(self, shape=(4, 8, 16)):
                 print(f"    ✗ {backend_name}: {e}")
 
     def run_all_tests(self):
-        print("\n" + "="*60)
+        print("\n" + "=" * 60)
         print("Testing Operations (GEMM, Softmax, Dropout)")
-        print("="*60)
+        print("=" * 60)
         print(f"Available backends: {', '.join(self.backends)}")
 
         self.test_gemm_basic(M=32, N=64, K=48)
diff --git a/transformer_engine/plugin/tests/test_optimizer.py b/transformer_engine/plugin/tests/test_optimizer.py
index 905c7ebbe2..75c072e308 100644
--- a/transformer_engine/plugin/tests/test_optimizer.py
+++ b/transformer_engine/plugin/tests/test_optimizer.py
@@ -17,7 +17,7 @@ class OptimizerTests(TestCase):
     def __init__(self, device="cpu"):
         super().__init__(
             "Optimizer Operations",
-            "Test correctness of multi_tensor optimizer operations across backends"
+            "Test correctness of multi_tensor optimizer operations across backends",
         )
         self.backends = get_available_backends()
         self.device = device
@@ -39,8 +39,10 @@ def test_multi_tensor_scale(self, num_tensors=4, shape=(64, 128)):
             backend = get_backend(backend_name)
             try:
                 # Create input tensors
-                input_tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
-                                 for _ in range(num_tensors)]
+                input_tensors = [
+                    generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
                 # Create output tensors (will be filled by the function)
                 output_tensors = [torch.empty_like(t) for t in input_tensors]
                 # Create reference tensors
@@ -52,14 +54,17 @@ def test_multi_tensor_scale(self, num_tensors=4, shape=(64, 128)):
                     chunk_size=2048,
                     noop_flag=noop_flag,
                     tensor_lists=[input_tensors, output_tensors],
-                    scale=scale
+                    scale=scale,
                 )
 
                 # Compare results
                 for i, (output, reference) in enumerate(zip(output_tensors, ref_tensors)):
                     self.assert_close(
-                        output, reference, rtol=1e-5, atol=1e-7,
-                        msg=f"multi_tensor_scale tensor {i} mismatch for {backend_name}"
+                        output,
+                        reference,
+                        rtol=1e-5,
+                        atol=1e-7,
+                        msg=f"multi_tensor_scale tensor {i} mismatch for {backend_name}",
                     )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -75,8 +80,10 @@ def test_multi_tensor_l2norm(self, num_tensors=4, shape=(64, 128)):
         for backend_name in self.backends:
             backend = get_backend(backend_name)
             try:
-                tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
-                           for _ in range(num_tensors)]
+                tensors = [
+                    generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
 
                 # Reference computation
                 ref_norm = self._reference_multi_tensor_l2norm(tensors, per_tensor=False)
@@ -84,10 +91,7 @@ def test_multi_tensor_l2norm(self, num_tensors=4, shape=(64, 128)):
                 # Backend computation
                 noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
                 output_norm = backend.multi_tensor_l2norm(
-                    chunk_size=2048,
-                    noop_flag=noop_flag,
-                    tensor_lists=[tensors],
-                    per_tensor=False
+                    chunk_size=2048, noop_flag=noop_flag, tensor_lists=[tensors], per_tensor=False
                 )
 
                 # CUDA backend returns tuple (norm, per_tensor_norms), extract the first element
@@ -95,8 +99,11 @@ def test_multi_tensor_l2norm(self, num_tensors=4, shape=(64, 128)):
                     output_norm = output_norm[0]
 
                 self.assert_close(
-                    output_norm, ref_norm, rtol=1e-4, atol=1e-6,
-                    msg=f"multi_tensor_l2norm total norm mismatch for {backend_name}"
+                    output_norm,
+                    ref_norm,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"multi_tensor_l2norm total norm mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -107,13 +114,18 @@ def test_multi_tensor_l2norm(self, num_tensors=4, shape=(64, 128)):
                 print(f"    ✗ {backend_name}: {e}")
 
     def test_multi_tensor_l2norm_per_tensor(self, num_tensors=4, shape=(64, 128)):
-        print(f"\n  Testing multi_tensor_l2norm per_tensor with {num_tensors} tensors of shape {shape}")
+        print(
+            f"\n  Testing multi_tensor_l2norm per_tensor with {num_tensors} tensors of shape"
+            f" {shape}"
+        )
 
         for backend_name in self.backends:
             backend = get_backend(backend_name)
             try:
-                tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
-                           for _ in range(num_tensors)]
+                tensors = [
+                    generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
 
                 # Reference computation
                 ref_norms = self._reference_multi_tensor_l2norm(tensors, per_tensor=True)
@@ -121,10 +133,7 @@ def test_multi_tensor_l2norm_per_tensor(self, num_tensors=4, shape=(64, 128)):
                 # Backend computation
                 noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
                 output_norms = backend.multi_tensor_l2norm(
-                    chunk_size=2048,
-                    noop_flag=noop_flag,
-                    tensor_lists=[tensors],
-                    per_tensor=True
+                    chunk_size=2048, noop_flag=noop_flag, tensor_lists=[tensors], per_tensor=True
                 )
 
                 # CUDA backend returns tuple (total_norm, per_tensor_norms), extract second element
@@ -133,8 +142,11 @@ def test_multi_tensor_l2norm_per_tensor(self, num_tensors=4, shape=(64, 128)):
 
                 for i, (output, reference) in enumerate(zip(output_norms, ref_norms)):
                     self.assert_close(
-                        output, reference, rtol=1e-4, atol=1e-6,
-                        msg=f"multi_tensor_l2norm per_tensor {i} mismatch for {backend_name}"
+                        output,
+                        reference,
+                        rtol=1e-4,
+                        atol=1e-6,
+                        msg=f"multi_tensor_l2norm per_tensor {i} mismatch for {backend_name}",
                     )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -158,10 +170,14 @@ def test_multi_tensor_adam(self, num_tensors=3, shape=(32, 64)):
             backend = get_backend(backend_name)
             try:
                 # Create tensors for backend test
-                params = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
-                          for _ in range(num_tensors)]
-                grads = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
-                         for _ in range(num_tensors)]
+                params = [
+                    generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
+                grads = [
+                    generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
                 exp_avgs = [torch.zeros_like(p) for p in params]
                 exp_avg_sqs = [torch.zeros_like(p) for p in params]
 
@@ -172,8 +188,8 @@ def test_multi_tensor_adam(self, num_tensors=3, shape=(32, 64)):
                 ref_exp_avg_sqs = [torch.zeros_like(p) for p in params]
 
                 # Apply reference Adam step (matching the torch implementation)
-                bias_correction1 = 1 - beta1 ** step
-                bias_correction2 = 1 - beta2 ** step
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
 
                 for p, g, m, v in zip(ref_params, ref_grads, ref_exp_avgs, ref_exp_avg_sqs):
                     # AdamW style: weight decay applied to param first
@@ -205,14 +221,17 @@ def test_multi_tensor_adam(self, num_tensors=3, shape=(32, 64)):
                     step=step,
                     mode=1,  # AdamW mode
                     bias_correction=1,
-                    weight_decay=weight_decay
+                    weight_decay=weight_decay,
                 )
 
                 # Compare results with relaxed tolerance
                 for i, (output, reference) in enumerate(zip(params, ref_params)):
                     self.assert_close(
-                        output, reference, rtol=1e-3, atol=1e-5,
-                        msg=f"multi_tensor_adam param {i} mismatch for {backend_name}"
+                        output,
+                        reference,
+                        rtol=1e-3,
+                        atol=1e-5,
+                        msg=f"multi_tensor_adam param {i} mismatch for {backend_name}",
                     )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -252,17 +271,27 @@ def _param_remainder_to_fp32(self, param, remainder):
         return (high | low).view(torch.float32)
 
     def _reference_adam_param_remainder(
-        self, grads, params, exp_avgs, exp_avg_sqs, param_remainders,
-        lr, beta1, beta2, epsilon, step, mode, bias_correction, weight_decay
+        self,
+        grads,
+        params,
+        exp_avgs,
+        exp_avg_sqs,
+        param_remainders,
+        lr,
+        beta1,
+        beta2,
+        epsilon,
+        step,
+        mode,
+        bias_correction,
+        weight_decay,
     ):
         """Pure-PyTorch reference for multi_tensor_adam_param_remainder."""
-        bc1 = 1 - beta1 ** step if bias_correction else 1.0
-        bc2 = 1 - beta2 ** step if bias_correction else 1.0
-        is_adamw = (mode == 1)
+        bc1 = 1 - beta1**step if bias_correction else 1.0
+        bc2 = 1 - beta2**step if bias_correction else 1.0
+        is_adamw = mode == 1
 
-        for g, p, m, v, p_rem in zip(
-            grads, params, exp_avgs, exp_avg_sqs, param_remainders
-        ):
+        for g, p, m, v, p_rem in zip(grads, params, exp_avgs, exp_avg_sqs, param_remainders):
             g_float = g.float()
             param_master = self._param_remainder_to_fp32(p, p_rem)
 
@@ -287,7 +316,10 @@ def _reference_adam_param_remainder(
             p_rem.copy_(new_rem)
 
     def test_multi_tensor_adam_param_remainder(self, num_tensors=3, shape=(32, 64)):
-        print(f"\n  Testing multi_tensor_adam_param_remainder with {num_tensors} tensors of shape {shape}")
+        print(
+            f"\n  Testing multi_tensor_adam_param_remainder with {num_tensors} tensors of shape"
+            f" {shape}"
+        )
 
         lr = 0.001
         beta1 = 0.9
@@ -301,10 +333,14 @@ def test_multi_tensor_adam_param_remainder(self, num_tensors=3, shape=(32, 64)):
             backend = get_backend(backend_name)
             try:
                 # Create FP32 master weights, then split into param + remainder
-                master_weights = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
-                                  for _ in range(num_tensors)]
-                grads = [generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
-                         for _ in range(num_tensors)]
+                master_weights = [
+                    generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
+                grads = [
+                    generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
+                    for _ in range(num_tensors)
+                ]
 
                 params = []
                 remainders = []
@@ -313,10 +349,14 @@ def test_multi_tensor_adam_param_remainder(self, num_tensors=3, shape=(32, 64)):
                     params.append(p.clone())
                     remainders.append(r.clone())
 
-                exp_avgs = [torch.zeros(shape, dtype=torch.float32, device=self.device)
-                            for _ in range(num_tensors)]
-                exp_avg_sqs = [torch.zeros(shape, dtype=torch.float32, device=self.device)
-                               for _ in range(num_tensors)]
+                exp_avgs = [
+                    torch.zeros(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
+                exp_avg_sqs = [
+                    torch.zeros(shape, dtype=torch.float32, device=self.device)
+                    for _ in range(num_tensors)
+                ]
 
                 # Clone for reference
                 ref_params = [p.clone() for p in params]
@@ -327,8 +367,19 @@ def test_multi_tensor_adam_param_remainder(self, num_tensors=3, shape=(32, 64)):
 
                 # Reference step
                 self._reference_adam_param_remainder(
-                    ref_grads, ref_params, ref_exp_avgs, ref_exp_avg_sqs, ref_remainders,
-                    lr, beta1, beta2, eps, step, mode, 1, weight_decay,
+                    ref_grads,
+                    ref_params,
+                    ref_exp_avgs,
+                    ref_exp_avg_sqs,
+                    ref_remainders,
+                    lr,
+                    beta1,
+                    beta2,
+                    eps,
+                    step,
+                    mode,
+                    1,
+                    weight_decay,
                 )
 
                 # Backend step
@@ -352,16 +403,34 @@ def test_multi_tensor_adam_param_remainder(self, num_tensors=3, shape=(32, 64)):
                     out_fp32 = self._param_remainder_to_fp32(params[i], remainders[i])
                     ref_fp32 = self._param_remainder_to_fp32(ref_params[i], ref_remainders[i])
                     self.assert_close(
-                        out_fp32, ref_fp32, rtol=1e-5, atol=1e-7,
-                        msg=f"multi_tensor_adam_param_remainder param {i} mismatch for {backend_name}"
+                        out_fp32,
+                        ref_fp32,
+                        rtol=1e-5,
+                        atol=1e-7,
+                        msg=(
+                            f"multi_tensor_adam_param_remainder param {i} mismatch for"
+                            f" {backend_name}"
+                        ),
                     )
                     self.assert_close(
-                        exp_avgs[i], ref_exp_avgs[i], rtol=1e-5, atol=1e-7,
-                        msg=f"multi_tensor_adam_param_remainder exp_avg {i} mismatch for {backend_name}"
+                        exp_avgs[i],
+                        ref_exp_avgs[i],
+                        rtol=1e-5,
+                        atol=1e-7,
+                        msg=(
+                            f"multi_tensor_adam_param_remainder exp_avg {i} mismatch for"
+                            f" {backend_name}"
+                        ),
                     )
                     self.assert_close(
-                        exp_avg_sqs[i], ref_exp_avg_sqs[i], rtol=1e-5, atol=1e-7,
-                        msg=f"multi_tensor_adam_param_remainder exp_avg_sq {i} mismatch for {backend_name}"
+                        exp_avg_sqs[i],
+                        ref_exp_avg_sqs[i],
+                        rtol=1e-5,
+                        atol=1e-7,
+                        msg=(
+                            f"multi_tensor_adam_param_remainder exp_avg_sq {i} mismatch for"
+                            f" {backend_name}"
+                        ),
                     )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -387,18 +456,24 @@ def _reference_multi_tensor_unscale_l2norm(self, tensors, inv_scale, per_tensor=
             return torch.sqrt(total_norm_sq)
 
     def test_multi_tensor_unscale_l2norm(self, num_tensors=4, shape=(64, 128)):
-        print(f"\n  Testing multi_tensor_unscale_l2norm with {num_tensors} tensors of shape {shape}")
+        print(
+            f"\n  Testing multi_tensor_unscale_l2norm with {num_tensors} tensors of shape {shape}"
+        )
 
         # Note: scale parameter is actually inv_scale (1/loss_scale)
         # For AMP with loss_scale=1024, inv_scale would be 1/1024
         inv_scale_value = 0.5  # equivalent to loss_scale = 2.0
-        tensors = [generate_random_tensor(shape, dtype=torch.float32, device=self.device)
-                   for _ in range(num_tensors)]
+        tensors = [
+            generate_random_tensor(shape, dtype=torch.float32, device=self.device)
+            for _ in range(num_tensors)
+        ]
         noop_flag = torch.tensor([0], dtype=torch.int32, device=self.device)
         inv_scale = torch.tensor([inv_scale_value], dtype=torch.float32, device=self.device)
 
         # Compute mathematical reference
-        reference_norm = self._reference_multi_tensor_unscale_l2norm(tensors, inv_scale, per_tensor=False)
+        reference_norm = self._reference_multi_tensor_unscale_l2norm(
+            tensors, inv_scale, per_tensor=False
+        )
 
         for backend_name in self.backends:
             backend = get_backend(backend_name)
@@ -408,7 +483,7 @@ def test_multi_tensor_unscale_l2norm(self, num_tensors=4, shape=(64, 128)):
                     noop_flag=noop_flag,
                     tensor_lists=[tensors],
                     inv_scale=inv_scale,
-                    per_tensor=False
+                    per_tensor=False,
                 )
 
                 # CUDA backend returns tuple (norm, per_tensor_norms), extract the first element
@@ -416,8 +491,11 @@ def test_multi_tensor_unscale_l2norm(self, num_tensors=4, shape=(64, 128)):
                     output_norm = output_norm[0]
 
                 self.assert_close(
-                    output_norm, reference_norm, rtol=1e-4, atol=1e-6,
-                    msg=f"multi_tensor_unscale_l2norm mismatch for {backend_name}"
+                    output_norm,
+                    reference_norm,
+                    rtol=1e-4,
+                    atol=1e-6,
+                    msg=f"multi_tensor_unscale_l2norm mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -428,9 +506,9 @@ def test_multi_tensor_unscale_l2norm(self, num_tensors=4, shape=(64, 128)):
                 print(f"    ✗ {backend_name}: {e}")
 
     def run_all_tests(self):
-        print("\n" + "="*60)
+        print("\n" + "=" * 60)
         print("Testing Optimizer Operations")
-        print("="*60)
+        print("=" * 60)
         print(f"Available backends: {', '.join(self.backends)}")
 
         # multi_tensor_scale tests
diff --git a/transformer_engine/plugin/tests/test_policy.py b/transformer_engine/plugin/tests/test_policy.py
index f56f5f2833..35b102a104 100644
--- a/transformer_engine/plugin/tests/test_policy.py
+++ b/transformer_engine/plugin/tests/test_policy.py
@@ -34,6 +34,7 @@ def setUp(self):
             PREFER_VENDOR,
             PREFER_REFERENCE,
         )
+
         self.SelectionPolicy = SelectionPolicy
         self.PREFER_DEFAULT = PREFER_DEFAULT
         self.PREFER_VENDOR = PREFER_VENDOR
@@ -170,16 +171,24 @@ def setUp(self):
             PolicyManager,
             reset_global_policy,
         )
+
         reset_global_policy()
         self.PolicyManager = PolicyManager
 
     def tearDown(self):
         """Clean up after each test"""
         from transformer_engine.plugin.core.policy import reset_global_policy
+
         reset_global_policy()
         # Clear any test environment variables
-        for key in ["TE_FL_PREFER", "TE_FL_PREFER_VENDOR", "TE_FL_STRICT",
-                    "TE_FL_DENY_VENDORS", "TE_FL_ALLOW_VENDORS", "TE_FL_PER_OP"]:
+        for key in [
+            "TE_FL_PREFER",
+            "TE_FL_PREFER_VENDOR",
+            "TE_FL_STRICT",
+            "TE_FL_DENY_VENDORS",
+            "TE_FL_ALLOW_VENDORS",
+            "TE_FL_PER_OP",
+        ]:
             os.environ.pop(key, None)
 
     def test_singleton_pattern(self):
@@ -247,18 +256,32 @@ class TestEnvironmentVariables(unittest.TestCase):
     def setUp(self):
         """Clear environment and reset policy"""
         from transformer_engine.plugin.core.policy import reset_global_policy
+
         reset_global_policy()
         # Clear all test env vars
-        for key in ["TE_FL_PREFER", "TE_FL_PREFER_VENDOR", "TE_FL_STRICT",
-                    "TE_FL_DENY_VENDORS", "TE_FL_ALLOW_VENDORS", "TE_FL_PER_OP"]:
+        for key in [
+            "TE_FL_PREFER",
+            "TE_FL_PREFER_VENDOR",
+            "TE_FL_STRICT",
+            "TE_FL_DENY_VENDORS",
+            "TE_FL_ALLOW_VENDORS",
+            "TE_FL_PER_OP",
+        ]:
             os.environ.pop(key, None)
 
     def tearDown(self):
         """Clean up environment"""
-        for key in ["TE_FL_PREFER", "TE_FL_PREFER_VENDOR", "TE_FL_STRICT",
-                    "TE_FL_DENY_VENDORS", "TE_FL_ALLOW_VENDORS", "TE_FL_PER_OP"]:
+        for key in [
+            "TE_FL_PREFER",
+            "TE_FL_PREFER_VENDOR",
+            "TE_FL_STRICT",
+            "TE_FL_DENY_VENDORS",
+            "TE_FL_ALLOW_VENDORS",
+            "TE_FL_PER_OP",
+        ]:
             os.environ.pop(key, None)
         from transformer_engine.plugin.core.policy import reset_global_policy
+
         reset_global_policy()
 
     def test_te_fl_prefer_flagos(self):
@@ -266,6 +289,7 @@ def test_te_fl_prefer_flagos(self):
         os.environ["TE_FL_PREFER"] = "flagos"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.prefer, "flagos")
@@ -276,6 +300,7 @@ def test_te_fl_prefer_vendor(self):
         os.environ["TE_FL_PREFER"] = "vendor"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.prefer, "vendor")
@@ -286,6 +311,7 @@ def test_te_fl_prefer_reference(self):
         os.environ["TE_FL_PREFER"] = "reference"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.prefer, "reference")
@@ -296,6 +322,7 @@ def test_te_fl_prefer_vendor_legacy(self):
         os.environ["TE_FL_PREFER_VENDOR"] = "1"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.prefer, "vendor")
@@ -307,6 +334,7 @@ def test_te_fl_prefer_overrides_legacy(self):
         os.environ["TE_FL_PREFER_VENDOR"] = "1"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.prefer, "reference")  # TE_FL_PREFER wins
@@ -317,6 +345,7 @@ def test_te_fl_strict(self):
         os.environ["TE_FL_STRICT"] = "1"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertTrue(policy.strict)
@@ -327,6 +356,7 @@ def test_te_fl_deny_vendors(self):
         os.environ["TE_FL_DENY_VENDORS"] = "rocm,dcu,intel"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.deny_vendors, frozenset({"rocm", "dcu", "intel"}))
@@ -337,6 +367,7 @@ def test_te_fl_allow_vendors(self):
         os.environ["TE_FL_ALLOW_VENDORS"] = "cuda,rocm"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.allow_vendors, frozenset({"cuda", "rocm"}))
@@ -347,6 +378,7 @@ def test_te_fl_per_op(self):
         os.environ["TE_FL_PER_OP"] = "layernorm_fwd=vendor|flagos;rmsnorm_fwd=flagos|reference"
 
         from transformer_engine.plugin.core.policy import policy_from_env
+
         policy = policy_from_env()
 
         self.assertEqual(policy.get_per_op_order("layernorm_fwd"), ["vendor", "flagos"])
@@ -360,11 +392,13 @@ class TestContextManagers(unittest.TestCase):
     def setUp(self):
         """Reset policy before each test"""
         from transformer_engine.plugin.core.policy import reset_global_policy
+
         reset_global_policy()
 
     def tearDown(self):
         """Clean up after test"""
         from transformer_engine.plugin.core.policy import reset_global_policy
+
         reset_global_policy()
 
     def test_policy_context(self):
diff --git a/transformer_engine/plugin/tests/test_softmax.py b/transformer_engine/plugin/tests/test_softmax.py
index f1272a4773..8bdf29dcc3 100644
--- a/transformer_engine/plugin/tests/test_softmax.py
+++ b/transformer_engine/plugin/tests/test_softmax.py
@@ -16,8 +16,7 @@
 class SoftmaxTests(TestCase):
     def __init__(self, device="cpu"):
         super().__init__(
-            "Softmax Operations",
-            "Test correctness of all softmax operations across backends"
+            "Softmax Operations", "Test correctness of all softmax operations across backends"
         )
         self.backends = get_available_backends()
         self.device = device
@@ -34,8 +33,11 @@ def test_scaled_softmax_forward(self, shape=(2, 4, 8, 16)):
             try:
                 output = backend.scaled_softmax_forward(x, scale)
                 self.assert_close(
-                    output, reference, rtol=1e-2, atol=1e-3,
-                    msg=f"Scaled softmax forward mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=1e-2,
+                    atol=1e-3,
+                    msg=f"Scaled softmax forward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -49,7 +51,9 @@ def test_scaled_softmax_backward(self, shape=(2, 4, 8, 16)):
         print(f"\n  Testing scaled softmax backward with shape {shape}")
 
         # Use bf16 for all computation to match backend precision
-        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.bfloat16, device=self.device, requires_grad=True
+        )
         scale = 0.125
         grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
 
@@ -71,8 +75,11 @@ def test_scaled_softmax_backward(self, shape=(2, 4, 8, 16)):
                     grad_output.clone(), softmax_out_test.clone(), scale
                 )
                 self.assert_close(
-                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
-                    msg=f"Scaled softmax backward mismatch for {backend_name}"
+                    grad_input.float(),
+                    reference_grad,
+                    rtol=1e-2,
+                    atol=1e-2,
+                    msg=f"Scaled softmax backward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -98,7 +105,7 @@ def test_scaled_masked_softmax_forward(self, shape=(2, 4, 8, 16)):
 
         # Additive mask for reference computation
         additive_mask = torch.zeros((batch, 1, seq_q, seq_k), dtype=x.dtype, device=self.device)
-        additive_mask = additive_mask.masked_fill(bool_mask, float('-inf'))
+        additive_mask = additive_mask.masked_fill(bool_mask, float("-inf"))
         additive_mask_expanded = additive_mask.expand(shape)
 
         # Reference: F.softmax(x * scale + additive_mask, dim=-1)
@@ -112,8 +119,11 @@ def test_scaled_masked_softmax_forward(self, shape=(2, 4, 8, 16)):
             try:
                 output = backend.scaled_masked_softmax_forward(x_test, uint8_mask, scale)
                 self.assert_close(
-                    output.float(), reference.float(), rtol=1e-2, atol=1e-3,
-                    msg=f"Scaled masked softmax forward mismatch for {backend_name}"
+                    output.float(),
+                    reference.float(),
+                    rtol=1e-2,
+                    atol=1e-3,
+                    msg=f"Scaled masked softmax forward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -127,7 +137,9 @@ def test_scaled_masked_softmax_backward(self, shape=(2, 4, 8, 16)):
         print(f"\n  Testing scaled masked softmax backward with shape {shape}")
 
         # Use bf16 for all computation
-        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.bfloat16, device=self.device, requires_grad=True
+        )
         scale = 0.125
         grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
 
@@ -149,8 +161,11 @@ def test_scaled_masked_softmax_backward(self, shape=(2, 4, 8, 16)):
                     grad_output.clone(), softmax_out_test.clone(), scale
                 )
                 self.assert_close(
-                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
-                    msg=f"Scaled masked softmax backward mismatch for {backend_name}"
+                    grad_input.float(),
+                    reference_grad,
+                    rtol=1e-2,
+                    atol=1e-2,
+                    msg=f"Scaled masked softmax backward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -168,8 +183,8 @@ def test_scaled_upper_triang_masked_softmax_forward(self, shape=(8, 16, 16)):
         seq_len = shape[-1]
 
         causal_mask = torch.triu(
-            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=self.device),
-            diagonal=1
+            torch.full((seq_len, seq_len), float("-inf"), dtype=x.dtype, device=self.device),
+            diagonal=1,
         )
         reference = F.softmax(x.float() * scale + causal_mask.float(), dim=-1).to(x.dtype)
 
@@ -178,8 +193,11 @@ def test_scaled_upper_triang_masked_softmax_forward(self, shape=(8, 16, 16)):
             try:
                 output = backend.scaled_upper_triang_masked_softmax_forward(x, scale)
                 self.assert_close(
-                    output, reference, rtol=1e-2, atol=1e-3,
-                    msg=f"Scaled upper triang masked softmax forward mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=1e-2,
+                    atol=1e-3,
+                    msg=f"Scaled upper triang masked softmax forward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -193,14 +211,16 @@ def test_scaled_upper_triang_masked_softmax_backward(self, shape=(8, 16, 16)):
         print(f"\n  Testing scaled upper triang masked softmax backward with shape {shape}")
 
         # Use bf16 for all computation
-        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.bfloat16, device=self.device, requires_grad=True
+        )
         scale = 0.125
         seq_len = shape[-1]
         grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
 
         causal_mask = torch.triu(
-            torch.full((seq_len, seq_len), float('-inf'), dtype=torch.float32, device=self.device),
-            diagonal=1
+            torch.full((seq_len, seq_len), float("-inf"), dtype=torch.float32, device=self.device),
+            diagonal=1,
         )
 
         # Compute reference gradient using autograd (in float32 for precision)
@@ -221,8 +241,11 @@ def test_scaled_upper_triang_masked_softmax_backward(self, shape=(8, 16, 16)):
                     grad_output.clone(), softmax_out_test.clone(), scale
                 )
                 self.assert_close(
-                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
-                    msg=f"Scaled upper triang masked softmax backward mismatch for {backend_name}"
+                    grad_input.float(),
+                    reference_grad,
+                    rtol=1e-2,
+                    atol=1e-2,
+                    msg=f"Scaled upper triang masked softmax backward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -245,8 +268,8 @@ def test_scaled_aligned_causal_masked_softmax_forward(self, shape=(2, 4, 16, 16)
 
         # Aligned causal mask (lower triangular)
         causal_mask = torch.triu(
-            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=self.device),
-            diagonal=1
+            torch.full((seq_len, seq_len), float("-inf"), dtype=x.dtype, device=self.device),
+            diagonal=1,
         )
         reference = F.softmax(x.float() * scale + causal_mask.float(), dim=-1).to(x.dtype)
 
@@ -255,8 +278,11 @@ def test_scaled_aligned_causal_masked_softmax_forward(self, shape=(2, 4, 16, 16)
             try:
                 output = backend.scaled_aligned_causal_masked_softmax_forward(x, scale)
                 self.assert_close(
-                    output, reference, rtol=1e-2, atol=1e-3,
-                    msg=f"Scaled aligned causal masked softmax forward mismatch for {backend_name}"
+                    output,
+                    reference,
+                    rtol=1e-2,
+                    atol=1e-3,
+                    msg=f"Scaled aligned causal masked softmax forward mismatch for {backend_name}",
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -274,14 +300,16 @@ def test_scaled_aligned_causal_masked_softmax_backward(self, shape=(2, 4, 16, 16
         print(f"\n  Testing scaled aligned causal masked softmax backward with shape {shape}")
 
         # Use bf16 for all computation
-        x = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device, requires_grad=True)
+        x = generate_random_tensor(
+            shape, dtype=torch.bfloat16, device=self.device, requires_grad=True
+        )
         scale = 0.125
         seq_len = shape[-1]
         grad_output = generate_random_tensor(shape, dtype=torch.bfloat16, device=self.device)
 
         causal_mask = torch.triu(
-            torch.full((seq_len, seq_len), float('-inf'), dtype=torch.float32, device=self.device),
-            diagonal=1
+            torch.full((seq_len, seq_len), float("-inf"), dtype=torch.float32, device=self.device),
+            diagonal=1,
         )
 
         # Compute reference gradient using autograd (in float32 for precision)
@@ -302,8 +330,13 @@ def test_scaled_aligned_causal_masked_softmax_backward(self, shape=(2, 4, 16, 16
                     grad_output.clone(), softmax_out_test.clone(), scale
                 )
                 self.assert_close(
-                    grad_input.float(), reference_grad, rtol=1e-2, atol=1e-2,
-                    msg=f"Scaled aligned causal masked softmax backward mismatch for {backend_name}"
+                    grad_input.float(),
+                    reference_grad,
+                    rtol=1e-2,
+                    atol=1e-2,
+                    msg=(
+                        f"Scaled aligned causal masked softmax backward mismatch for {backend_name}"
+                    ),
                 )
                 print(f"    ✓ {backend_name}")
             except NotImplementedError:
@@ -314,9 +347,9 @@ def test_scaled_aligned_causal_masked_softmax_backward(self, shape=(2, 4, 16, 16
                 print(f"    ✗ {backend_name}: {e}")
 
     def run_all_tests(self):
-        print("\n" + "="*60)
+        print("\n" + "=" * 60)
         print("Testing Softmax Operations")
-        print("="*60)
+        print("=" * 60)
         print(f"Available backends: {', '.join(self.backends)}")
 
         # Scaled softmax tests
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index d62bcc92ac..4e5a79e668 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -65,7 +65,7 @@
 # Save reference to native FlashAttention for fallback
 _FlashAttentionNative = FlashAttention
 # Use plugin system's flash_attention if available, otherwise use native
-FlashAttention = getattr(tex, 'flash_attention', _FlashAttentionNative)
+FlashAttention = getattr(tex, "flash_attention", _FlashAttentionNative)
 # Save the original get_attention_backend for backends that want to use default logic
 # CUDA backend can access this via dpa_utils._original_get_attention_backend
 dpa_utils._original_get_attention_backend = dpa_utils.get_attention_backend
diff --git a/transformer_engine/pytorch/ops/basic/rmsnorm.py b/transformer_engine/pytorch/ops/basic/rmsnorm.py
index 05597a14fa..1c4a19034f 100644
--- a/transformer_engine/pytorch/ops/basic/rmsnorm.py
+++ b/transformer_engine/pytorch/ops/basic/rmsnorm.py
@@ -27,7 +27,6 @@
 from .._common import maybe_autocast_dtype, maybe_dequantize
 
 
-
 class RMSNorm(BasicOperation):
     r"""Root Mean Square Layer Normalization
 
diff --git a/transformer_engine/pytorch/optimizers/__init__.py b/transformer_engine/pytorch/optimizers/__init__.py
index e54a17ae78..a19c797dea 100644
--- a/transformer_engine/pytorch/optimizers/__init__.py
+++ b/transformer_engine/pytorch/optimizers/__init__.py
@@ -13,4 +13,4 @@
 )
 from .fused_adam import FusedAdam
 from .fused_sgd import FusedSGD
-from .multi_tensor_apply import MultiTensorApply, multi_tensor_applier
\ No newline at end of file
+from .multi_tensor_apply import MultiTensorApply, multi_tensor_applier

From 47e8ee72d1b1e7a37cd8d6a7aae1950be0148e48 Mon Sep 17 00:00:00 2001
From: lihongyang1990 <119582226+lihongyang1990@users.noreply.github.com>
Date: Tue, 3 Mar 2026 10:03:41 +0800
Subject: [PATCH 36/59] Refactor optimizer implementations and improve
 multi_tensor ops (#36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Refactor and improve the FlagOS optimizer and multi_tensor
implementations to better match CUDA behavior and improve code quality.

## Changes

### `fused_adam.py` (FlagOS backend)
- Remove unused `inv_scale` and `out_dtype` parameters from
`multi_tensor_adam_fl`
- `multi_tensor_adam_param_remainder_fl`: rewrite FP32 master weight
reconstruction using bit manipulation (int16 high/low bits), matching
the CUDA implementation exactly

### `multi_tensor.py` (FlagOS backend)
- `multi_tensor_l2_norm_fl`: add proper type hints, noop_flag check,
inf/nan detection, and replace raw `**` / `+` operators with
`flag_gems.mul` / `flag_gems.add`
- `multi_tensor_scale_fl`: add type hints, noop_flag check, inf/nan
detection, and replace `src * scale` with `flag_gems.mul(src, scale)`

### `optimizer.py` (reference backend)
- Update `multi_tensor_l2norm_torch` and `multi_tensor_adam_torch` to
match new signatures and CUDA behavior (L2 vs AdamW mode split)
- Rewrite `multi_tensor_adam_param_remainder_torch` with bit
manipulation matching CUDA
- Rename `eps` → `epsilon` for consistency

### `optimizers/__init__.py`
- Export `multi_tensor_scale` and `multi_tensor_l2norm`

### Misc
- Fix missing newline at end of files
---
 .../core/backends/flagos/impl/fused_adam.py   | 126 +++++------
 .../core/backends/flagos/impl/multi_tensor.py |  60 ++++-
 .../core/backends/reference/impl/optimizer.py | 211 ++++++++++++------
 .../pytorch/optimizers/__init__.py            |   2 +
 4 files changed, 255 insertions(+), 144 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
index f148795381..95602c731f 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/fused_adam.py
@@ -2,7 +2,7 @@
 #
 # See LICENSE for license information.
 
-from typing import Optional, List
+from typing import List
 import torch
 import flag_gems
 
@@ -19,8 +19,6 @@ def multi_tensor_adam_fl(
     mode: int,
     bias_correction: int,
     weight_decay: float,
-    inv_scale: Optional[float] = 1.0,
-    out_dtype: Optional[torch.dtype] = None,
 ) -> None:
 
     num_lists = len(tensor_lists)
@@ -50,9 +48,6 @@ def multi_tensor_adam_fl(
         if not g.is_contiguous():
             g = g.contiguous()
 
-        if inv_scale is not None and inv_scale != 1.0:
-            g = flag_gems.mul(g, inv_scale)
-
         m = flag_gems.add_(flag_gems.mul_(m, beta1), g, alpha=1 - beta1)
         v = flag_gems.add_(
             flag_gems.mul_(v, beta2), flag_gems.mul_(flag_gems.mul_(g, g), 1 - beta2)
@@ -75,8 +70,6 @@ def multi_tensor_adam_fl(
 
         if p_master is not None:
             flag_gems.copy_(p_master, p)
-            out_dtype = p_master.dtype if out_dtype is None else out_dtype
-            p.data = p.data.to(out_dtype)
 
 
 def multi_tensor_adam_param_remainder_fl(
@@ -91,27 +84,9 @@ def multi_tensor_adam_param_remainder_fl(
     mode: int,
     bias_correction: int,
     weight_decay: float,
-    inv_scale: Optional[float] = 1.0,
 ) -> None:
     """
     Adam optimizer with parameter remainders for BF16 precision (FlagOS implementation).
-
-    This variant stores BF16 parameters + int16 remainders to reconstruct FP32 master weights.
-    Used when you have BF16 params and need FP32 master params without storing full FP32 copies.
-
-    Args:
-        chunk_size: Chunk size for processing (unused in this implementation)
-        noop_flag: If non-zero, skip computation
-        tensor_lists: [grads, params (bf16), exp_avgs (fp32), exp_avg_sqs (fp32), param_remainders (int16)]
-        lr: Learning rate
-        beta1: First moment decay rate
-        beta2: Second moment decay rate
-        eps: Epsilon for numerical stability
-        step: Current optimization step
-        mode: 0 = L2 regularization, 1 = AdamW (decoupled weight decay)
-        bias_correction: Whether to apply bias correction (1 = yes, 0 = no)
-        weight_decay: Weight decay coefficient
-        inv_scale: Inverse gradient scale for mixed precision training
     """
     if noop_flag.item() != 0:
         return
@@ -135,65 +110,78 @@ def multi_tensor_adam_param_remainder_fl(
 
     for i in range(num_tensors):
         g = tensor_lists[0][i]
-        p = tensor_lists[1][i]  # BF16 parameter
+        p = tensor_lists[1][i]  # int16 parameter (high 16 bits of FP32)
         m = tensor_lists[2][i]  # FP32 first moment
         v = tensor_lists[3][i]  # FP32 second moment
-        p_remainder = tensor_lists[4][i]  # int16 remainder
+        p_remainder = tensor_lists[4][i]  # int16 remainder (low 16 bits of FP32)
 
         if not g.is_contiguous():
             g = g.contiguous()
 
-        # Apply gradient unscaling if needed
-        if inv_scale is not None and inv_scale != 1.0:
-            g = flag_gems.mul(g, inv_scale)
+        # Convert gradient to float
+        g_float = g.float()
 
-        # Reconstruct FP32 master weight from BF16 param + int16 remainder
-        # The remainder represents the lower 16 bits lost in BF16 conversion
-        param_fp32 = p.float()
-        param_master = flag_gems.add(param_fp32, flag_gems.mul(p_remainder.float(), 2.0**-16))
+        # Reconstruct FP32 master weight from int16 param + int16 remainder using bit manipulation
+        # This matches the CUDA implementation exactly:
+        # 1. If p_remainder < 0, decrement p (undo rounding)
+        # 2. Combine high 16 bits (p) and low 16 bits (p_remainder) into FP32
+        # Note: Use PyTorch native ops for bit manipulation (int16/int32 operations)
 
-        # Compute gradient with weight decay (if L2 mode)
-        grad_with_decay = g.float()
-        if not is_adamw:  # L2 regularization mode
-            grad_with_decay = flag_gems.add(
-                grad_with_decay, flag_gems.mul(param_master, weight_decay)
-            )
+        local_p = p.view(torch.int16).clone()
+        local_p_rem = p_remainder.clone()
 
-        # Update moments
-        m = flag_gems.add_(flag_gems.mul_(m, beta1), grad_with_decay, alpha=1 - beta1)
-        v = flag_gems.add_(
-            flag_gems.mul_(v, beta2),
-            flag_gems.mul_(flag_gems.mul_(grad_with_decay, grad_with_decay), 1 - beta2),
-        )
+        # Undo rounding: if remainder < 0, decrement p
+        local_p = torch.where(local_p_rem < 0, local_p - 1, local_p)
+
+        # Combine into FP32 using bit shift operations
+        # local_p is high 16 bits, local_p_rem is low 16 bits
+        high_bits = local_p.to(torch.int32) << 16
+        low_bits = local_p_rem.to(torch.int32) & 0xFFFF  # Mask off sign extension
+        param_int32 = high_bits | low_bits
+        param_master = param_int32.view(torch.float32)
+
+        # L2 mode: add weight decay to gradient before updating moments
+        if not is_adamw and weight_decay != 0:
+            g_float = flag_gems.add(g_float, param_master, alpha=weight_decay)
+
+        # Update first moment: m = beta1 * m + (1 - beta1) * g
+        flag_gems.add_(flag_gems.mul_(m, beta1), g_float, alpha=1 - beta1)
+
+        # Update second moment: v = beta2 * v + (1 - beta2) * g^2
+        flag_gems.add_(flag_gems.mul_(v, beta2), flag_gems.mul(g_float, g_float), alpha=1 - beta2)
 
         # Apply bias correction
-        m_corr = m.clone()
-        v_corr = v.clone()
-        if bias_correction == 1:
-            m_corr = flag_gems.true_divide(m_corr, bias_correction1)
-            v_corr = flag_gems.true_divide(v_corr, bias_correction2)
+        m_corr = flag_gems.true_divide(m, bias_correction1)
+        v_corr = flag_gems.true_divide(v, bias_correction2)
+
+        # Compute denominator: sqrt(v_corr) + eps
+        denom = flag_gems.add(flag_gems.sqrt(v_corr), eps)
 
         # Compute update
-        update = flag_gems.true_divide(m_corr, flag_gems.add(flag_gems.sqrt(v_corr), eps))
+        update = flag_gems.true_divide(m_corr, denom)
 
-        # Apply weight decay (if AdamW mode)
-        if is_adamw:
-            param_master = flag_gems.mul_(param_master, 1 - lr * weight_decay)
+        # AdamW mode: add decoupled weight decay to update
+        if is_adamw and weight_decay != 0:
+            update = flag_gems.add(update, param_master, alpha=weight_decay)
 
-        # Update master weight
-        param_master = flag_gems.add_(param_master, update, alpha=-lr)
+        # Update master weight: p = p - lr * update
+        param_master = flag_gems.sub(param_master, flag_gems.mul(update, lr))
 
-        # Split back into BF16 param + int16 remainder
-        # Convert to BF16 (this is the rounded version)
-        param_bf16 = param_master.to(dtype=p.dtype)
+        # Split FP32 back into int16 param + int16 remainder using bit manipulation
+        # This matches the CUDA implementation exactly:
+        # 1. Extract high 16 bits as p
+        # 2. Extract low 16 bits as p_remainder
+        # 3. If p_remainder < 0, increment p (round up)
+        # Note: Use PyTorch native ops for bit manipulation (int32 operations)
 
-        # Compute remainder: difference between FP32 master and BF16 representation
-        # Scale and quantize to int16 range
-        remainder_fp32 = flag_gems.mul(flag_gems.sub(param_master, param_bf16.float()), 2.0**16)
-        remainder_int16 = flag_gems.clamp(torch.round(remainder_fp32), -32768, 32767).to(
-            dtype=torch.int16
-        )
+        param_int32 = param_master.view(torch.int32)
+        # Extract low 16 bits (remainder) and high 16 bits (param)
+        new_p_rem = (param_int32 & 0xFFFF).to(torch.int16)
+        new_p = ((param_int32 >> 16) & 0xFFFF).to(torch.int16)
+
+        # Round up: if remainder < 0, increment p
+        new_p = torch.where(new_p_rem < 0, new_p + 1, new_p)
 
         # Write back
-        flag_gems.copy_(p, param_bf16)
-        flag_gems.copy_(p_remainder, remainder_int16)
+        flag_gems.copy_(p, new_p.view(torch.bfloat16))
+        flag_gems.copy_(p_remainder, new_p_rem)
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
index 4421487ff1..d728a76242 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/multi_tensor.py
@@ -2,25 +2,67 @@
 #
 # See LICENSE for license information.
 
+from typing import List, Tuple
 import torch
-from torch.distributed._tensor import DTensor
 import flag_gems
 
 
-def multi_tensor_l2_norm_fl(chunk_size, noop_flag, tensor_lists, per_tensor, *args):
+def multi_tensor_l2_norm_fl(
+    _chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    per_tensor: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compute L2 norm of tensors using flag_gems.
+
+    Returns:
+        Tuple of (total_norm, per_tensor_norms_or_dummy)
+        - total_norm: The combined L2 norm of all tensors
+        - per_tensor_norms_or_dummy: Per-tensor norms stacked if per_tensor=True, else dummy tensor
+    """
+    device = tensor_lists[0][0].device if tensor_lists and tensor_lists[0] else "cpu"
+
+    if noop_flag.item() != 0:
+        return torch.tensor(0.0, device=device), torch.tensor(0.0, device=device)
 
     tensors = tensor_lists[0]
 
+    # Compute per-tensor norms
+    per_tensor_norms = []
+    total_norm_sq = torch.tensor(0.0, device=device)
+
+    for tensor in tensors:
+        t_float = tensor.float()
+        norm_sq = flag_gems.sum(flag_gems.mul(t_float, t_float))
+        # Check for inf/nan (matches CUDA behavior)
+        if not torch.isfinite(norm_sq):
+            noop_flag.fill_(1)
+        total_norm_sq = flag_gems.add(total_norm_sq, norm_sq)
+        if per_tensor:
+            per_tensor_norms.append(flag_gems.sqrt(norm_sq))
+
+    total_norm = flag_gems.sqrt(total_norm_sq)
+
     if per_tensor:
-        norms = [torch.norm(t.float(), p=2) for t in tensors]
-        return norms, None
+        per_tensor_result = torch.stack(per_tensor_norms)
     else:
-        total_norm_sq = sum(flag_gems.sum(flag_gems.pow_func(t.float(), 2)) for t in tensors)
-        total_norm = flag_gems.sqrt(total_norm_sq)
-        return total_norm, None
+        per_tensor_result = torch.tensor(0.0, device=device)
+
+    return total_norm, per_tensor_result
 
 
-def multi_tensor_scale_fl(chunk_size, noop_flag, tensor_lists, scale):
+def multi_tensor_scale_fl(
+    _chunk_size: int,
+    noop_flag: torch.Tensor,
+    tensor_lists: List[List[torch.Tensor]],
+    scale: float,
+) -> None:
+    if noop_flag.item() != 0:
+        return
 
     for src, dst in zip(tensor_lists[0], tensor_lists[1]):
-        flag_gems.copy_(dst, src * scale)
+        # Check for inf/nan (matches CUDA behavior for AMP gradient scaling)
+        if not torch.isfinite(src).all():
+            noop_flag.fill_(1)
+        flag_gems.copy_(dst, flag_gems.mul(src, scale))
diff --git a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
index ceac199837..890ae9a563 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/optimizer.py
@@ -2,7 +2,7 @@
 #
 # See LICENSE for license information.
 
-from typing import List, Union
+from typing import List, Tuple, Union
 import torch
 
 __all__ = [
@@ -33,6 +33,9 @@ def multi_tensor_scale_torch(
         raise ValueError("Output and input tensor lists must have the same length")
 
     for in_tensor, out_tensor in zip(input_tensors, output_tensors):
+        # Check for inf/nan (matches CUDA behavior for AMP gradient scaling)
+        if not torch.isfinite(in_tensor).all():
+            noop_flag.fill_(1)
         out_tensor.copy_(in_tensor * scale)
 
 
@@ -41,26 +44,43 @@ def multi_tensor_l2norm_torch(
     noop_flag: torch.Tensor,
     tensor_lists: List[List[torch.Tensor]],
     per_tensor: bool = False,
-) -> Union[torch.Tensor, List[torch.Tensor]]:
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compute L2 norm of tensors.
+
+    Returns:
+        Tuple of (total_norm, per_tensor_norms_or_dummy)
+        - total_norm: The combined L2 norm of all tensors
+        - per_tensor_norms_or_dummy: Per-tensor norms stacked if per_tensor=True, else dummy tensor
+    """
+    device = tensor_lists[0][0].device if tensor_lists and tensor_lists[0] else "cpu"
+
     if noop_flag.item() != 0:
-        if per_tensor:
-            return [torch.tensor(0.0, device=t.device) for t in tensor_lists[0]]
-        else:
-            return torch.tensor(0.0, device=tensor_lists[0][0].device)
+        return torch.tensor(0.0, device=device), torch.tensor(0.0, device=device)
 
     tensors = tensor_lists[0]
 
+    # Compute per-tensor norms
+    per_tensor_norms = []
+    total_norm_sq = torch.tensor(0.0, device=device)
+
+    for tensor in tensors:
+        norm_sq = torch.sum(tensor.float() ** 2)
+        # Check for inf/nan (matches CUDA behavior)
+        if not torch.isfinite(norm_sq):
+            noop_flag.fill_(1)
+        total_norm_sq = total_norm_sq + norm_sq
+        if per_tensor:
+            per_tensor_norms.append(torch.sqrt(norm_sq))
+
+    total_norm = torch.sqrt(total_norm_sq)
+
     if per_tensor:
-        norms = []
-        for tensor in tensors:
-            norm = torch.norm(tensor.float(), p=2)
-            norms.append(norm)
-        return norms
+        per_tensor_result = torch.stack(per_tensor_norms)
     else:
-        total_norm_sq = torch.tensor(0.0, device=tensors[0].device)
-        for tensor in tensors:
-            total_norm_sq += torch.sum(tensor.float() ** 2)
-        return torch.sqrt(total_norm_sq)
+        per_tensor_result = torch.tensor(0.0, device=device)
+
+    return total_norm, per_tensor_result
 
 
 def multi_tensor_adam_torch(
@@ -70,12 +90,18 @@ def multi_tensor_adam_torch(
     lr: float,
     beta1: float,
     beta2: float,
-    eps: float,
+    epsilon: float,
     step: int,
     mode: int,
     bias_correction: int,
     weight_decay: float,
 ) -> None:
+    """
+    Adam optimizer implementation matching CUDA exactly.
+
+    mode == 0: L2 regularization (add weight_decay * param to gradient before moment update)
+    mode == 1: AdamW (add weight_decay * param to update after moment computation)
+    """
     if noop_flag.item() != 0:
         return
 
@@ -98,18 +124,43 @@ def multi_tensor_adam_torch(
         if grad is None:
             continue
 
-        if mode == 1 and weight_decay != 0:
-            param.mul_(1 - lr * weight_decay)
+        # Convert to float for computation (matches CUDA's MATH_T = float)
+        g = grad.float()
+        p = param.float()
+
+        if mode == 0:  # L2 regularization
+            # Add weight decay to gradient before moment update
+            g = g + weight_decay * p
+
+            # Update moments with modified gradient
+            exp_avg.mul_(beta1).add_(g, alpha=1 - beta1)
+            exp_avg_sq.mul_(beta2).addcmul_(g, g, value=1 - beta2)
+
+            # Bias correction
+            m_corr = exp_avg / bias_correction1
+            v_corr = exp_avg_sq / bias_correction2
 
-        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+            # Compute update
+            denom = v_corr.sqrt().add_(epsilon)
+            update = m_corr / denom
 
-        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+            # Update parameter
+            param.add_(update, alpha=-lr)
+        else:  # mode == 1, AdamW (decoupled weight decay)
+            # Update moments with original gradient
+            exp_avg.mul_(beta1).add_(g, alpha=1 - beta1)
+            exp_avg_sq.mul_(beta2).addcmul_(g, g, value=1 - beta2)
 
-        corrected_exp_avg = exp_avg / bias_correction1
-        corrected_exp_avg_sq = exp_avg_sq / bias_correction2
+            # Bias correction
+            m_corr = exp_avg / bias_correction1
+            v_corr = exp_avg_sq / bias_correction2
 
-        denom = corrected_exp_avg_sq.sqrt().add_(eps)
-        param.addcdiv_(corrected_exp_avg, denom, value=-lr)
+            # Compute update with weight decay added (matches CUDA exactly)
+            denom = v_corr.sqrt().add_(epsilon)
+            update = (m_corr / denom) + (weight_decay * p)
+
+            # Update parameter
+            param.add_(update, alpha=-lr)
 
 
 def multi_tensor_adam_param_remainder_torch(
@@ -119,7 +170,7 @@ def multi_tensor_adam_param_remainder_torch(
     lr: float,
     beta1: float,
     beta2: float,
-    eps: float,
+    epsilon: float,
     step: int,
     mode: int,
     bias_correction: int,
@@ -128,17 +179,30 @@ def multi_tensor_adam_param_remainder_torch(
     """
     Adam optimizer with parameter remainders for BF16 precision.
 
-    This variant stores BF16 parameters + int16 remainders to reconstruct FP32 master weights.
-    Used when you have BF16 params and need FP32 master params without storing full FP32 copies.
+    This variant stores BF16 parameters + int16 remainders to reconstruct FP32 master weights
+    using bit manipulation, matching the CUDA implementation exactly.
+
+    The CUDA implementation stores:
+    - p: int16 representing the high 16 bits of FP32 (viewed as BF16)
+    - p_remainder: int16 representing the low 16 bits of FP32
+
+    To reconstruct FP32:
+    - If p_remainder < 0, decrement p (undo rounding)
+    - Combine: fp32.int16[1] = p, fp32.int16[0] = p_remainder
+
+    To split FP32 back:
+    - p = fp32.int16[1] (high 16 bits)
+    - p_remainder = fp32.int16[0] (low 16 bits)
+    - If p_remainder < 0, increment p (round up)
 
     Args:
         chunk_size: Chunk size for processing (unused in PyTorch implementation)
         noop_flag: If non-zero, skip computation
-        tensor_lists: [grads, params (bf16), exp_avgs (fp32), exp_avg_sqs (fp32), param_remainders (int16)]
+        tensor_lists: [grads, params (int16/bf16), exp_avgs (fp32), exp_avg_sqs (fp32), param_remainders (int16)]
         lr: Learning rate
         beta1: First moment decay rate
         beta2: Second moment decay rate
-        eps: Epsilon for numerical stability
+        epsilon: Epsilon for numerical stability
         step: Current optimization step
         mode: 0 = L2 regularization, 1 = AdamW (decoupled weight decay)
         bias_correction: Whether to apply bias correction (1 = yes, 0 = no)
@@ -166,61 +230,76 @@ def multi_tensor_adam_param_remainder_torch(
         bias_correction1 = 1.0
         bias_correction2 = 1.0
 
+    is_adamw = mode == 1
+
     for grad, param, exp_avg, exp_avg_sq, param_remainder in zip(
         grads, params, exp_avgs, exp_avg_sqs, param_remainders
     ):
-        if grad is None:
-            continue
+        # Convert gradient to float
+        g_float = grad.float()
 
-        # Reconstruct FP32 master weight from BF16 param + int16 remainder
-        # The CUDA implementation uses bit manipulation to combine them
-        # In PyTorch, we approximate this by:
-        # 1. Convert param (bf16) to fp32 - this gives us the high-precision bits
-        # 2. Add the remainder scaled appropriately
-        param_fp32 = param.float()
+        # Reconstruct FP32 master weight from int16 param + int16 remainder using bit manipulation
+        # This matches the CUDA implementation exactly:
+        # 1. If p_remainder < 0, decrement p (undo rounding)
+        # 2. Combine high 16 bits (p) and low 16 bits (p_remainder) into FP32
 
-        # The remainder represents the lower 16 bits lost in BF16 conversion
-        # We need to scale it back to the proper magnitude
-        # BF16 has 16 bits total (1 sign, 8 exponent, 7 mantissa)
-        # The remainder compensates for the lost precision
-        param_master = param_fp32 + param_remainder.float() * (2.0**-16)
+        local_p = param.view(torch.int16).clone()
+        local_p_rem = param_remainder.clone()
 
-        # Standard Adam update on FP32 master weight
-        if mode == 0:  # L2 regularization
-            grad_with_decay = grad.float() + weight_decay * param_master
-        else:  # mode == 1, AdamW
-            grad_with_decay = grad.float()
+        # Undo rounding: if remainder < 0, decrement p
+        local_p = torch.where(local_p_rem < 0, local_p - 1, local_p)
+
+        # Combine into FP32 using bit shift operations
+        # local_p is high 16 bits, local_p_rem is low 16 bits
+        high_bits = local_p.to(torch.int32) << 16
+        low_bits = local_p_rem.to(torch.int32) & 0xFFFF  # Mask off sign extension
+        param_int32 = high_bits | low_bits
+        param_master = param_int32.view(torch.float32)
+
+        # L2 mode: add weight decay to gradient before updating moments
+        if not is_adamw and weight_decay != 0:
+            g_float = g_float + weight_decay * param_master
 
-        # Update moments
-        exp_avg.mul_(beta1).add_(grad_with_decay, alpha=1 - beta1)
-        exp_avg_sq.mul_(beta2).addcmul_(grad_with_decay, grad_with_decay, value=1 - beta2)
+        # Update first moment: m = beta1 * m + (1 - beta1) * g
+        exp_avg.mul_(beta1).add_(g_float, alpha=1 - beta1)
+
+        # Update second moment: v = beta2 * v + (1 - beta2) * g^2
+        exp_avg_sq.mul_(beta2).addcmul_(g_float, g_float, value=1 - beta2)
 
         # Apply bias correction
-        corrected_exp_avg = exp_avg / bias_correction1
-        corrected_exp_avg_sq = exp_avg_sq / bias_correction2
+        m_corr = exp_avg / bias_correction1
+        v_corr = exp_avg_sq / bias_correction2
+
+        # Compute denominator: sqrt(v_corr) + epsilon
+        denom = torch.sqrt(v_corr) + epsilon
 
         # Compute update
-        denom = corrected_exp_avg_sq.sqrt().add_(eps)
-        update = corrected_exp_avg / denom
+        update = m_corr / denom
 
-        if mode == 1:  # AdamW: apply weight decay directly
+        # AdamW mode: add decoupled weight decay to update
+        if is_adamw and weight_decay != 0:
             update = update + weight_decay * param_master
 
-        # Update master weight
-        param_master.add_(update, alpha=-lr)
+        # Update master weight: p = p - lr * update
+        param_master = param_master - lr * update
+
+        # Split FP32 back into int16 param + int16 remainder using bit manipulation
+        # This matches the CUDA implementation exactly:
+        # 1. Extract high 16 bits as p
+        # 2. Extract low 16 bits as p_remainder
+        # 3. If p_remainder < 0, increment p (round up)
 
-        # Split back into BF16 param + int16 remainder
-        # Convert to BF16 (this is the rounded version)
-        param_bf16 = param_master.to(dtype=param.dtype)
+        param_int32 = param_master.view(torch.int32)
+        # Extract low 16 bits (remainder) and high 16 bits (param)
+        new_p_rem = (param_int32 & 0xFFFF).to(torch.int16)
+        new_p = ((param_int32 >> 16) & 0xFFFF).to(torch.int16)
 
-        # Compute remainder: difference between FP32 master and BF16 representation
-        # Scale and quantize to int16 range
-        remainder_fp32 = (param_master - param_bf16.float()) * (2.0**16)
-        remainder_int16 = remainder_fp32.round().clamp(-32768, 32767).to(dtype=torch.int16)
+        # Round up: if remainder < 0, increment p
+        new_p = torch.where(new_p_rem < 0, new_p + 1, new_p)
 
         # Write back
-        param.copy_(param_bf16)
-        param_remainder.copy_(remainder_int16)
+        param.view(torch.int16).copy_(new_p)
+        param_remainder.copy_(new_p_rem)
 
 
 def multi_tensor_sgd_torch(
diff --git a/transformer_engine/pytorch/optimizers/__init__.py b/transformer_engine/pytorch/optimizers/__init__.py
index a19c797dea..c76f75743d 100644
--- a/transformer_engine/pytorch/optimizers/__init__.py
+++ b/transformer_engine/pytorch/optimizers/__init__.py
@@ -4,6 +4,8 @@
 
 """Fused optimizers and multi-tensor kernels."""
 from transformer_engine_torch import (
+    multi_tensor_scale,
+    multi_tensor_l2norm,
     multi_tensor_unscale_l2norm,
     multi_tensor_adam,
     multi_tensor_adam_fp8,

From acced6d73e6f52e422826bb7141fbfa6b76003d5 Mon Sep 17 00:00:00 2001
From: jiamingwang-mt <jiaming.wang@mthreads.com>
Date: Wed, 11 Mar 2026 11:13:57 +0800
Subject: [PATCH 37/59] tefl musa support (#42)

# Description

Add Musa backend
---
 .../core/backends/vendor/musa/__init__.py     |    7 +
 .../backends/vendor/musa/flash_attention.py   |  127 ++
 .../plugin/core/backends/vendor/musa/musa.py  | 1635 +++++++++++++++++
 .../core/backends/vendor/musa/register_ops.py |  955 ++++++++++
 transformer_engine/plugin/core/builtin_ops.py |    9 +
 5 files changed, 2733 insertions(+)
 create mode 100644 transformer_engine/plugin/core/backends/vendor/musa/__init__.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/musa/musa.py
 create mode 100644 transformer_engine/plugin/core/backends/vendor/musa/register_ops.py

diff --git a/transformer_engine/plugin/core/backends/vendor/musa/__init__.py b/transformer_engine/plugin/core/backends/vendor/musa/__init__.py
new file mode 100644
index 0000000000..a76d0b41fd
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/musa/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from .musa import MUSABackend
+
+__all__ = ["MUSABackend"]
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py
new file mode 100644
index 0000000000..1ef37407d4
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+from contextlib import nullcontext
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from transformer_engine.plugin.core.ops import FlashAttentionBase
+
+
+class FlashAttentionMUSA(FlashAttentionBase):
+    def __init__(
+        self,
+        softmax_scale: float,
+        attention_dropout: float = 0.0,
+        attention_dropout_ctx: Optional[Callable] = None,
+        attention_type: str = "self",
+        layer_number: Optional[int] = None,
+        deterministic: bool = False,
+    ) -> None:
+        super().__init__(
+            softmax_scale=softmax_scale,
+            attention_dropout=attention_dropout,
+            attention_dropout_ctx=attention_dropout_ctx,
+            attention_type=attention_type,
+            layer_number=layer_number,
+            deterministic=deterministic,
+        )
+
+        # Store initialization parameters for lazy loading
+        self._init_params = {
+            "softmax_scale": softmax_scale,
+            "attention_dropout": attention_dropout,
+            "attention_dropout_ctx": attention_dropout_ctx or nullcontext,
+            "attention_type": attention_type,
+            "layer_number": layer_number,
+            "deterministic": deterministic,
+        }
+        self._musa_flash_attn = None
+
+    def _ensure_musa_flash_attn(self):
+        """Lazy initialization of musa FlashAttention."""
+        if self._musa_flash_attn is not None:
+            return
+
+        try:
+            # Import here to avoid circular dependency issues
+            # transformer_engine_torch must be registered before this import
+            from transformer_engine_musa.pytorch.attention import (
+                FlashAttention as FlashAttentionMusa,
+            )
+
+            if FlashAttentionMusa is None:
+                raise RuntimeError(
+                    "FlashAttention class is None - flash-attn may not be installed correctly"
+                )
+
+            self._musa_flash_attn = FlashAttentionMusa(**self._init_params)
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"Failed to import musa FlashAttention: {e}. "
+                "Please ensure flash-attn is installed and transformer_engine_torch is available."
+            )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to initialize musa FlashAttention: {e}. Init params: {self._init_params}"
+            )
+
+    @property
+    def backend_name(self) -> str:
+        return "musa"
+
+    def _forward_impl(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
+        qkv_layout: str = "sbh3d",
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        attn_mask_type: str = "causal",
+        window_size: Optional[Tuple[int, int]] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+        cp_group: Optional[Any] = None,
+        cp_global_ranks: Optional[List[int]] = None,
+        cp_stream: Optional[torch.musa.Stream] = None,
+        cp_comm_type: str = "p2p",
+        fp8: bool = False,
+        fp8_meta: Optional[Dict[str, Any]] = None,
+        quantizers: Optional[Any] = None,
+        inference_params: Optional[Any] = None,
+        flash_attention_backend: Optional[Any] = None,
+        fp8_output: bool = False,
+    ) -> torch.Tensor:
+        # Ensure musa flash attention is initialized
+        self._ensure_musa_flash_attn()
+
+        return self._musa_flash_attn(
+            query_layer=query_layer,
+            key_layer=key_layer,
+            value_layer=value_layer,
+            attention_mask=attention_mask,
+            qkv_layout=qkv_layout,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_kv=cu_seqlens_kv,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_kv=max_seqlen_kv,
+            attn_mask_type=attn_mask_type,
+            window_size=window_size,
+            alibi_slopes=alibi_slopes,
+            cp_group=cp_group,
+            cp_global_ranks=cp_global_ranks,
+            cp_stream=cp_stream,
+            cp_comm_type=cp_comm_type,
+            fp8=fp8,
+            fp8_meta=fp8_meta,
+            quantizers=quantizers,
+            inference_params=inference_params,
+            flash_attention_backend=flash_attention_backend,
+            fp8_output=fp8_output,
+        )
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/musa.py b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
new file mode 100644
index 0000000000..281b091079
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
@@ -0,0 +1,1635 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+import os
+import sys
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+from ....ops import *
+
+
+def _load_musa_libs():
+    import ctypes
+    import os
+    import subprocess
+    from pathlib import Path
+    import importlib.util
+    import sysconfig
+    import platform
+    import glob as glob_module
+
+    def get_ext():
+        system = platform.system()
+        return ".so" if system == "Linux" else ".dylib" if system == "Darwin" else ".dll"
+
+    ext = get_ext()
+
+    def try_load_lib(name, search_patterns):
+        for env_var in [f"{name.upper()}_HOME", f"{name.upper()}_PATH"]:
+            path = os.environ.get(env_var)
+            if path:
+                libs = glob_module.glob(f"{path}/**/lib{name}{ext}*", recursive=True)
+                if libs:
+                    libs.sort(reverse=True, key=os.path.basename)
+                    try:
+                        return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+                    except:
+                        pass
+
+        musa_home = os.environ.get("MUSA_HOME") or os.environ.get("MUSA_PATH") or "/usr/local/musa"
+        for pattern in search_patterns:
+            libs = glob_module.glob(f"{musa_home}/**/{pattern}", recursive=True)
+            if libs:
+                libs.sort(reverse=True, key=os.path.basename)
+                try:
+                    return ctypes.CDLL(libs[0], mode=ctypes.RTLD_GLOBAL)
+                except:
+                    pass
+
+        try:
+            result = subprocess.check_output(f"ldconfig -p | grep 'lib{name}{ext}'", shell=True)
+            for line in result.decode().split("\n"):
+                if f"lib{name}" in line and "=>" in line:
+                    so_path = line.split(">")[1].strip()
+                    if so_path:
+                        return ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL)
+        except:
+            pass
+
+        try:
+            return ctypes.CDLL(f"lib{name}{ext}", mode=ctypes.RTLD_GLOBAL)
+        except:
+            return None
+
+    try:
+        import transformer_engine_musa
+
+        return True
+    except Exception as e:
+        print(f"[MUSA] Failed to load MUSA libs: {e}")
+        return False
+
+
+_musa_libs_loaded = False
+
+
+def _ensure_musa_libs():
+    global _musa_libs_loaded
+    if not _musa_libs_loaded:
+        _musa_libs_loaded = _load_musa_libs()
+    return _musa_libs_loaded
+
+
+def _check_musa_available() -> bool:
+    try:
+        if not torch.musa.is_available():
+            return False
+        else:
+            return True
+    except Exception as e:
+        return False
+
+
+def _get_tex():
+    _ensure_musa_libs()
+    import transformer_engine_musa
+    import transformer_engine_musa_torch
+
+    return transformer_engine_musa_torch
+
+
+class MUSABackend(TEFLBackendBase):
+    @staticmethod
+    def check_available() -> bool:
+        return _check_musa_available()
+
+    def __init__(self):
+        self._tex = None
+
+    def _get_tex(self):
+        if self._tex is None:
+            self._tex = _get_tex()
+        return self._tex
+
+    def is_available(self) -> bool:
+        return _check_musa_available()
+
+    def get_attention_backend(self, attention_params=None):
+        """
+        MUSA backend uses the default attention backend selection logic.
+        This allows hardware-specific checks and optimizations for MUSA devices.
+        Returns:
+            Tuple of (use_flash_attention, flash_attention_backend, use_fused_attention,
+                     fused_attention_backend, use_unfused_attention, available_backends)
+        """
+        # Import the original get_attention_backend function
+        from transformer_engine_musa.pytorch.attention import (
+            get_attention_backend as _original_get_attention_backend,
+        )
+
+        return _original_get_attention_backend(attention_params)
+
+    ##### transformer_engine/pytorch/csrc/extensions/pybind.cpp #####
+    def quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        output: Optional[torch.Tensor] = None,
+        noop: Optional[torch.Tensor] = None,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.quantize(tensor, quantizer, output, noop)
+
+    def dequantize(
+        self,
+        input: Any,
+        otype: DType,
+    ) -> Any:
+        tex = self._get_tex()
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.dequantize(input, otype)
+
+    def bgrad_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.bgrad_quantize(input, quantizer)
+
+    def generic_gemm(
+        self,
+        A: Any,
+        transA: bool,
+        B: Any,
+        transB: bool,
+        D: Any,
+        quantizer: Any,
+        output_dtype: Optional[DType],
+        bias: Optional[torch.Tensor],
+        bias_type: DType,
+        gelu: bool,
+        gelu_in: Optional[torch.Tensor],
+        grad: bool,
+        workspace: torch.Tensor,
+        workspace_size: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        comm_overlap: Optional[Any] = None,
+        comm_type: Optional[CommOverlapType] = None,
+        extra_output: Optional[torch.Tensor] = None,
+        bulk_overlap: bool = False,
+        alpha: float = 1.0,
+        beta: Optional[float] = None,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        comm_type = tex.CommOverlapType(int(comm_type)) if comm_type is not None else None
+        output_dtype = tex.DType(int(output_dtype)) if output_dtype is not None else None
+        return tex.generic_gemm(
+            A,
+            transA,
+            B,
+            transB,
+            D,
+            quantizer,
+            output_dtype,
+            bias,
+            bias_type,
+            gelu,
+            gelu_in,
+            grad,
+            workspace,
+            workspace_size,
+            accumulate,
+            use_split_accumulator,
+            comm_overlap,
+            comm_type,
+            extra_output,
+            bulk_overlap,
+            alpha,
+            beta,
+        )
+
+    # GELU and variants #
+    def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.gelu(input, quantizer)
+
+    def geglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.geglu(input, quantizer)
+
+    def qgelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgelu(input, quantizer)
+
+    def qgeglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.qgeglu(input, quantizer)
+
+    # ReLU and variants #
+    def relu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.relu(input, quantizer)
+
+    def reglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.reglu(input, quantizer)
+
+    def srelu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.srelu(input, quantizer)
+
+    def sreglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.sreglu(input, quantizer)
+
+    # SwiGLU and variants #
+    def silu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.silu(input, quantizer)
+
+    def swiglu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.swiglu(input, quantizer)
+
+    def clamped_swiglu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_swiglu(input, quantizer, limit, alpha)
+
+    # Backward of GELU and variants #
+    def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgelu(grad, fwd_input, quantizer)
+
+    def dgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dgeglu(grad, fwd_input, quantizer)
+
+    def dqgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgelu(grad, fwd_input, quantizer)
+
+    def dqgeglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dqgeglu(grad, fwd_input, quantizer)
+
+    # Backward of ReLU and variants #
+    def drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.drelu(grad, fwd_input, quantizer)
+
+    def dreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dreglu(grad, fwd_input, quantizer)
+
+    def dsrelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsrelu(grad, fwd_input, quantizer)
+
+    def dsreglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsreglu(grad, fwd_input, quantizer)
+
+    # Backward of SiLU and variants #
+    def dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dsilu(grad, fwd_input, quantizer)
+
+    def dswiglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dswiglu(grad, fwd_input, quantizer)
+
+    def clamped_dswiglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+        limit: float = 7.0,
+        alpha: float = 1.702,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.clamped_dswiglu(grad, fwd_input, quantizer, limit, alpha)
+
+    # DBias + DAct fusions #
+    def dbias_dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+        tex = self._get_tex()
+        return tex.dbias_dgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsilu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsilu(grad, fwd_input, quantizer)
+
+    def dbias_drelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> List[Any]:
+        tex = self._get_tex()
+        return tex.dbias_drelu(grad, fwd_input, quantizer)
+
+    def dbias_dqgelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.dbias_dqgelu(grad, fwd_input, quantizer)
+
+    def dbias_dsrelu(
+        self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.dbias_dsrelu(grad, fwd_input, quantizer)
+
+    # Permutation functions
+    def moe_permute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        indices: torch.Tensor,
+        num_out_tokens: int,
+        workspace: List[torch.Tensor],
+        max_expanded_token_num: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_fwd(
+            input, dtype, indices, num_out_tokens, workspace, max_expanded_token_num
+        )
+
+    def moe_permute_bwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_permute_bwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
+    def moe_unpermute_fwd(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+        num_tokens: int,
+        topK: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_fwd(input, dtype, row_id_map, prob, num_tokens, topK)
+
+    def moe_unpermute_bwd(
+        self,
+        input_bwd: torch.Tensor,
+        input_fwd: torch.Tensor,
+        dtype: DType,
+        row_id_map: torch.Tensor,
+        prob: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.moe_unpermute_bwd(input_bwd, input_fwd, dtype, row_id_map, prob)
+
+    # Softmax functions
+    def scaled_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_forward(input, scale)
+
+    def scaled_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_forward(input, mask, scale_factor)
+
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_masked_softmax_backward(output_grad_, softmax_results_, scale_factor)
+
+    def scaled_upper_triang_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_forward(input, scale_factor)
+
+    def scaled_upper_triang_masked_softmax_backward(
+        self,
+        output_grads_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_upper_triang_masked_softmax_backward(
+            output_grads_, softmax_results_, scale_factor
+        )
+
+    def scaled_aligned_causal_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_forward(input, scale_factor)
+
+    def scaled_aligned_causal_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.scaled_aligned_causal_masked_softmax_backward(
+            output_grad_, softmax_results_, scale_factor
+        )
+
+    # Other granular functions
+    def layernorm_fwd(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        eps: float,
+        ln_out: Any,
+        quantizer: Any,
+        otype: DType,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
+        tex = self._get_tex()
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.layernorm_fwd(
+            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+    def layernorm_bwd(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.layernorm_bwd(dz, x, mu, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+    def rmsnorm_fwd(
+        self,
+        input: Any,
+        weight: Any,
+        eps: float,
+        ln_out: Any,
+        quantizer: Any,
+        otype: DType,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
+        tex = self._get_tex()
+        otype = tex.DType(int(otype)) if otype is not None else None
+        return tex.rmsnorm_fwd(
+            input, weight, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+        )
+
+    def rmsnorm_bwd(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.rmsnorm_bwd(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+    def rmsnorm_bwd_add(
+        self,
+        dz: torch.Tensor,
+        x: torch.Tensor,
+        add: torch.Tensor,
+        rsigma: torch.Tensor,
+        gamma: torch.Tensor,
+        sm_margin: int,
+        zero_centered_gamma: bool,
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.rmsnorm_bwd_add(dz, x, add, rsigma, gamma, sm_margin, zero_centered_gamma)
+
+    def multi_tensor_quantize(
+        self,
+        tensor_list: List[torch.Tensor],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.multi_tensor_quantize(tensor_list, quantizer_list)
+
+    def split_quantize(
+        self,
+        tensor: torch.Tensor,
+        split_sections: List[int],
+        quantizer_list: List[Any],
+    ) -> List[Any]:
+        tex = self._get_tex()
+        return tex.split_quantize(tensor, split_sections, quantizer_list)
+
+    def te_general_grouped_gemm(
+        self,
+        A: List[Any],
+        transa: bool,
+        B: List[Any],
+        transb: bool,
+        D: Optional[List[torch.Tensor]],
+        D_type: DType,
+        m_splits: List[int],
+        bias: List[torch.Tensor],
+        bias_type: DType,
+        single_output: bool,
+        pre_gelu_out: List[torch.Tensor],
+        grad: bool,
+        workspace: List[torch.Tensor],
+        workspaceSizes: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        math_sm_count: int,
+    ) -> Optional[List[torch.Tensor]]:
+        tex = self._get_tex()
+        D_type = tex.DType(int(D_type)) if D_type is not None else None
+        bias_type = tex.DType(int(bias_type)) if bias_type is not None else None
+        return tex.te_general_grouped_gemm(
+            A,
+            transa,
+            B,
+            transb,
+            D,
+            D_type,
+            m_splits,
+            bias,
+            bias_type,
+            single_output,
+            pre_gelu_out,
+            grad,
+            workspace,
+            workspaceSizes,
+            accumulate,
+            use_split_accumulator,
+            math_sm_count,
+        )
+
+    def fp8_transpose(
+        self,
+        input: torch.Tensor,
+        dtype: DType,
+        out: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        dtype = tex.DType(int(dtype)) if dtype is not None else None
+        return tex.fp8_transpose(input, dtype, out)
+
+    def swap_first_dims(
+        self,
+        tensor: torch.Tensor,
+        out: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.swap_first_dims(tensor, out)
+
+    def get_fused_attn_backend(
+        self,
+        is_training: bool,
+        q_dtype: DType,
+        kv_dtype: DType,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        p_dropout: float,
+        num_attn_heads: int,
+        num_gqa_groups: int,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        head_dim_qk: int,
+        head_dim_v: int,
+        window_size_left: int,
+        window_size_right: int,
+        return_max_logit: bool,
+    ) -> NVTE_Fused_Attn_Backend:
+        tex = self._get_tex()
+
+        q_dtype = tex.DType(int(q_dtype)) if q_dtype is not None else None
+        kv_dtype = tex.DType(int(kv_dtype)) if kv_dtype is not None else None
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
+
+        result = tex.get_fused_attn_backend(
+            is_training,
+            q_dtype,
+            kv_dtype,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            p_dropout,
+            num_attn_heads,
+            num_gqa_groups,
+            max_seqlen_q,
+            max_seqlen_kv,
+            head_dim_qk,
+            head_dim_v,
+            window_size_left,
+            window_size_right,
+            return_max_logit,
+        )
+        return NVTE_Fused_Attn_Backend(result)
+
+    def compute_amax(
+        self,
+        input: torch.Tensor,
+        amax: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.compute_amax(input, amax)
+
+    def fused_amax_and_scale_update_after_reduction(
+        self,
+        amax_reduction_buffer: torch.Tensor,
+        amax_histories: List[torch.Tensor],
+        scales: List[torch.Tensor],
+        amax_compute_algo: str,
+        fp8_dtype: DType,
+        margin: float,
+    ) -> None:
+        tex = self._get_tex()
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.fused_amax_and_scale_update_after_reduction(
+            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
+        )
+
+    def fp8_block_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.fp8_block_scaling_compute_partial_amax(
+            tensor, amax, h, w, start_offset, block_len
+        )
+
+    def fp8_block_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
+        tex = self._get_tex()
+        out_dtype = tex.DType(int(out_dtype)) if out_dtype is not None else None
+        return tex.fp8_block_scaling_partial_cast(
+            inp, out, scale, h, w, start_offset, block_len, out_dtype
+        )
+
+    def fused_multi_row_padding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        padded_input_row_list: List[int],
+    ) -> None:
+        tex = self._get_tex()
+        return tex.fused_multi_row_padding(input, output, input_row_list, padded_input_row_list)
+
+    def fused_multi_row_unpadding(
+        self,
+        input: torch.Tensor,
+        output: torch.Tensor,
+        input_row_list: List[int],
+        unpadded_input_row_list: List[int],
+    ) -> None:
+        tex = self._get_tex()
+        return tex.fused_multi_row_unpadding(input, output, input_row_list, unpadded_input_row_list)
+
+    # attention kernels
+    def fa_prepare_fwd(
+        self,
+        qkvi: torch.Tensor,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.fa_prepare_fwd(qkvi)
+
+    def fa_prepare_bwd(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.fa_prepare_bwd(q, k, v)
+
+    def fused_attn_fwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        is_training: bool,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        fake_dtype: torch.dtype,
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        page_table_k: Optional[torch.Tensor],
+        page_table_v: Optional[torch.Tensor],
+        s_quantizer: Any,
+        o_quantizer: Any,
+        Bias: Optional[torch.Tensor],
+        SoftmaxOffset: Optional[torch.Tensor],
+        rng_gen: Optional[torch.Generator],
+        rng_elts_per_thread: int,
+        return_max_logit: bool,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
+
+        return tex.fused_attn_fwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            is_training,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            fake_dtype,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            page_table_k,
+            page_table_v,
+            s_quantizer,
+            o_quantizer,
+            Bias,
+            SoftmaxOffset,
+            rng_gen,
+            rng_elts_per_thread,
+            return_max_logit,
+        )
+
+    def fused_attn_bwd(
+        self,
+        max_seqlen_q: int,
+        max_seqlen_kv: int,
+        attn_scale: float,
+        p_dropout: float,
+        set_zero: bool,
+        qkv_layout: NVTE_QKV_Layout,
+        bias_type: NVTE_Bias_Type,
+        attn_mask_type: NVTE_Mask_Type,
+        softmax_type: NVTE_Softmax_Type,
+        window_size: List[int],
+        deterministic: bool,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_kv: torch.Tensor,
+        Q: Any,
+        K: Any,
+        V: Any,
+        O: Any,
+        dO: Any,
+        fake_dtype: torch.dtype,
+        dqkv_type: DType,
+        Aux_CTX_Tensors: List[torch.Tensor],
+        cu_seqlens_q_padded: Optional[torch.Tensor],
+        cu_seqlens_kv_padded: Optional[torch.Tensor],
+        s_quantizer: Any,
+        dp_quantizer: Any,
+        dqkv_quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        qkv_layout = tex.NVTE_QKV_Layout(int(qkv_layout)) if qkv_layout is not None else None
+        bias_type = tex.NVTE_Bias_Type(int(bias_type)) if bias_type is not None else None
+        attn_mask_type = (
+            tex.NVTE_Mask_Type(int(attn_mask_type)) if attn_mask_type is not None else None
+        )
+        softmax_type = (
+            tex.NVTE_Softmax_Type(int(softmax_type)) if softmax_type is not None else None
+        )
+        dqkv_type = tex.DType(int(dqkv_type)) if dqkv_type is not None else None
+
+        return tex.fused_attn_bwd(
+            max_seqlen_q,
+            max_seqlen_kv,
+            attn_scale,
+            p_dropout,
+            set_zero,
+            qkv_layout,
+            bias_type,
+            attn_mask_type,
+            softmax_type,
+            window_size,
+            deterministic,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            Q,
+            K,
+            V,
+            O,
+            dO,
+            fake_dtype,
+            dqkv_type,
+            Aux_CTX_Tensors,
+            cu_seqlens_q_padded,
+            cu_seqlens_kv_padded,
+            s_quantizer,
+            dp_quantizer,
+            dqkv_quantizer,
+        )
+
+    def copy_to_kv_cache(
+        self,
+        new_k: torch.Tensor,
+        new_v: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        page_table: torch.Tensor,
+        cu_new_lens: torch.Tensor,
+        cu_cached_lens: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        b: int,
+        max_ctx_len: int,
+        max_seq_len: int,
+        max_pages_per_seq: int,
+        is_non_paged: bool,
+    ) -> None:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.copy_to_kv_cache(
+            new_k,
+            new_v,
+            k_cache,
+            v_cache,
+            page_table,
+            cu_new_lens,
+            cu_cached_lens,
+            qkv_format,
+            b,
+            max_ctx_len,
+            max_seq_len,
+            max_pages_per_seq,
+            is_non_paged,
+        )
+
+    def convert_thd_to_bshd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        b: int,
+        max_seq_len: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.convert_thd_to_bshd(tensor, cu_seqlens, b, max_seq_len)
+
+    def convert_bshd_to_thd(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        t: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.convert_bshd_to_thd(tensor, cu_seqlens, t)
+
+    # fused apply rope
+    def fused_rope_forward(
+        self,
+        input: torch.Tensor,
+        freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_forward(
+            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+        )
+
+    def fused_rope_backward(
+        self,
+        output_grads: torch.Tensor,
+        freqs: torch.Tensor,
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cu_seqlens: Optional[torch.Tensor],
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_rope_backward(
+            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+        )
+
+    def fused_qkv_rope_forward(
+        self,
+        qkv_input: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_forward(
+            qkv_input,
+            q_freqs,
+            k_freqs,
+            start_positions,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
+        )
+
+    def fused_qkv_rope_backward(
+        self,
+        q_grad_out: torch.Tensor,
+        k_grad_out: torch.Tensor,
+        v_grad_out: torch.Tensor,
+        q_freqs: torch.Tensor,
+        k_freqs: torch.Tensor,
+        qkv_split_arg_list: List[int],
+        qkv_format: NVTE_QKV_Format,
+        interleaved: bool,
+        cp_size: int,
+        cp_rank: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
+        return tex.fused_qkv_rope_backward(
+            q_grad_out,
+            k_grad_out,
+            v_grad_out,
+            q_freqs,
+            k_freqs,
+            qkv_split_arg_list,
+            qkv_format,
+            interleaved,
+            cp_size,
+            cp_rank,
+        )
+
+    # fused router
+    def fused_topk_with_score_function_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        num_groups: Optional[int],
+        group_topk: Optional[int],
+        scaling_factor: Optional[float],
+        score_function: str,
+        expert_bias: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_fwd(
+            logits,
+            topk,
+            use_pre_softmax,
+            num_groups,
+            group_topk,
+            scaling_factor,
+            score_function,
+            expert_bias,
+        )
+
+    def fused_topk_with_score_function_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        routing_map: torch.Tensor,
+        intermediate_output: torch.Tensor,
+        grad_probs: torch.Tensor,
+        topk: int,
+        use_pre_softmax: bool,
+        scaling_factor: Optional[float],
+        score_function: str,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.fused_topk_with_score_function_bwd(
+            num_tokens,
+            num_experts,
+            routing_map,
+            intermediate_output,
+            grad_probs,
+            topk,
+            use_pre_softmax,
+            scaling_factor,
+            score_function,
+        )
+
+    def fused_score_for_moe_aux_loss_fwd(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        score_function: str,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_fwd(
+            logits,
+            topk,
+            score_function,
+        )
+
+    def fused_score_for_moe_aux_loss_bwd(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        intermediate_output: torch.Tensor,
+        grad_scores: torch.Tensor,
+        topk: int,
+        score_function: str,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.fused_score_for_moe_aux_loss_bwd(
+            num_tokens,
+            num_experts,
+            intermediate_output,
+            grad_scores,
+            topk,
+            score_function,
+        )
+
+    def fused_moe_aux_loss_fwd(
+        self,
+        probs: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        total_num_tokens: int,
+        num_experts: int,
+        num_rows: int,
+        num_cols: int,
+        topk: int,
+        coeff: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_fwd(
+            probs,
+            tokens_per_expert,
+            total_num_tokens,
+            num_experts,
+            num_rows,
+            num_cols,
+            topk,
+            coeff,
+        )
+
+    def fused_moe_aux_loss_bwd(
+        self,
+        Const_buf: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        num_rows: int,
+        num_cols: int,
+        grad_aux_loss: torch.Tensor,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.fused_moe_aux_loss_bwd(
+            Const_buf, tokens_per_expert, num_rows, num_cols, grad_aux_loss
+        )
+
+    # Dropout
+    def dropout_fwd(
+        self,
+        input: torch.Tensor,
+        dropout_probability: float,
+        out: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.dropout_fwd(input, dropout_probability, out)
+
+    def dropout_bwd(
+        self,
+        grad_output: torch.Tensor,
+        mask: torch.Tensor,
+        dropout_probability: float,
+        grad_input: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.dropout_bwd(grad_output, mask, dropout_probability, grad_input)
+
+    # Misc
+    def get_cublasLt_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cublasLt_version()
+
+    def get_cudnn_version(self) -> int:
+        tex = self._get_tex()
+        return tex.get_cudnn_version()
+
+    def get_num_cublas_streams(self) -> int:
+        tex = self._get_tex()
+        return tex.get_num_cublas_streams()
+
+    # Support THD format for Context Parallel
+    def thd_read_half_tensor(
+        self,
+        tensor: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        half_idx: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.thd_read_half_tensor(tensor, cu_seqlens, half_idx)
+
+    def thd_second_half_lse_correction(
+        self,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.thd_second_half_lse_correction(lse, lse_per_step, cu_seqlens, lse_packed)
+
+    def thd_read_second_half_lse(
+        self,
+        lse: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        lse_packed: bool,
+        second_half_lse_seqlen: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.thd_read_second_half_lse(lse, cu_seqlens, lse_packed, second_half_lse_seqlen)
+
+    def thd_out_correction(
+        self,
+        out: torch.Tensor,
+        out_per_step: torch.Tensor,
+        lse: torch.Tensor,
+        lse_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        only_second_half: bool,
+        lse_packed: bool,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.thd_out_correction(
+            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
+        )
+
+    def thd_grad_correction(
+        self,
+        grad: torch.Tensor,
+        grad_per_step: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        first_half: str,
+        second_half: str,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.thd_grad_correction(grad, grad_per_step, cu_seqlens, first_half, second_half)
+
+    def thd_get_partitioned_indices(
+        self,
+        cu_seqlens: torch.Tensor,
+        total_tokens: int,
+        world_size: int,
+        rank: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, world_size, rank)
+
+    # nvshmem functions
+    def init_nvshmem_backend(
+        self,
+        process_group: Any,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.init_nvshmem_backend(process_group)
+
+    def create_nvshmem_tensor(
+        self,
+        shape: List[int],
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.create_nvshmem_tensor(shape, dtype)
+
+    def nvshmem_send_on_current_stream(
+        self,
+        src: torch.Tensor,
+        dst: torch.Tensor,
+        peer: int,
+        signal: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvshmem_send_on_current_stream(src, dst, peer, signal)
+
+    def nvshmem_wait_on_current_stream(
+        self,
+        signal: torch.Tensor,
+        wait_kind: str,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvshmem_wait_on_current_stream(signal, wait_kind)
+
+    def nvshmem_finalize(self) -> None:
+        tex = self._get_tex()
+        return tex.nvshmem_finalize()
+
+    # multi-tensor functions
+    def multi_tensor_scale(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
+
+    def multi_tensor_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.multi_tensor_l2norm(chunk_size, noop_flag, tensor_lists, per_tensor)
+
+    def multi_tensor_unscale_l2norm(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        inv_scale: torch.Tensor,
+        per_tensor: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tex = self._get_tex()
+        return tex.multi_tensor_unscale_l2norm(
+            chunk_size, noop_flag, tensor_lists, inv_scale, per_tensor
+        )
+
+    def multi_tensor_adam(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_adam(
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+        )
+
+    def multi_tensor_adam_param_remainder(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_adam_param_remainder(
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+        )
+
+    def multi_tensor_adam_fp8(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: float,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: int,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        fp8_dtype: DType,
+    ) -> None:
+        tex = self._get_tex()
+        fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
+        return tex.multi_tensor_adam_fp8(
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            fp8_dtype,
+        )
+
+    def multi_tensor_adam_capturable(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_adam_capturable(
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
+        )
+
+    def multi_tensor_adam_capturable_master(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        lr: torch.Tensor,
+        beta1: float,
+        beta2: float,
+        epsilon: float,
+        step: torch.Tensor,
+        mode: int,
+        bias_correction: int,
+        weight_decay: float,
+        inv_scale: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_adam_capturable_master(
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            lr,
+            beta1,
+            beta2,
+            epsilon,
+            step,
+            mode,
+            bias_correction,
+            weight_decay,
+            inv_scale,
+        )
+
+    def multi_tensor_sgd(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        wd: float,
+        momentum: float,
+        dampening: float,
+        lr: float,
+        nesterov: bool,
+        first_run: bool,
+        wd_after_momentum: bool,
+        scale: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_sgd(
+            chunk_size,
+            noop_flag,
+            tensor_lists,
+            wd,
+            momentum,
+            dampening,
+            lr,
+            nesterov,
+            first_run,
+            wd_after_momentum,
+            scale,
+        )
+
+    def multi_tensor_compute_scale_and_scale_inv(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        max_fp8: float,
+        force_pow_2_scales: bool,
+        epsilon: float,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_compute_scale_and_scale_inv(
+            chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
+        )
+
+    # Comm+GEMM Overlap
+    def bulk_overlap_ag_with_external_gemm(
+        self,
+        allgather_communicator: CommOverlap,
+        send_stream: Any,
+        recv_stream: Any,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.bulk_overlap_ag_with_external_gemm(
+            allgather_communicator, send_stream, recv_stream
+        )
+
+    ############## class func #################################
+    def get_flash_attention_class(self):
+        from .flash_attention import FlashAttentionMusa
+
+        return FlashAttentionMusa
+
+    def create_fp8_tensor_meta(self) -> FP8TensorMeta:
+        tex = self._get_tex()
+        return tex.FP8TensorMeta()
+
+    def create_comm_overlap_helper(
+        self,
+        world_group: Optional[Any] = None,
+        intra_node_group: Optional[Any] = None,
+    ) -> "CommOverlapHelper":
+        tex = self._get_tex()
+        return tex.CommOverlapHelper(world_group, intra_node_group)
+
+    def create_comm_overlap(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        num_splits: int = 3,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 2,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 16,
+        set_sm_margin: bool = True,
+        atomic_gemm: bool = False,
+        rs_overlap_first_gemm: bool = False,
+    ) -> "CommOverlap":
+        tex = self._get_tex()
+        return tex.CommOverlap(
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            num_splits,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            rs_overlap_first_gemm,
+        )
+
+    def create_comm_overlap_p2p(
+        self,
+        buffer_shape: List[int],
+        buffer_dtype: torch.dtype,
+        helper: Any,
+        tp_size: int,
+        comm_type: Any,
+        num_max_streams: int = 3,
+        comm_cga_size: int = 1,
+        gemm_priority: int = 0,
+        comm_priority: int = 0,
+        num_comm_sm: int = 1,
+        set_sm_margin: bool = False,
+        atomic_gemm: bool = False,
+        use_ce: bool = True,
+        aggregate: bool = False,
+    ) -> "CommOverlapP2P":
+        tex = self._get_tex()
+        return tex.CommOverlapP2P(
+            buffer_shape,
+            buffer_dtype,
+            helper,
+            tp_size,
+            comm_type,
+            num_max_streams,
+            comm_cga_size,
+            gemm_priority,
+            comm_priority,
+            num_comm_sm,
+            set_sm_margin,
+            atomic_gemm,
+            use_ce,
+            aggregate,
+        )
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/register_ops.py b/transformer_engine/plugin/core/backends/vendor/musa/register_ops.py
new file mode 100644
index 0000000000..7027188369
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/musa/register_ops.py
@@ -0,0 +1,955 @@
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""
+MUSA vendor backend operator registrations.
+
+This module registers all VENDOR (MUSA) implementations from transformer_engine_torch.
+"""
+
+from __future__ import annotations
+
+import functools
+
+from ....types import OpImpl, BackendImplKind
+
+
+def _bind_is_available(fn, is_available_fn):
+    """Wrap a function and bind _is_available attribute for OpImpl.is_available() check."""
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+
+    wrapper._is_available = is_available_fn
+    return wrapper
+
+
+def register_builtins(registry) -> None:
+    """
+    Register all MUSA (VENDOR) operator implementations.
+
+    Args:
+        registry: Registry to register into
+    """
+    # Import MUSA backend to get all the wrapped tex functions
+    from .musa import MUSABackend
+
+    # Create a backend instance to access the methods
+    backend = MUSABackend()
+
+    # Check if MUSA is available before registering
+    if not backend.is_available():
+        return
+
+    # Bind is_available to all methods
+    is_avail = backend.is_available
+
+    impls = [
+        # Normalization
+        OpImpl(
+            op_name="rmsnorm_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="rmsnorm_bwd_add",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.rmsnorm_bwd_add, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="layernorm_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.layernorm_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # GEMM
+        OpImpl(
+            op_name="generic_gemm",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.generic_gemm, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Quantization
+        OpImpl(
+            op_name="quantize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.quantize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dequantize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dequantize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_quantize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_quantize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="split_quantize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.split_quantize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Activations - Forward
+        OpImpl(
+            op_name="gelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.gelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="geglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.geglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="qgeglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.qgeglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="relu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.relu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="reglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.reglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="srelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.srelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="sreglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.sreglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="silu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.silu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swiglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swiglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_swiglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_swiglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Activations - Backward
+        OpImpl(
+            op_name="dgelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dgeglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dgeglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dqgeglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dqgeglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="drelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.drelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dreglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dreglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsrelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsrelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsreglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsreglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dsilu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dsilu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dswiglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dswiglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="clamped_dswiglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.clamped_dswiglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Activations - Bias + Backward
+        OpImpl(
+            op_name="dbias_dgelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dgelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsilu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsilu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_drelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_drelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dqgelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dqgelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dbias_dsrelu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dbias_dsrelu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Softmax
+        OpImpl(
+            op_name="scaled_softmax_forward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_forward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_softmax_backward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_softmax_backward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_forward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_backward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_forward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_forward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_upper_triang_masked_softmax_backward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_upper_triang_masked_softmax_backward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_forward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_forward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="scaled_aligned_causal_masked_softmax_backward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.scaled_aligned_causal_masked_softmax_backward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # MOE operations
+        OpImpl(
+            op_name="moe_permute_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_permute_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_permute_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="moe_unpermute_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.moe_unpermute_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Fused attention
+        OpImpl(
+            op_name="get_fused_attn_backend",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_fused_attn_backend, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_attn_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_attn_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fa_prepare_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fa_prepare_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # KV cache
+        OpImpl(
+            op_name="copy_to_kv_cache",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.copy_to_kv_cache, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Tensor format conversions
+        OpImpl(
+            op_name="convert_thd_to_bshd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_thd_to_bshd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_bshd_to_thd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_bshd_to_thd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # RoPE (Rotary Position Embedding)
+        OpImpl(
+            op_name="fused_rope_forward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_forward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_rope_backward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_rope_backward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_forward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_forward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_qkv_rope_backward",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_qkv_rope_backward, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # TopK and MOE aux loss
+        OpImpl(
+            op_name="fused_topk_with_score_function_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_topk_with_score_function_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_topk_with_score_function_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_score_for_moe_aux_loss_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_score_for_moe_aux_loss_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_moe_aux_loss_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_moe_aux_loss_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Dropout
+        OpImpl(
+            op_name="dropout_fwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_fwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="dropout_bwd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dropout_bwd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # FP8 operations
+        OpImpl(
+            op_name="fp8_transpose",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_transpose, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swap_first_dims",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swap_first_dims, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="compute_amax",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.compute_amax, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_amax_and_scale_update_after_reduction",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_amax_and_scale_update_after_reduction, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_compute_partial_amax",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_compute_partial_amax, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fp8_block_scaling_partial_cast",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fp8_block_scaling_partial_cast, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Padding operations
+        OpImpl(
+            op_name="fused_multi_row_padding",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_padding, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="fused_multi_row_unpadding",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.fused_multi_row_unpadding, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Library version getters
+        OpImpl(
+            op_name="get_cublasLt_version",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cublasLt_version, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_cudnn_version",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_cudnn_version, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_num_cublas_streams",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_num_cublas_streams, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # THD (Tensor, Hidden, Dimension) operations
+        OpImpl(
+            op_name="thd_read_half_tensor",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_half_tensor, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_second_half_lse_correction",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_second_half_lse_correction, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_read_second_half_lse",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_read_second_half_lse, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_out_correction",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_out_correction, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_grad_correction",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_grad_correction, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="thd_get_partitioned_indices",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.thd_get_partitioned_indices, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # NVSHMEM operations
+        OpImpl(
+            op_name="init_nvshmem_backend",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.init_nvshmem_backend, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_nvshmem_tensor",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_nvshmem_tensor, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_send_on_current_stream",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_send_on_current_stream, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_wait_on_current_stream",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_wait_on_current_stream, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvshmem_finalize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvshmem_finalize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Multi-tensor operations
+        OpImpl(
+            op_name="multi_tensor_quantize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_quantize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_scale",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_l2norm",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_l2norm, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_unscale_l2norm",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_unscale_l2norm, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_param_remainder",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_param_remainder, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_fp8",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_fp8, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_adam_capturable_master",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_adam_capturable_master, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_sgd",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_sgd, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_and_scale_inv",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_and_scale_inv, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Communication overlap operations
+        OpImpl(
+            op_name="bulk_overlap_ag_with_external_gemm",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bulk_overlap_ag_with_external_gemm, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_fp8_tensor_meta",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_fp8_tensor_meta, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_helper",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_helper, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="create_comm_overlap_p2p",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.create_comm_overlap_p2p, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # FlashAttention class getter
+        OpImpl(
+            op_name="get_flash_attention_class",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_flash_attention_class, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        # Attention backend selection
+        OpImpl(
+            op_name="get_attention_backend",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_attention_backend, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+    ]
+
+    registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/builtin_ops.py b/transformer_engine/plugin/core/builtin_ops.py
index c194a543f3..c991d4fc51 100644
--- a/transformer_engine/plugin/core/builtin_ops.py
+++ b/transformer_engine/plugin/core/builtin_ops.py
@@ -86,3 +86,12 @@ def register_builtins(registry: OpRegistry) -> None:
     except Exception as e:
         # Iluvatar may not be available, this is expected
         pass
+
+    # Register MUSA (VENDOR) implementations
+    try:
+        from .backends.vendor.musa.register_ops import register_builtins as register_musa
+
+        register_musa(registry)
+    except Exception as e:
+        # MUSA may not be available, this is expected
+        pass

From 4f54860a4a14d731da96850fd62177bf486115fa Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Mon, 23 Mar 2026 14:53:50 +0800
Subject: [PATCH 38/59] Add python-level patches to supporting multiple
 platforms (#49)

TE-FL Python-level now supports multiple platforms, including the
following two changes:

1. support for vendor-specific patches: vendors can now add their own
patches, e.g., patching ```torch.cuda``` to ```torch.musa```. For patch
implementation, please refer to
```transformer_engine/plugin/core/backends/vendor/musa/musa_patches.py```;
for patch integration, please refer to
```transformer_engine/__init__.py```.

2. abstraction of CUDA device references: files under
```transformer_engine/``` now abstract CUDA device-related code into
```te_device_type```. For example, ```torch.device("cuda")``` is now
replaced with ```torch.device(te_device_type)```.

3. Fix

- FlagOS Backend: ```get_num_cublas_stream``` and
```get_cudnn_version```
- Reference Backend: ```get_num_cublas_stream``` and
```scaled_mask_softmax_forward```
---
 transformer_engine/__init__.py                | 31 ++++++++
 .../debug/features/fake_quant.py              |  5 +-
 .../debug/features/log_fp8_tensor_stats.py    |  6 +-
 .../debug/features/per_tensor_scaling.py      |  6 +-
 .../debug/features/utils/stats_buffer.py      |  5 +-
 .../dot_product_attention/backends.py         |  7 +-
 .../plugin/core/backends/flagos/flagos.py     |  2 +-
 .../core/backends/flagos/register_ops.py      | 16 +++++
 .../core/backends/reference/impl/softmax.py   | 43 +++++++----
 .../core/backends/reference/reference.py      |  2 +-
 .../core/backends/vendor/musa/patches.py      | 72 +++++++++++++++++++
 .../dot_product_attention/backends.py         | 17 +++--
 .../dot_product_attention.py                  | 13 ++--
 .../dot_product_attention/softmax.py          |  3 +-
 .../attention/dot_product_attention/utils.py  | 32 ++++++---
 .../pytorch/attention/inference.py            |  3 +-
 .../pytorch/attention/multi_head_attention.py |  3 +-
 transformer_engine/pytorch/attention/rope.py  |  3 +-
 .../pytorch/cpp_extensions/gemm.py            |  6 +-
 transformer_engine/pytorch/distributed.py     | 18 ++---
 transformer_engine/pytorch/jit.py             | 20 ++++--
 transformer_engine/pytorch/module/base.py     | 26 ++++---
 .../pytorch/module/grouped_linear.py          |  5 +-
 .../pytorch/module/layernorm_mlp.py           |  7 +-
 .../pytorch/ops/basic/activation.py           |  6 +-
 .../pytorch/ops/basic/basic_linear.py         |  4 +-
 transformer_engine/pytorch/ops/basic/bias.py  |  5 +-
 .../fused/forward_linear_bias_activation.py   |  4 +-
 .../ops/fused/forward_linear_bias_add.py      |  6 +-
 .../ops/fused/forward_linear_scale_add.py     |  4 +-
 .../ops/fused/userbuffers_backward_linear.py  |  5 +-
 .../ops/fused/userbuffers_forward_linear.py   |  5 +-
 .../pytorch/optimizers/fused_adam.py          |  3 +-
 transformer_engine/pytorch/permutation.py     | 50 +++++++++----
 transformer_engine/pytorch/quantization.py    | 18 +++--
 .../pytorch/tensor/float8_blockwise_tensor.py |  7 +-
 .../pytorch/tensor/float8_tensor.py           |  7 +-
 .../pytorch/tensor/mxfp8_tensor.py            |  5 +-
 .../pytorch/tensor/nvfp4_tensor.py            |  7 +-
 transformer_engine/pytorch/transformer.py     |  3 +-
 .../pytorch/triton/permutation.py             | 27 +++----
 transformer_engine/pytorch/utils.py           | 21 +++---
 42 files changed, 399 insertions(+), 139 deletions(-)
 create mode 100644 transformer_engine/plugin/core/backends/vendor/musa/patches.py

diff --git a/transformer_engine/__init__.py b/transformer_engine/__init__.py
index e51f03e3d8..c3fb004659 100644
--- a/transformer_engine/__init__.py
+++ b/transformer_engine/__init__.py
@@ -10,6 +10,37 @@
 from importlib import metadata
 import transformer_engine.common
 
+import torch
+
+# Public, simple global (kept for backward compatibility).
+TE_DEVICE_TYPE = "cuda"
+TE_PLATFORM = torch.cuda
+
+# Apply MUSA (VENDOR) Patches, such as torch.cuda.device -> torch.musa.device
+try:
+    from .plugin.core.backends.vendor.musa.patches import apply_patch as _musa_apply_patch
+
+    _musa_apply_patch()
+    print("[TE-FL] MUSA patches applied")
+except Exception as e:
+    print(f"[TE-FL] MUSA patches not applied: {e}")
+    pass
+
+
+def te_device_type(default: str = "cuda") -> str:
+    try:
+        return TE_DEVICE_TYPE
+    except Exception:
+        return default
+
+
+def te_platform(default=torch.cuda):
+    try:
+        return TE_PLATFORM
+    except Exception:
+        return default
+
+
 try:
     from . import pytorch
 except ImportError:
diff --git a/transformer_engine/debug/features/fake_quant.py b/transformer_engine/debug/features/fake_quant.py
index 58c7379b5b..00c1096351 100644
--- a/transformer_engine/debug/features/fake_quant.py
+++ b/transformer_engine/debug/features/fake_quant.py
@@ -14,6 +14,7 @@
 
 
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.debug.features.api import TEConfigAPIMapper
 from transformer_engine.common.recipe import Format
 from transformer_engine.pytorch.tensor import Quantizer
@@ -30,7 +31,9 @@ def fake_quantize(tensor: torch.Tensor, fp8_format: tex.DType, out=None):
         torch.float16,
         torch.bfloat16,
     ), "[NVTORCH INSPECT ERROR] Unsupported tensor type."
-    assert tensor.is_cuda, "[NVTORCH INSPECT ERROR] Must be a GPU tensor."
+    assert (
+        tensor.device.type == te_device_type()
+    ), f"[NVTORCH INSPECT ERROR] Must be a {te_device_type()} tensor."
     assert fp8_format in {
         "FP8E4M3",
         "FP8E5M2",
diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index d09fb10579..290eb8c35d 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -14,6 +14,7 @@
 from nvdlfw_inspect.debug_features.log_tensor_stats import LogTensorStats as BaseLogTensorStats
 from nvdlfw_inspect.registry import Registry, api_method
 
+from transformer_engine import te_device_type
 from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS
 from transformer_engine.pytorch.tensor import Quantizer, QuantizedTensor
 from transformer_engine.pytorch.tensor.float8_tensor import (
@@ -47,7 +48,10 @@ def _get_new_quantizer(recipe_name, fp8_dtype):
         return Float8BlockQuantizer(fp8_dtype=fp8_dtype, rowwise=True, columnwise=True)
     if recipe_name == "fp8_current_scaling":
         return Float8CurrentScalingQuantizer(
-            fp8_dtype=fp8_dtype, device=torch.device("cuda"), rowwise=True, columnwise=True
+            fp8_dtype=fp8_dtype,
+            device=torch.device(te_device_type()),
+            rowwise=True,
+            columnwise=True,
         )
     if recipe_name == "mxfp8":
         return MXFP8Quantizer(fp8_dtype=fp8_dtype, rowwise=True, columnwise=True)
diff --git a/transformer_engine/debug/features/per_tensor_scaling.py b/transformer_engine/debug/features/per_tensor_scaling.py
index dd1f42cf06..10ee77a474 100644
--- a/transformer_engine/debug/features/per_tensor_scaling.py
+++ b/transformer_engine/debug/features/per_tensor_scaling.py
@@ -11,7 +11,9 @@
 import nvdlfw_inspect.api as debug_api
 from nvdlfw_inspect.registry import Registry, api_method
 
+
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.tensor import Quantizer
 from transformer_engine.pytorch.tensor.float8_tensor import (
     Float8Tensor,
@@ -33,7 +35,9 @@ def per_tensor_cast(
         torch.float16,
         torch.bfloat16,
     ), "[NVTORCH INSPECT ERROR] Unsupported tensor type for per tensor current scaling"
-    assert tensor.is_cuda, "[NVTORCH INSPECT ERROR] Must be a GPU tensor."
+    assert (
+        tensor.device.type == te_device_type()
+    ), f"[NVTORCH INSPECT ERROR] Must be a {te_device_type()} tensor."
     assert fp8_dtype in {
         tex.DType.kFloat8E4M3,
         tex.DType.kFloat8E5M2,
diff --git a/transformer_engine/debug/features/utils/stats_buffer.py b/transformer_engine/debug/features/utils/stats_buffer.py
index 20236fb950..e570443d5b 100644
--- a/transformer_engine/debug/features/utils/stats_buffer.py
+++ b/transformer_engine/debug/features/utils/stats_buffer.py
@@ -16,6 +16,7 @@
 from nvdlfw_inspect.utils import gather_along_first_dim
 from nvdlfw_inspect.logging import MetricLogger
 
+from transformer_engine import te_device_type
 from transformer_engine.debug.features.utils.stats_computation import (
     STATS,
     DEPENDENCIES,
@@ -41,14 +42,14 @@ def __init__(self, layer_name, tensor_name, stats, reduction_group, reduce_withi
         for stat in stats:
             self.stats_to_compute = self.stats_to_compute | DEPENDENCIES[stat]
 
-        self._buffer = torch.zeros(len(STATS), dtype=torch.float32).cuda()
+        self._buffer = torch.zeros(len(STATS), dtype=torch.float32).to(te_device_type())
         self._new_buffer = self._buffer.clone()
         self._tmp_buffer = self._buffer.clone()
 
         # in case of data parallelism it is possible that layer will not be run on one node
         # modified is set to True if node is run
         # we do not take not run nodes into account
-        self.modified = torch.tensor([False], dtype=torch.bool).cuda()
+        self.modified = torch.tensor([False], dtype=torch.bool).to(te_device_type())
         self.iteration = None
         self.skip_reduction = False
 
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index 8f2e9aeb41..f967dc54d8 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -9,6 +9,7 @@
 from packaging.version import Version as PkgVersion
 
 import torch
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.utils import (
     get_device_compute_capability,
 )
@@ -283,8 +284,10 @@ def _forward_impl(
             for x in [query_layer, key_layer, value_layer]
         ), "FLAttention only supports FP16 and BF16 data types, or Float8Tensors."
         assert (
-            query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
-        ), "FLAttention only supports CUDA tensors."
+            query_layer.device.type == te_device_type()
+            and key_layer.device.type == te_device_type()
+            and value_layer.device.type == te_device_type()
+        ), f"FLAttention only supports {te_device_type()} tensors."
         assert qkv_layout in QKVLayouts, f"FLAttention does not support qkv_layout = {qkv_layout}!"
 
         cp_size = 1
diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
index fd8a61f492..d33bcf1411 100644
--- a/transformer_engine/plugin/core/backends/flagos/flagos.py
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -243,7 +243,7 @@ def get_cudnn_version(self) -> int:
         return 90000
 
     def get_num_cublas_streams(self) -> int:
-        return 0
+        return 4  # keep consistent with transformer_engine/common/util/multi_stream.cpp, get_num_compute_streams()
 
     ############## class func #################################
     def get_flash_attention_class(self):
diff --git a/transformer_engine/plugin/core/backends/flagos/register_ops.py b/transformer_engine/plugin/core/backends/flagos/register_ops.py
index 0136b6a983..d744cdda41 100644
--- a/transformer_engine/plugin/core/backends/flagos/register_ops.py
+++ b/transformer_engine/plugin/core/backends/flagos/register_ops.py
@@ -124,6 +124,22 @@ def register_builtins(registry) -> None:
             vendor=None,
             priority=150,
         ),
+        OpImpl(
+            op_name="get_num_cublas_streams",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.get_num_cublas_streams, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="get_cudnn_version",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.get_cudnn_version, is_avail),
+            vendor=None,
+            priority=150,
+        ),
     ]
 
     registry.register_many(impls)
diff --git a/transformer_engine/plugin/core/backends/reference/impl/softmax.py b/transformer_engine/plugin/core/backends/reference/impl/softmax.py
index 1783ada92b..2689ab938a 100644
--- a/transformer_engine/plugin/core/backends/reference/impl/softmax.py
+++ b/transformer_engine/plugin/core/backends/reference/impl/softmax.py
@@ -44,20 +44,35 @@ def scaled_masked_softmax_forward_torch(
     mask: torch.Tensor,
     scale: float,
 ) -> torch.Tensor:
-    # Handle uint8 mask (CUDA format: 1=masked, 0=unmasked)
-    # Convert to additive mask (-10000 for masked positions, 0 for unmasked)
-    if mask.dtype == torch.uint8:
-        additive_mask = torch.zeros_like(input, dtype=input.dtype)
-        # Expand mask if needed (mask shape: batch, 1, seq_q, seq_k)
-        if mask.dim() == 4 and mask.size(1) == 1 and input.dim() == 4:
-            mask = mask.expand_as(input)
-        additive_mask = additive_mask.masked_fill(mask.bool(), -10000.0)
-    else:
-        additive_mask = mask
-
-    scaled_input = input * scale + additive_mask
-
-    return F.softmax(scaled_input, dim=-1)
+    """Reference forward matching TE CUDA `scaled_masked_softmax_warp_forward`.
+
+    Integer/bool mask (same as uint8 kernel contract):
+      - **Exactly** ``mask == 1`` means **masked** (logit set to ``-10000``, not ``input*scale`` offset).
+      - Any other value (typically 0) means **unmasked** (logit is ``input * scale``).
+
+    Floating mask: treated as **additive** bias in logit space (already scaled), added after
+    ``input * scale``.
+
+    Common pitfalls this avoids vs the old implementation:
+    1) ``input * scale + (-10000)`` on masked positions ≠ CUDA's plain ``-10000``.
+    2) Non-uint8 masks (bool, int) were used as direct addends → wrong (0/1 added to logits).
+    3) ``mask.bool()`` masks any nonzero byte; CUDA only masks when ``mask == 1``.
+    """
+    if mask.dim() == 4 and mask.size(1) == 1 and input.dim() == 4:
+        mask = mask.expand_as(input)
+
+    scaled = input * scale
+
+    if mask.is_floating_point():
+        scaled = scaled + mask.to(dtype=scaled.dtype)
+        return F.softmax(scaled, dim=-1)
+
+    # Integer / bool: align with CUDA (masked iff value == 1)
+    scaled = scaled.masked_fill(mask == 1, -10000.0)
+    # CUDA zeros output row when every position in the softmax dim is masked (max == -10000)
+    all_masked = (mask == 1).all(dim=-1, keepdim=True)
+    out = F.softmax(scaled, dim=-1)
+    return out.masked_fill(all_masked, 0.0)
 
 
 def scaled_masked_softmax_backward_torch(
diff --git a/transformer_engine/plugin/core/backends/reference/reference.py b/transformer_engine/plugin/core/backends/reference/reference.py
index 984d62022f..9755d85373 100644
--- a/transformer_engine/plugin/core/backends/reference/reference.py
+++ b/transformer_engine/plugin/core/backends/reference/reference.py
@@ -476,7 +476,7 @@ def get_cudnn_version(self) -> int:
         return 0
 
     def get_num_cublas_streams(self) -> int:
-        return 0
+        return 4  # keep consistent with transformer_engine/common/util/multi_stream.cpp, get_num_compute_streams()
 
     # Multi-tensor functions
     def multi_tensor_scale(
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/patches.py b/transformer_engine/plugin/core/backends/vendor/musa/patches.py
new file mode 100644
index 0000000000..220c5be03f
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/vendor/musa/patches.py
@@ -0,0 +1,72 @@
+"""Python-side compatibility patches for the MUSA vendor backend."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+
+import torch
+
+
+def _noop(*args, **kwargs):
+    return None
+
+
+# Patches: (parent_object, attribute_name, replacement_callable)
+_PATCH_CALLS: list[tuple[object, str, Callable[..., object]]] = [
+    # We do not recommend replace is_available, due to its device-related behavior.
+    # (torch.cuda, "is_available", torch.musa.is_available),
+    (torch.cuda, "get_device_properties", torch.musa.get_device_properties),
+    (torch.cuda, "device", torch.musa.device),
+    (torch.cuda, "current_device", torch.musa.current_device),
+    (torch.cuda, "synchronize", torch.musa.synchronize),
+    (torch.cuda, "is_current_stream_capturing", torch.musa.is_current_stream_capturing),
+    # TODO: Add NVTX patches for MUSA.
+    # NVTX is CUDA-specific; make it a no-op on MUSA.
+    (torch.cuda.nvtx, "range_push", _noop),
+    (torch.cuda.nvtx, "range_pop", _noop),
+    # TODO: Add other patches for MUSA.
+]
+
+
+def apply_patch() -> None:
+    """Apply MUSA Python-side patches (idempotent, best-effort)."""
+    try:
+        from .musa import MUSABackend
+
+        if not MUSABackend().is_available():
+            return
+    except Exception as e:
+        print(f"[TE-FL] MUSA backend not available: {e}")
+        # If backend availability can't be determined, don't patch.
+        return
+
+    # Mark TE global device type for Python-side callers.
+    # IMPORTANT: do not import `transformer_engine` here, because TE's `__init__.py`
+    # imports this module to run patches and that would cause a circular import.
+    try:
+        import transformer_engine
+
+        transformer_engine.TE_DEVICE_TYPE = "musa"
+        transformer_engine.TE_PLATFORM = torch.musa
+    except Exception as e:
+        print(f"[TE-FL Musa Patches] Error setting TE device type or platform: {e}")
+        # Best-effort: don't fail patching if we can't set the global.
+        pass
+
+    # Only patch when torch.musa exists and is usable.
+    if not hasattr(torch, "musa"):
+        return
+    try:
+        if not torch.musa.is_available():
+            return
+    except Exception:
+        return
+
+    for parent, attr, replacement in _PATCH_CALLS:
+        if not hasattr(parent, attr):
+            continue
+        try:
+            setattr(parent, attr, replacement)
+        except Exception:
+            # Best-effort: patching should never crash import/initialization.
+            continue
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/backends.py b/transformer_engine/pytorch/attention/dot_product_attention/backends.py
index 95558e30da..270e6a2ee8 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/backends.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/backends.py
@@ -15,6 +15,7 @@
 import torch
 import torch.nn.functional as F
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.utils import (
     get_device_compute_capability,
     split_tensor_along_dim,
@@ -387,10 +388,10 @@ def forward(
                 fp8_recipe = fp8_meta["local_recipes"][0]
             if fp8_recipe.float8_current_scaling():
                 S_quantizer = Float8CurrentScalingQuantizer(
-                    fp8_dtype=S_quantizer.dtype, device="cuda"
+                    fp8_dtype=S_quantizer.dtype, device=te_device_type()
                 )
                 dP_quantizer = Float8CurrentScalingQuantizer(
-                    fp8_dtype=dP_quantizer.dtype, device="cuda"
+                    fp8_dtype=dP_quantizer.dtype, device=te_device_type()
                 )
 
             if "2" in qkv_layout or "3" in qkv_layout:
@@ -676,8 +677,10 @@ def forward(
             for x in [query_layer, key_layer, value_layer]
         ), "FlashAttention only supports FP16 and BF16 data types, or Float8Tensors."
         assert (
-            query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
-        ), "FlashAttention currently only supports CUDA tensors."
+            query_layer.device.type == te_device_type()
+            and key_layer.device.type == te_device_type()
+            and value_layer.device.type == te_device_type()
+        ), f"FlashAttention currently only supports {te_device_type()} tensors."
         assert (
             qkv_layout in QKVLayouts
         ), f"FlashAttention does not support qkv_layout = {qkv_layout}!"
@@ -1738,8 +1741,10 @@ def forward(
             for x in [query_layer, key_layer, value_layer]
         ), "FusedAttention only supports FP16 and BF16 data types, or Float8Tensors."
         assert (
-            query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
-        ), "FusedAttention only supports CUDA tensors."
+            query_layer.device.type == te_device_type()
+            and key_layer.device.type == te_device_type()
+            and value_layer.device.type == te_device_type()
+        ), f"FusedAttention only supports {te_device_type()} tensors."
         assert (
             qkv_layout in QKVLayouts
         ), f"FusedAttention does not support qkv_layout = {qkv_layout}!"
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
index 4e5a79e668..8c96f66aaa 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py
@@ -14,6 +14,7 @@
 from torch.nn.parameter import Parameter
 
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import (
     Format,
     Recipe,
@@ -420,12 +421,14 @@ def __init__(
             self.softmax_offset = None
         if self.softmax_type == "off-by-one":
             self.softmax_offset = torch.zeros(
-                self.num_attention_heads // self.tp_size, device="cuda"
+                self.num_attention_heads // self.tp_size, device=te_device_type()
             )
         if self.softmax_type == "learnable":
             self.register_parameter(
                 "softmax_offset",
-                Parameter(torch.empty(self.num_attention_heads // self.tp_size, device="cuda")),
+                Parameter(
+                    torch.empty(self.num_attention_heads // self.tp_size, device=te_device_type())
+                ),
                 get_rng_state_tracker=get_rng_state_tracker,
             )
 
@@ -1026,8 +1029,10 @@ def forward(
 
             # checks for q/k/v shapes
             assert (
-                query_layer.is_cuda and key_layer.is_cuda and value_layer.is_cuda
-            ), "DotProductAttention only supports CUDA tensors."
+                query_layer.device.type == te_device_type()
+                and key_layer.device.type == te_device_type()
+                and value_layer.device.type == te_device_type()
+            ), f"DotProductAttention only supports {te_device_type()} tensors."
             assert (
                 query_layer.dtype == key_layer.dtype and query_layer.dtype == value_layer.dtype
             ), "Queries, keys and values must have the same data type!"
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/softmax.py b/transformer_engine/pytorch/attention/dot_product_attention/softmax.py
index df10fc7905..57e5d4f425 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/softmax.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/softmax.py
@@ -8,6 +8,7 @@
 import torch
 from torch import nn
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.export import is_in_onnx_export_mode
 
 
@@ -24,7 +25,7 @@ def _get_default_causal_mask(mask_type: str, sq: int, sk: int) -> torch.Tensor:
     def _get_mask():
         diagonal_offset = sk - sq + 1 if "bottom_right" in mask_type else 1
         return torch.triu(
-            torch.ones(sq, sk, dtype=torch.bool, device="cuda"), diagonal=diagonal_offset
+            torch.ones(sq, sk, dtype=torch.bool, device=te_device_type()), diagonal=diagonal_offset
         )
 
     if is_in_onnx_export_mode():
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
index 6bcc9f25da..ae36eb4160 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -21,6 +21,7 @@
 import torch.nn.functional as F
 import transformer_engine_torch as tex
 import transformer_engine as te
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.cpp_extensions.fused_attn import (
     QKVLayout,
     AttnBiasType,
@@ -1193,13 +1194,13 @@ def get_padding_mask(
                 ],
                 dim=0,
             )
-    attention_mask_q = attention_mask_q.to(device="cuda")
+    attention_mask_q = attention_mask_q.to(device=te_device_type())
     if attention_type == "self":
         attention_mask = attention_mask_q
     else:
         attention_mask = (
             attention_mask_q,
-            attention_mask_kv.to(device="cuda"),
+            attention_mask_kv.to(device=te_device_type()),
         )
     return attention_mask
 
@@ -1318,9 +1319,11 @@ def get_full_mask(
         actual_seqlens_kv = m[:, 0, 0, :].sum(dim=1)
 
     # apply SWA mask
-    mask = torch.arange(max_seqlen_q, dtype=torch.int32, device="cuda").view(
+    mask = torch.arange(max_seqlen_q, dtype=torch.int32, device=te_device_type()).view(
         1, 1, max_seqlen_q, 1
-    ) - torch.arange(max_seqlen_kv, dtype=torch.int32, device="cuda").view(1, 1, 1, max_seqlen_kv)
+    ) - torch.arange(max_seqlen_kv, dtype=torch.int32, device=te_device_type()).view(
+        1, 1, 1, max_seqlen_kv
+    )
     swa_left = None
     swa_right = None
     if attn_mask_type == "causal_bottom_right" or (
@@ -1416,7 +1419,7 @@ def get_alibi(
                 m_hat = torch.pow(m_hat_0, torch.arange(1, 1 + 2 * (num_heads - n), 2))
                 m = torch.cat([m, m_hat])
 
-            _alibi_cache["_alibi_slopes"] = m.to(dtype=torch.float32, device="cuda")
+            _alibi_cache["_alibi_slopes"] = m.to(dtype=torch.float32, device=te_device_type())
         _alibi_cache["_num_heads"] = num_heads
         _alibi_cache["_alibi_slopes_require_update"] = False
 
@@ -1429,9 +1432,9 @@ def get_alibi(
         else:
             raise ValueError("ALiBi slopes cannot exceed 2 dimensions.")
 
-        bias = torch.arange(max_seqlen_q, dtype=torch.int32, device="cuda").view(
+        bias = torch.arange(max_seqlen_q, dtype=torch.int32, device=te_device_type()).view(
             1, 1, max_seqlen_q, 1
-        ) - torch.arange(max_seqlen_kv, dtype=torch.int32, device="cuda").view(
+        ) - torch.arange(max_seqlen_kv, dtype=torch.int32, device=te_device_type()).view(
             1, 1, 1, max_seqlen_kv
         )
         if actual_seqlens_q is None and actual_seqlens_kv is None:
@@ -1451,7 +1454,9 @@ def get_alibi(
         _alibi_cache["_max_seqlen_q"], _alibi_cache["_max_seqlen_kv"] = max_seqlen_q, max_seqlen_kv
         _alibi_cache["_bottom_right_alignment"] = bottom_right_alignment
         bias_dtype = torch.float32 if bias_dtype is None else bias_dtype
-        _alibi_cache["_alibi_bias"] = bias.contiguous().to(dtype=bias_dtype, device="cuda")
+        _alibi_cache["_alibi_bias"] = bias.contiguous().to(
+            dtype=bias_dtype, device=te_device_type()
+        )
         _alibi_cache["_alibi_bias_require_update"] = False
 
     return _alibi_cache["_alibi_slopes"], _alibi_cache["_alibi_bias"]
@@ -1466,7 +1471,7 @@ def get_cu_seqlens(mask: torch.Tensor) -> torch.Tensor:
     mask = mask.squeeze(1).squeeze(1)
     reduced_mask = mask.logical_not().sum(dim=1)
     cu_seqlens = reduced_mask.cumsum(dim=0).to(torch.int32)
-    zero = torch.zeros(1, dtype=torch.int32, device="cuda")
+    zero = torch.zeros(1, dtype=torch.int32, device=te_device_type())
     cu_seqlens = torch.cat((zero, cu_seqlens))
 
     return cu_seqlens
@@ -1484,7 +1489,7 @@ def get_cu_seqlens_and_indices(mask: torch.Tensor) -> Tuple[torch.Tensor, torch.
 
     reduced_mask = mask.logical_not().sum(dim=1)
     cu_seqlens = reduced_mask.cumsum(dim=0).to(torch.int32)
-    zero = torch.zeros(1, dtype=torch.int32, device="cuda")
+    zero = torch.zeros(1, dtype=torch.int32, device=te_device_type())
     cu_seqlens = torch.cat((zero, cu_seqlens))
 
     mask = mask.reshape(-1)
@@ -1509,7 +1514,12 @@ def get_indices(max_seqlen: int, cu_seqlens: torch.Tensor) -> torch.Tensor:
     bs = len(cu_seqlens) - 1
     seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
     indices = [i * max_seqlen + ii for i, j in enumerate(seqlens) for ii in range(j)]
-    indices = torch.Tensor(indices).unsqueeze(1).unsqueeze(1).to(dtype=torch.int64, device="cuda")
+    indices = (
+        torch.Tensor(indices)
+        .unsqueeze(1)
+        .unsqueeze(1)
+        .to(dtype=torch.int64, device=te_device_type())
+    )
 
     num_nonzeros = indices.shape[0]
     pad_amount = bs * max_seqlen - num_nonzeros
diff --git a/transformer_engine/pytorch/attention/inference.py b/transformer_engine/pytorch/attention/inference.py
index f0ef8d0bd5..fabc491835 100644
--- a/transformer_engine/pytorch/attention/inference.py
+++ b/transformer_engine/pytorch/attention/inference.py
@@ -11,6 +11,7 @@
 import torch
 
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.cpp_extensions.fused_attn import QKVFormat
 
 __all__ = ["InferenceParams", "KVCacheManager", "NonPagedKVCacheManager", "PagedKVCacheManager"]
@@ -626,7 +627,7 @@ def __init__(
         self.allocated_pages = defaultdict(list)
         # page table, [batch_size, max_pages_per_seq]
         self.page_table = torch.zeros(
-            self.max_batch_size, self.max_pages_per_seq, dtype=torch.int32, device="cuda"
+            self.max_batch_size, self.max_pages_per_seq, dtype=torch.int32, device=te_device_type()
         )
 
     def reset(self):
diff --git a/transformer_engine/pytorch/attention/multi_head_attention.py b/transformer_engine/pytorch/attention/multi_head_attention.py
index b3bda677bb..54c9beb653 100644
--- a/transformer_engine/pytorch/attention/multi_head_attention.py
+++ b/transformer_engine/pytorch/attention/multi_head_attention.py
@@ -8,6 +8,7 @@
 from typing import Callable, List, Optional, Tuple, Union
 import torch
 
+from transformer_engine import te_device_type
 from transformer_engine.debug.pytorch.debug_state import TEDebugState
 from transformer_engine.pytorch.quantization import FP8GlobalStateManager
 from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor
@@ -255,7 +256,7 @@ def __init__(
         ub_bulk_wgrad: bool = False,
         bias: bool = True,
         normalization: str = "LayerNorm",
-        device: Union[torch.device, str] = "cuda",
+        device: Union[torch.device, str] = te_device_type(),
         qkv_format: str = "sbhd",
         name: str = None,
         qk_norm_type: Optional[str] = None,
diff --git a/transformer_engine/pytorch/attention/rope.py b/transformer_engine/pytorch/attention/rope.py
index cc23d65a3e..bbd5221381 100644
--- a/transformer_engine/pytorch/attention/rope.py
+++ b/transformer_engine/pytorch/attention/rope.py
@@ -9,6 +9,7 @@
 import torch
 
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.cpp_extensions.fused_attn import QKVFormat
 
 
@@ -76,7 +77,7 @@ def forward(self, max_seq_len: int, offset: int = 0):
         offset: int, default = 0
             Fixed offset for frequencies.
         """
-        with torch.autocast(enabled=False, device_type="cuda"):
+        with torch.autocast(enabled=False, device_type=te_device_type()):
             seq = (
                 torch.arange(max_seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
                 + offset
diff --git a/transformer_engine/pytorch/cpp_extensions/gemm.py b/transformer_engine/pytorch/cpp_extensions/gemm.py
index a45fafb68a..68c2c20cca 100644
--- a/transformer_engine/pytorch/cpp_extensions/gemm.py
+++ b/transformer_engine/pytorch/cpp_extensions/gemm.py
@@ -8,6 +8,9 @@
 import os
 import torch
 import transformer_engine_torch as tex
+
+from transformer_engine import te_device_type
+
 from ..constants import TE_DType
 from ..utils import get_sm_count, _empty_tensor
 
@@ -189,7 +192,8 @@ def general_grouped_gemm(
     sm_count = get_sm_count()
     if grad and use_bias:
         grad_bias = [
-            torch.empty(B[i].shape[1], dtype=out[0].dtype, device="cuda") for i in range(num_gemms)
+            torch.empty(B[i].shape[1], dtype=out[0].dtype, device=te_device_type())
+            for i in range(num_gemms)
         ]
     else:
         grad_bias = empty_tensors
diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py
index 5ed73f6783..904f308d1d 100644
--- a/transformer_engine/pytorch/distributed.py
+++ b/transformer_engine/pytorch/distributed.py
@@ -29,6 +29,8 @@
 
 import transformer_engine_torch as tex
 
+from transformer_engine import te_device_type
+
 from . import torch_version
 from .utils import (
     is_non_tn_fp8_gemm_supported,
@@ -90,7 +92,7 @@ def graph_safe_rng_available() -> bool:
 
 
 def _get_cuda_rng_state(
-    device: Union[int, str, torch.device] = "cuda",
+    device: Union[int, str, torch.device] = te_device_type(),
     clone: bool = False,
     graph_safe: bool = True,
 ) -> torch.Tensor:
@@ -100,7 +102,7 @@ def _get_cuda_rng_state(
     if isinstance(device, str):
         device = torch.device(device)
     elif isinstance(device, int):
-        device = torch.device("cuda", device)
+        device = torch.device(te_device_type(), device)
     idx = device.index
     if idx is None:
         idx = torch.cuda.current_device()
@@ -122,11 +124,11 @@ def _set_cuda_rng_state(
     """Sets the random number generator state of the current GPU."""
 
     if device == -1:
-        device = torch.device("cuda")
+        device = torch.device(te_device_type())
     elif isinstance(device, str):
         device = torch.device(device)
     elif isinstance(device, int):
-        device = torch.device("cuda", device)
+        device = torch.device(te_device_type(), device)
 
     def cb() -> None:
         idx = device.index
@@ -280,10 +282,10 @@ def _get_active_autocast_contexts():
     autocast_cached = torch.is_autocast_cache_enabled()
 
     if torch_version() >= (2, 4, 0):
-        gpu_autocast_enabled = torch.is_autocast_enabled("cuda")
-        gpu_autocast_dtype = torch.get_autocast_dtype("cuda")
+        gpu_autocast_enabled = torch.is_autocast_enabled(te_device_type())
+        gpu_autocast_dtype = torch.get_autocast_dtype(te_device_type())
         gpu_autocast_ctx = torch.amp.autocast(
-            "cuda",
+            te_device_type(),
             enabled=gpu_autocast_enabled,
             dtype=gpu_autocast_dtype,
             cache_enabled=autocast_cached,
@@ -943,7 +945,7 @@ def _all_gather_fp8(
     out: Float8TensorStorage
     if quantizer is not None:
         dtype = torch.float32
-        device = "cuda"
+        device = te_device_type()
         if isinstance(inp, Float8Tensor):
             dtype = inp.dtype
             device = inp.device
diff --git a/transformer_engine/pytorch/jit.py b/transformer_engine/pytorch/jit.py
index f0f77621e5..32a8deaf45 100644
--- a/transformer_engine/pytorch/jit.py
+++ b/transformer_engine/pytorch/jit.py
@@ -3,11 +3,15 @@
 # See LICENSE for license information.
 
 """NVFuser functions and JIT utilities"""
+
+# pylint: disable=ungrouped-imports
+
 import os
 from functools import wraps
 from typing import Callable, Optional, Tuple
 import torch
 
+from transformer_engine import te_device_type
 from . import torch_version
 from .export import is_in_onnx_export_mode
 from .utils import gpu_autocast_ctx
@@ -277,9 +281,13 @@ def warmup_jit_bias_dropout_add(
     # Save cuda RNG state to ensure warmup does not affect reproducibility.
     rng_state = torch.cuda.get_rng_state()
 
-    inp = torch.rand((seq_length, micro_batch_size, hidden_size), dtype=dtype, device="cuda")
-    residual = torch.rand((seq_length, micro_batch_size, hidden_size), dtype=dtype, device="cuda")
-    bias = torch.rand((hidden_size), dtype=dtype, device="cuda")
+    inp = torch.rand(
+        (seq_length, micro_batch_size, hidden_size), dtype=dtype, device=te_device_type()
+    )
+    residual = torch.rand(
+        (seq_length, micro_batch_size, hidden_size), dtype=dtype, device=te_device_type()
+    )
+    bias = torch.rand((hidden_size), dtype=dtype, device=te_device_type())
     dropout_rate = 0.1
     # Warmup JIT fusions with the input grad_enable state of both forward
     # prop and recomputation
@@ -314,11 +322,11 @@ def warmup_jit_bias_gelu(
     # Save cuda RNG state to ensure warmup does not affect reproducibility.
     rng_state = torch.cuda.get_rng_state()
 
-    bias = torch.rand(ffn_hidden_size_per_partition, dtype=dtype, device="cuda")
+    bias = torch.rand(ffn_hidden_size_per_partition, dtype=dtype, device=te_device_type())
     inp = torch.rand(
         (seq_length * micro_batch_size, ffn_hidden_size_per_partition),
         dtype=dtype,
-        device="cuda",
+        device=te_device_type(),
     )
     # Warmup JIT fusions with the input grad_enable state of both forward
     # prop and recomputation
@@ -352,7 +360,7 @@ def warmup_jit_l2normalization(
     inp = torch.rand(
         (seq_length * micro_batch_size, hidden_size),
         dtype=dtype,
-        device="cuda",
+        device=te_device_type(),
     )
     eps = 1e-6
     # Warmup JIT fusions with the input grad_enable state of both forward
diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py
index d16455b5b4..06d0de5072 100644
--- a/transformer_engine/pytorch/module/base.py
+++ b/transformer_engine/pytorch/module/base.py
@@ -19,8 +19,10 @@
 import torch.nn.functional as F
 
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type, te_platform
 from transformer_engine.common.recipe import Recipe
 
+
 from ._common import _ParameterInitMeta, noop_cat
 from ..quantization import (
     MXFP8BlockScalingRecipeState,
@@ -87,7 +89,9 @@ def get_workspace() -> torch.Tensor:
     global _cublas_workspace
     if _cublas_workspace is None:
         _cublas_workspace = torch.empty(
-            get_cublas_workspace_size_bytes(), dtype=torch.uint8, device="cuda"
+            get_cublas_workspace_size_bytes(),
+            dtype=torch.uint8,
+            device=te_device_type(),
         )
     return _cublas_workspace
 
@@ -98,7 +102,9 @@ def get_multi_stream_cublas_workspace() -> List[torch.Tensor]:
     if not _multi_stream_cublas_workspace:
         for _ in range(tex.get_num_cublas_streams()):
             _multi_stream_cublas_workspace.append(
-                torch.empty(get_cublas_workspace_size_bytes(), dtype=torch.uint8, device="cuda")
+                torch.empty(
+                    get_cublas_workspace_size_bytes(), dtype=torch.uint8, device=te_device_type()
+                )
             )
     return _multi_stream_cublas_workspace
 
@@ -111,7 +117,7 @@ def get_dummy_wgrad(shape: list, dtype: torch.dtype, zero=False) -> torch.Tensor
         _dummy_wgrads[(shape[0], shape[1], dtype)] = torch.empty(
             shape,
             dtype=dtype,
-            device="cuda",
+            device=te_device_type(),
             requires_grad=False,
         )
     if zero:
@@ -282,7 +288,9 @@ def initialize_ub(
     elif _cublas_workspace.numel() != get_cublas_workspace_size_bytes() * _NUM_MAX_UB_STREAMS:
         # This ensures we don't do `.repeat()` on an already expanded workspace
         _cublas_workspace = torch.empty(
-            get_cublas_workspace_size_bytes(), dtype=torch.uint8, device="cuda"
+            get_cublas_workspace_size_bytes(),
+            dtype=torch.uint8,
+            device=te_device_type(),
         ).repeat(_NUM_MAX_UB_STREAMS)
 
     # Default buffer precision: AllGather buffers use fp8 when using fp8 recipe
@@ -640,7 +648,7 @@ class TransformerEngineBaseModule(torch.nn.Module, ABC):
 
     def __init__(self) -> None:
         super().__init__()
-        assert torch.cuda.is_available(), "TransformerEngine needs CUDA."
+        assert te_platform().is_available(), f"TransformerEngine needs {te_device_type()}."
         self.name = None
         self.next_iter_when_debug_should_be_run = 0
         self.fp8_initialized = False
@@ -917,7 +925,7 @@ def set_extra_state(self, state: torch.Tensor) -> None:
         elif isinstance(state, io.BytesIO):
             # Deprecated format with io.BytesIO
             state.seek(0)
-            state = torch.load(state, map_location="cuda")
+            state = torch.load(state, map_location=te_device_type())
         else:
             raise RuntimeError("Unsupported checkpoint format.")
 
@@ -1080,7 +1088,9 @@ def prepare_forward(
         if self.fp8 and in_fp8_activation_recompute_phase():
             FP8GlobalStateManager.get_old_fp8_meta_tensors_for_recompute(self.fp8_meta)
         else:
-            assert inp.is_cuda, "TransformerEngine needs CUDA."
+            assert (
+                inp.device.type == te_device_type()
+            ), f"TransformerEngine needs {te_device_type()}."
 
             if self.tp_size > 1:
                 assert self.tp_group_initialized, "TP group not initialized."
@@ -1257,7 +1267,7 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None:
         for name, param in self.named_parameters(recurse=False):
             # Ensure parameter is on a real device
             if param.device == torch.device("meta"):
-                param = torch.empty_like(param, device="cuda")
+                param = torch.empty_like(param, device=te_device_type())
 
             # Initialize the parameter values on device
             init_fn = self.param_init_meta[name].init_fn
diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py
index a5bf21ee17..9de94f0ec9 100644
--- a/transformer_engine/pytorch/module/grouped_linear.py
+++ b/transformer_engine/pytorch/module/grouped_linear.py
@@ -11,7 +11,10 @@
 
 import transformer_engine_torch as tex
 
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import Recipe
+
+
 from .base import (
     get_multi_stream_cublas_workspace,
     TransformerEngineBaseModule,
@@ -581,7 +584,7 @@ def __init__(
         return_bias: bool = False,
         params_dtype: Optional[torch.dtype] = None,
         parallel_mode: Optional[str] = None,
-        device: Union[torch.device, str] = "cuda",
+        device: Union[torch.device, str] = te_device_type(),
         ub_overlap_rs: bool = False,
         ub_overlap_ag: bool = False,
         ub_name: Optional[str] = None,
diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py
index a2ddb970af..60db65b0e0 100644
--- a/transformer_engine/pytorch/module/layernorm_mlp.py
+++ b/transformer_engine/pytorch/module/layernorm_mlp.py
@@ -15,9 +15,12 @@
 
 import transformer_engine_torch as tex
 
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import Recipe
 from transformer_engine.pytorch import torch_version
 from transformer_engine.pytorch.tensor.utils import is_experimental
+
+
 from .base import (
     fill_userbuffers_buffer_for_all_gather,
     get_workspace,
@@ -1098,7 +1101,7 @@ def fc2_wgrad_gemm(
             reduce_scatter_out = None
             if ctx.ub_overlap_rs_dgrad:
                 reduce_scatter_out = torch.empty(
-                    fc1_dgrad_shape, dtype=ctx.activation_dtype, device="cuda"
+                    fc1_dgrad_shape, dtype=ctx.activation_dtype, device=te_device_type()
                 )
             if ctx.ub_bulk_wgrad:
                 gemm_out = ub_obj_fc1_wgrad.get_buffer(local_chunk=False)
@@ -1181,7 +1184,7 @@ def fc2_wgrad_gemm(
                 reduce_scatter_out = None
                 if ctx.ub_bulk_wgrad and ub_obj_fc1_wgrad.is_fp8_ubuf():
                     reduce_scatter_out = torch.empty(
-                        fc1_dgrad_shape, dtype=ctx.activation_dtype, device="cuda"
+                        fc1_dgrad_shape, dtype=ctx.activation_dtype, device=te_device_type()
                     )
 
                 # Arguments to include in wgrad GEMM closure
diff --git a/transformer_engine/pytorch/ops/basic/activation.py b/transformer_engine/pytorch/ops/basic/activation.py
index 8a754c6382..5aa0bc03c0 100644
--- a/transformer_engine/pytorch/ops/basic/activation.py
+++ b/transformer_engine/pytorch/ops/basic/activation.py
@@ -11,12 +11,16 @@
 import torch
 
 import transformer_engine_torch as tex
+
+from transformer_engine import te_device_type
+
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...tensor.float8_tensor import Float8CurrentScalingQuantizer, Quantizer
 from ...utils import clear_tensor_data
 from ..op import BasicOperation, OperationContext
 from .._common import maybe_dequantize
 
+
 __all__ = [
     "GELU",
     "GEGLU",
@@ -92,7 +96,7 @@ def op_forward(
         # Compute dtype
         dtype: torch.dtype
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = input_.dtype
         if dtype not in (torch.float32, torch.float16, torch.bfloat16):
diff --git a/transformer_engine/pytorch/ops/basic/basic_linear.py b/transformer_engine/pytorch/ops/basic/basic_linear.py
index 432d8c134b..18951a316e 100644
--- a/transformer_engine/pytorch/ops/basic/basic_linear.py
+++ b/transformer_engine/pytorch/ops/basic/basic_linear.py
@@ -12,6 +12,8 @@
 
 import torch
 
+from transformer_engine import te_device_type
+
 from ...cpp_extensions import general_gemm
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...distributed import (
@@ -967,7 +969,7 @@ def op_forward(
 
         # Get autocast dtype if needed
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = self.weight.dtype
 
diff --git a/transformer_engine/pytorch/ops/basic/bias.py b/transformer_engine/pytorch/ops/basic/bias.py
index 5ec0d2ce5e..e773c35197 100644
--- a/transformer_engine/pytorch/ops/basic/bias.py
+++ b/transformer_engine/pytorch/ops/basic/bias.py
@@ -10,6 +10,9 @@
 import torch
 
 import transformer_engine_torch as tex
+
+from transformer_engine import te_device_type
+
 from ..op import BasicOperation, OperationContext
 from ...utils import canonicalize_device, canonicalize_dtype
 from ...tensor import Quantizer
@@ -94,7 +97,7 @@ def reset_parameters(self) -> None:
 
         # Make sure parameter is initialized
         bias = self.bias
-        if bias.device.type != "cuda":
+        if bias.device.type != te_device_type():
             bias = torch.empty_like(bias, device=self.device)
         else:
             bias = bias.to(device=self.device)
diff --git a/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py b/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py
index 74bd3d1b32..90a16b1d9d 100644
--- a/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py
+++ b/transformer_engine/pytorch/ops/fused/forward_linear_bias_activation.py
@@ -10,6 +10,8 @@
 
 import torch
 
+from transformer_engine import te_device_type
+
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...quantization import FP8GlobalStateManager
 from ...tensor import Quantizer
@@ -95,7 +97,7 @@ def fuser_forward(
 
         # Get autocast dtype if needed
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = linear_op.weight.dtype
 
diff --git a/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py b/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py
index 6d5d553391..ab6c2a61b5 100644
--- a/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py
+++ b/transformer_engine/pytorch/ops/fused/forward_linear_bias_add.py
@@ -10,6 +10,10 @@
 
 import torch
 
+
+from transformer_engine import te_device_type
+
+
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...quantization import FP8GlobalStateManager
 from ...tensor import Quantizer
@@ -89,7 +93,7 @@ def fuser_forward(
 
         # Get autocast dtype if needed
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = linear_op.weight.dtype
 
diff --git a/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py b/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py
index 24788bcdfb..bfcc1c3f3c 100644
--- a/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py
+++ b/transformer_engine/pytorch/ops/fused/forward_linear_scale_add.py
@@ -10,6 +10,8 @@
 
 import torch
 
+from transformer_engine import te_device_type
+
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...quantization import FP8GlobalStateManager
 from ...tensor import Quantizer
@@ -71,7 +73,7 @@ def fuser_forward(
 
         # Get autocast dtype if needed
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = linear_op.weight.dtype
 
diff --git a/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py b/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py
index d95b2298fe..0759abbc0c 100644
--- a/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py
+++ b/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py
@@ -11,6 +11,9 @@
 import torch
 
 from transformer_engine_torch import CommOverlapType, bulk_overlap_ag_with_external_gemm
+
+from transformer_engine import te_device_type
+
 from ...cpp_extensions import general_gemm
 from ...distributed import get_distributed_world_size
 from ...module.base import (
@@ -176,7 +179,7 @@ def _functional_backward(
             else:
                 device = grad_output.device
         device = canonicalize_device(device)
-        if device.type != "cuda":
+        if device.type != te_device_type():
             raise ValueError(f"Only CUDA devices are supported (got {device})")
 
         # Check datatype
diff --git a/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py b/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py
index e20de53da3..08e7d92d42 100644
--- a/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py
+++ b/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py
@@ -11,6 +11,7 @@
 import torch
 
 from transformer_engine_torch import CommOverlapType
+from transformer_engine import te_device_type
 from ...cpp_extensions import general_gemm
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...distributed import get_distributed_world_size
@@ -156,7 +157,7 @@ def _functional_forward(
         """
 
         # Check device
-        if device.type != "cuda":
+        if device.type != te_device_type():
             raise ValueError(f"Only CUDA devices are supported (got {device})")
 
         # Check datatype
@@ -322,7 +323,7 @@ def fuser_forward(
 
         # Get autocast dtype if needed
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = linear_op.weight.dtype
 
diff --git a/transformer_engine/pytorch/optimizers/fused_adam.py b/transformer_engine/pytorch/optimizers/fused_adam.py
index 18f7e2031a..3d71f4ff63 100644
--- a/transformer_engine/pytorch/optimizers/fused_adam.py
+++ b/transformer_engine/pytorch/optimizers/fused_adam.py
@@ -12,6 +12,7 @@
 
 import torch
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer
 from .multi_tensor_apply import multi_tensor_applier
 
@@ -178,7 +179,7 @@ def __init__(
             self._step_supports_amp_scaling = True
 
         # Skip buffer
-        self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda")
+        self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=te_device_type())
         self.multi_tensor_adam = tex.multi_tensor_adam
         self.multi_tensor_adam_param_remainder = tex.multi_tensor_adam_param_remainder
         self.multi_tensor_adam_fp8 = tex.multi_tensor_adam_fp8
diff --git a/transformer_engine/pytorch/permutation.py b/transformer_engine/pytorch/permutation.py
index ea3e67a57c..23dbbf3598 100644
--- a/transformer_engine/pytorch/permutation.py
+++ b/transformer_engine/pytorch/permutation.py
@@ -8,6 +8,7 @@
 import torch
 
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 import transformer_engine.pytorch.triton.permutation as triton_permutation
 from transformer_engine.pytorch.constants import TE_DType
 from transformer_engine.pytorch.tensor.quantized_tensor import QuantizedTensor
@@ -15,6 +16,7 @@
 from transformer_engine.pytorch.tensor.float8_blockwise_tensor import Float8BlockwiseQTensor
 from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Tensor
 
+
 __all__ = [
     "moe_permute",
     "moe_unpermute",
@@ -42,8 +44,8 @@ def forward(
             return inp, torch.tensor([], device=inp.device)
 
         # Device check
-        assert inp.is_cuda, "TransformerEngine needs CUDA."
-        assert index.is_cuda, "TransformerEngine needs CUDA."
+        assert inp.device.type == te_device_type(), f"TransformerEngine needs {te_device_type()}."
+        assert index.device.type == te_device_type(), f"TransformerEngine needs {te_device_type()}."
         # Shape check
         assert inp.size(0) == index.size(0), "Permute not possible"
 
@@ -119,7 +121,9 @@ def forward(
 
         # None probs check
         if probs is not None:
-            assert probs.is_cuda, "TransformerEngine needs CUDA."
+            assert (
+                probs.device.type == te_device_type()
+            ), f"TransformerEngine needs {te_device_type()}."
 
             if probs.dtype != torch.float32:
                 warnings.warn(
@@ -136,8 +140,10 @@ def forward(
             probs = torch.empty(0)
 
         # Device check
-        assert inp.is_cuda, "TransformerEngine needs CUDA."
-        assert row_id_map.is_cuda, "TransformerEngine needs CUDA."
+        assert inp.device.type == te_device_type(), f"TransformerEngine needs {te_device_type()}."
+        assert (
+            row_id_map.device.type == te_device_type()
+        ), f"TransformerEngine needs {te_device_type()}."
 
         # Data type check
         dtype = TE_DType[inp.dtype]
@@ -197,10 +203,14 @@ def forward(
             ctx.probs = probs
             return inp, torch.tensor([], device=inp.device), torch.tensor([], device=inp.device)
 
-        assert inp.is_cuda, "TransformerEngine needs CUDA."
-        assert routing_map.is_cuda, "TransformerEngine needs CUDA."
+        assert inp.device.type == te_device_type(), f"TransformerEngine needs {te_device_type()}."
+        assert (
+            routing_map.device.type == te_device_type()
+        ), f"TransformerEngine needs {te_device_type()}."
         if probs is not None:
-            assert probs.is_cuda, "TransformerEngine needs CUDA."
+            assert (
+                probs.device.type == te_device_type()
+            ), f"TransformerEngine needs {te_device_type()}."
 
         assert inp.size(0) == routing_map.size(0), "Permute not possible"
         num_tokens, hidden_size = inp.size()
@@ -353,11 +363,15 @@ def forward(
 
         with_probs = merging_probs is not None
         if with_probs:
-            assert merging_probs.is_cuda, "TransformerEngine needs CUDA."
+            assert (
+                merging_probs.device.type == te_device_type()
+            ), f"TransformerEngine needs {te_device_type()}."
 
         # Device check
-        assert inp.is_cuda, "TransformerEngine needs CUDA."
-        assert row_id_map.is_cuda, "TransformerEngine needs CUDA."
+        assert inp.device.type == te_device_type(), f"TransformerEngine needs {te_device_type()}."
+        assert (
+            row_id_map.device.type == te_device_type()
+        ), f"TransformerEngine needs {te_device_type()}."
 
         assert not isinstance(
             inp, QuantizedTensor
@@ -635,11 +649,17 @@ def forward(
         if not inp.numel():
             return inp, probs
 
-        assert inp.is_cuda, "TransformerEngine needs CUDA."
-        assert split_sizes.is_cuda, "TransformerEngine needs CUDA."
-        assert sorted_idxs.is_cuda, "TransformerEngine needs CUDA."
+        assert inp.device.type == te_device_type(), f"TransformerEngine needs {te_device_type()}."
+        assert (
+            split_sizes.device.type == te_device_type()
+        ), f"TransformerEngine needs {te_device_type()}."
+        assert (
+            sorted_idxs.device.type == te_device_type()
+        ), f"TransformerEngine needs {te_device_type()}."
         if probs is not None:
-            assert probs.is_cuda, "TransformerEngine needs CUDA."
+            assert (
+                probs.device.type == te_device_type()
+            ), f"TransformerEngine needs {te_device_type()}."
 
         num_tokens, hidden_size = inp.shape
         num_splits = split_sizes.size(0)
diff --git a/transformer_engine/pytorch/quantization.py b/transformer_engine/pytorch/quantization.py
index 030370b9db..9ea48964ea 100644
--- a/transformer_engine/pytorch/quantization.py
+++ b/transformer_engine/pytorch/quantization.py
@@ -16,6 +16,7 @@
 
 import torch
 import transformer_engine_torch as tex
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import (
     Recipe,
     DelayedScaling,
@@ -27,6 +28,7 @@
     CustomRecipe,
 )
 
+
 from .constants import dist_group_type
 from .utils import get_device_compute_capability
 from .jit import jit_fuser
@@ -279,7 +281,9 @@ def reset(cls) -> None:
     def set_skip_fp8_weight_update_tensor(cls, skip: bool) -> None:
         """`skip_fp8_weight_update_tensor` inplace setter."""
         if cls.skip_fp8_weight_update_tensor is None:
-            cls.skip_fp8_weight_update_tensor = torch.empty(1, dtype=torch.float32, device="cuda")
+            cls.skip_fp8_weight_update_tensor = torch.empty(
+                1, dtype=torch.float32, device=te_device_type()
+            )
         cls.skip_fp8_weight_update_tensor.fill_(skip)
 
     @classmethod
@@ -1067,7 +1071,7 @@ def __init__(
 
         # Allocate buffers
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
         self.scale = torch.ones(num_quantizers, dtype=torch.float32, device=device)
         self.amax_history = torch.zeros(
             recipe.amax_history_len,
@@ -1113,7 +1117,7 @@ def __init__(
 
         # Allocate buffers
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
         self.device = device
 
     def make_quantizers(self) -> list:
@@ -1153,7 +1157,7 @@ def __init__(
 
         # Allocate buffers
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
     def make_quantizers(self) -> list:
         # TODO(ksivamani); Find better design for this, adding here to avoid circular import.
@@ -1192,7 +1196,7 @@ def __init__(
 
         # Allocate buffers
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
         self.device = device
 
     def make_quantizers(self) -> list:
@@ -1293,7 +1297,7 @@ def __init__(
 
         # Allocate buffers
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
     def make_quantizers(self) -> list:
         from .tensor.nvfp4_tensor import NVFP4Quantizer
@@ -1363,7 +1367,7 @@ def __init__(
         self.mode = mode
         self.num_quantizers = num_quantizers
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
         self.device = device
 
         if getattr(recipe, "qfactory", None) is None:
diff --git a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
index 48762499b9..c752501848 100644
--- a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
+++ b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
@@ -12,6 +12,7 @@
 from transformer_engine_torch import DType as TE_DType
 from transformer_engine_torch import Float8BlockScaleTensorFormat
 
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import Float8BlockScaling, Recipe
 from .storage.float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage
 from .quantized_tensor import (
@@ -220,7 +221,7 @@ def make_empty(
     ) -> Float8BlockwiseQTensor:
         """Construct quantized tensor with uninitialized data"""
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
         data_format = (
             tex.Float8BlockScaleTensorFormat.COMPACT
@@ -451,7 +452,7 @@ def __torch_dispatch__(cls, func, types, args, kwargs=None):
                 qt._rowwise_scale_inv,
                 qt._columnwise_scale_inv,
             ):
-                if t is not None and t.is_cuda:
+                if t is not None and t.device.type == te_device_type():
                     t.record_stream(stream)
             return None
 
@@ -542,7 +543,7 @@ def _set_data(self, tensor: torch.Tensor) -> None:
 
         """
         # Tensor device
-        new_device = tensor.device if tensor.is_cuda else self.device
+        new_device = tensor.device if tensor.device.type == te_device_type() else self.device
 
         def _set_from_tensor(dst: Float8BlockwiseQTensor, src: Float8BlockwiseQTensor):
             dst._rowwise_data = src._rowwise_data
diff --git a/transformer_engine/pytorch/tensor/float8_tensor.py b/transformer_engine/pytorch/tensor/float8_tensor.py
index a4e68e53b0..ea88c7e3f2 100644
--- a/transformer_engine/pytorch/tensor/float8_tensor.py
+++ b/transformer_engine/pytorch/tensor/float8_tensor.py
@@ -11,6 +11,7 @@
 import transformer_engine_torch as tex
 from transformer_engine_torch import DType as TE_DType
 
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import DelayedScaling, Float8CurrentScaling, Recipe
 from ..utils import canonicalize_process_group, devices_match
 from .storage.float8_tensor_storage import Float8TensorStorage, _FromFloat8Func
@@ -108,7 +109,7 @@ def make_empty(
 
         # Canonicalize tensor attributes
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
         # Allocate FP8 data
         data = torch.empty(shape, dtype=torch.uint8, device=device)
@@ -294,7 +295,7 @@ def make_empty(
 
         # Canonicalize tensor attributes
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
         # Allocate FP8 data
         data = torch.empty(shape, dtype=torch.uint8, device=device)
@@ -682,7 +683,7 @@ def _set_data(self, tensor: torch.Tensor) -> None:
         """
 
         # Tensor device
-        new_device = tensor.device if tensor.is_cuda else self.device
+        new_device = tensor.device if tensor.device.type == te_device_type() else self.device
         if not devices_match(new_device, tensor.device):
             tensor = tensor.to(device=new_device)
 
diff --git a/transformer_engine/pytorch/tensor/mxfp8_tensor.py b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
index 700de24c4e..c8dda346e9 100644
--- a/transformer_engine/pytorch/tensor/mxfp8_tensor.py
+++ b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
@@ -12,6 +12,7 @@
 import transformer_engine_torch as tex
 from transformer_engine_torch import DType as TE_DType
 
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import MXFP8BlockScaling, Recipe
 from ..constants import MXFP8_BLOCK_SCALING_SIZE
 from ..utils import devices_match, round_up_to_nearest_multiple
@@ -96,7 +97,7 @@ def make_empty(
 
         # Canonicalize tensor attributes
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
         assert (
             shape[-1] % MXFP8_BLOCK_SCALING_SIZE == 0
@@ -403,7 +404,7 @@ def _set_data(self, tensor: torch.Tensor) -> None:
         """
 
         # Tensor device
-        new_device = tensor.device if tensor.is_cuda else self.device
+        new_device = tensor.device if tensor.device.type == te_device_type() else self.device
         if not devices_match(new_device, tensor.device):
             tensor = tensor.to(device=new_device)
 
diff --git a/transformer_engine/pytorch/tensor/nvfp4_tensor.py b/transformer_engine/pytorch/tensor/nvfp4_tensor.py
index ca2154f554..a62873cf7c 100644
--- a/transformer_engine/pytorch/tensor/nvfp4_tensor.py
+++ b/transformer_engine/pytorch/tensor/nvfp4_tensor.py
@@ -13,6 +13,7 @@
 import transformer_engine_torch as tex
 from transformer_engine_torch import DType as TE_DType
 
+from transformer_engine import te_device_type
 from transformer_engine.common.recipe import NVFP4BlockScaling, Recipe
 from ..constants import NVFP4_BLOCK_SCALING_SIZE, dist_group_type
 from ..utils import (
@@ -96,7 +97,7 @@ def get_rht_matrix(with_random_sign_mask: bool) -> torch.Tensor:
         signs = get_no_random_sign_vector()
     sign_matrix = signs * torch.eye(hadamard_dimension, dtype=torch.float32)
     rht_matrix = sign_matrix @ get_hadamard_matrix(hadamard_dimension)
-    return rht_matrix.to(dtype=torch.bfloat16).cuda()
+    return rht_matrix.to(dtype=torch.bfloat16).to(te_device_type())
 
 
 @functools.lru_cache(maxsize=None)
@@ -267,7 +268,7 @@ def make_empty(
 
         # Canonicalize tensor attributes
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
         assert shape[-1] % NVFP4_BLOCK_SCALING_SIZE == 0, (
             f"Incorrect shape {shape} for NVFP4. Tensor dims must be divisible by"
@@ -617,7 +618,7 @@ def _set_data(self, tensor: torch.Tensor) -> None:
         """
 
         # Tensor device
-        new_device = tensor.device if tensor.is_cuda else self.device
+        new_device = tensor.device if tensor.device.type == te_device_type() else self.device
         if not devices_match(new_device, tensor.device):
             tensor = tensor.to(device=new_device)
 
diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py
index 8a032b2f55..b59f7276bd 100644
--- a/transformer_engine/pytorch/transformer.py
+++ b/transformer_engine/pytorch/transformer.py
@@ -10,6 +10,7 @@
 
 import torch
 
+from transformer_engine import te_device_type
 from transformer_engine.pytorch import torch_version
 from transformer_engine.pytorch.module import LayerNormMLP, LayerNorm, RMSNorm
 from transformer_engine.debug.pytorch.debug_state import TEDebugState
@@ -311,7 +312,7 @@ def __init__(
         bias: bool = True,
         activation: str = "gelu",
         normalization: str = "LayerNorm",
-        device: Union[torch.device, str] = "cuda",
+        device: Union[torch.device, str] = te_device_type(),
         attn_input_format: str = "sbhd",
         name: str = None,
         qk_norm_type: Optional[str] = None,
diff --git a/transformer_engine/pytorch/triton/permutation.py b/transformer_engine/pytorch/triton/permutation.py
index 6292acb69b..aa1260aeac 100644
--- a/transformer_engine/pytorch/triton/permutation.py
+++ b/transformer_engine/pytorch/triton/permutation.py
@@ -13,6 +13,7 @@
 from triton.language import core
 from triton.language.standard import _log2
 
+from transformer_engine import te_device_type
 
 # The following three argsort related kernels are adapted from
 # the issue https://github.com/triton-lang/triton/issues/3698
@@ -218,10 +219,12 @@ def make_row_id_map(
         The [num_experts, num_experts + n_routed) items are the indices of the experts corresponding
         to the first n_routed row indices above.
     """
-    row_id_map = torch.empty((num_tokens, num_experts * 2 + 1), dtype=torch.int32, device="cuda")
+    row_id_map = torch.empty(
+        (num_tokens, num_experts * 2 + 1), dtype=torch.int32, device=te_device_type()
+    )
     block_size = 1024
     grid = (num_experts, triton.cdiv(num_tokens, block_size))
-    workspace_tensor = torch.empty(grid, dtype=torch.int32, device="cuda")
+    workspace_tensor = torch.empty(grid, dtype=torch.int32, device=te_device_type())
 
     # supposing num_tokens == 5, num_experts == 3, block_size == 3
     # and we have a routing_map like this:
@@ -419,15 +422,15 @@ def permute_with_mask_map(
     scale_hidden_dim: int
         Hidden size of the scale tensor.
     """
-    output = torch.empty((num_out_tokens, hidden_size), dtype=inp.dtype, device="cuda")
+    output = torch.empty((num_out_tokens, hidden_size), dtype=inp.dtype, device=te_device_type())
     if probs is not None:
-        permuted_probs = torch.empty((num_out_tokens,), dtype=probs.dtype, device="cuda")
+        permuted_probs = torch.empty((num_out_tokens,), dtype=probs.dtype, device=te_device_type())
     else:
         permuted_probs = None
 
     if scale is not None:
         permuted_scale = torch.empty(
-            (num_out_tokens, scale_hidden_dim), dtype=scale.dtype, device="cuda"
+            (num_out_tokens, scale_hidden_dim), dtype=scale.dtype, device=te_device_type()
         )
     else:
         permuted_scale = None
@@ -603,10 +606,10 @@ def unpermute_with_mask_map(
     hidden_size: int
         Hidden size of the permuted tensor.
     """
-    output = torch.empty((num_tokens, hidden_size), dtype=inp.dtype, device="cuda")
+    output = torch.empty((num_tokens, hidden_size), dtype=inp.dtype, device=te_device_type())
     if permuted_probs is not None:
         unpermuted_probs = torch.empty(
-            (num_tokens, num_experts), dtype=permuted_probs.dtype, device="cuda"
+            (num_tokens, num_experts), dtype=permuted_probs.dtype, device=te_device_type()
         )
     else:
         unpermuted_probs = None
@@ -776,10 +779,10 @@ def unpermute_with_mask_map_bwd_with_merging_probs(
         Hidden size of the output tensor.
     """
     act_grad = torch.empty(
-        (num_out_tokens, hidden_size), dtype=fwd_output_grad.dtype, device="cuda"
+        (num_out_tokens, hidden_size), dtype=fwd_output_grad.dtype, device=te_device_type()
     )
     merging_probs_grad = torch.empty(
-        (num_tokens, num_experts), dtype=merging_probs.dtype, device="cuda"
+        (num_tokens, num_experts), dtype=merging_probs.dtype, device=te_device_type()
     )
     grid = (num_tokens,)
     _unpermute_bwd_with_merging_probs_kernel[grid](
@@ -869,7 +872,7 @@ def make_chunk_sort_map(
     num_splits: int
         Number of splits of split_sizes and sorted_indices.
     """
-    row_id_map = torch.empty((num_tokens,), dtype=torch.int32, device="cuda")
+    row_id_map = torch.empty((num_tokens,), dtype=torch.int32, device=te_device_type())
     grid = (num_tokens,)
     _make_chunk_sort_map_kernel[grid](
         split_sizes,
@@ -968,9 +971,9 @@ def sort_chunks_by_map(
     is_forward: bool
         Whether the sort is for forward or backward.
     """
-    output = torch.empty((num_tokens, hidden_size), dtype=inp.dtype, device="cuda")
+    output = torch.empty((num_tokens, hidden_size), dtype=inp.dtype, device=te_device_type())
     if probs is not None:
-        permuted_probs = torch.empty((num_tokens,), dtype=probs.dtype, device="cuda")
+        permuted_probs = torch.empty((num_tokens,), dtype=probs.dtype, device=te_device_type())
     else:
         permuted_probs = None
     # pylint: disable=unnecessary-lambda-assignment
diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py
index 2be0aed4a8..7d237ac3da 100644
--- a/transformer_engine/pytorch/utils.py
+++ b/transformer_engine/pytorch/utils.py
@@ -11,6 +11,8 @@
 import numpy as np
 import torch
 
+from transformer_engine import te_device_type
+
 from . import torch_version
 from .tensor.quantized_tensor import Quantizer
 from ..debug.pytorch.debug_quantization import DebugQuantizedTensor
@@ -30,7 +32,8 @@ def requires_grad(*tensors: Tuple[Optional[torch.Tensor], ...]) -> None:
 @functools.lru_cache(maxsize=None)
 def _empty_tensor() -> torch.Tensor:
     """Get tensor with no entries and no data"""
-    return torch.Tensor().cuda()
+
+    return torch.Tensor().to(device=te_device_type())
 
 
 def clear_tensor_data(*tensors: Tuple[Optional[torch.Tensor], ...]) -> None:
@@ -516,12 +519,12 @@ def canonicalize_device(device: Optional[torch.device | str]) -> torch.device:
     if device is None:
         # Use default CUDA device
         device = torch.get_default_device()
-        if device.type != "cuda":
-            device = torch.device("cuda", torch.cuda.current_device())
+        if device.type != te_device_type():
+            device = torch.device(te_device_type(), torch.cuda.current_device())
     elif not isinstance(device, torch.device):
         device = torch.device(device)
-    if device.type == "cuda" and device.index is None:
-        device = torch.device("cuda", torch.cuda.current_device())
+    if device.type == te_device_type() and device.index is None:
+        device = torch.device(te_device_type(), torch.cuda.current_device())
     return device
 
 
@@ -543,7 +546,7 @@ def devices_match(device1: torch.device, device2: torch.device) -> bool:
     device2 = torch.device(device2)
     if device1.type != device2.type:
         return False
-    if device1.type == "cuda":
+    if device1.type == te_device_type():
         index1 = device1.index
         index2 = device2.index
         if index1 == index2:
@@ -657,12 +660,12 @@ def canonicalize_process_group(
 def torch_get_autocast_gpu_dtype() -> torch.dtype:
     """Get PyTorch autocast GPU dtype."""
     if torch_version() >= (2, 4, 0):
-        return torch.get_autocast_dtype("cuda")
+        return torch.get_autocast_dtype(te_device_type())
     return torch.get_autocast_gpu_dtype()
 
 
 if torch_version() >= (2, 4, 0):
-    gpu_autocast_ctx = functools.partial(torch.amp.autocast, device_type="cuda")
+    gpu_autocast_ctx = functools.partial(torch.amp.autocast, device_type=te_device_type())
 else:
     gpu_autocast_ctx = torch.cuda.amp.autocast
 
@@ -759,7 +762,7 @@ def convert_to_torch_tensor(tensor: Union[_WeakRefTensor, torch.Tensor]) -> torc
     if isinstance(x, torch.Tensor):
         return (
             convert_to_torch_tensor(_WeakRefTensor(x.data_ptr(), x.dtype, x.shape))
-            if x.is_cuda
+            if x.device.type == te_device_type()
             else x
         )
     if isinstance(x, tuple):

From 7f788a3f45adc319bd26ff76cd89ac9a6be20477 Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Wed, 25 Mar 2026 17:36:10 +0800
Subject: [PATCH 39/59] Add scaled_masked_softmax_forward/backward for flagos
 backend (#52)

Add two functions for flagos backend, based on flaggems

- scaled_masked_softmax_forward
- scaled_masked_softmax_backend
---
 .../plugin/core/backends/flagos/flagos.py     | 19 ++++++
 .../core/backends/flagos/impl/__init__.py     |  1 +
 .../core/backends/flagos/impl/softmax.py      | 61 +++++++++++++++++++
 .../core/backends/flagos/register_ops.py      | 16 +++++
 4 files changed, 97 insertions(+)
 create mode 100644 transformer_engine/plugin/core/backends/flagos/impl/softmax.py

diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
index d33bcf1411..9c8e5a0091 100644
--- a/transformer_engine/plugin/core/backends/flagos/flagos.py
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -17,6 +17,8 @@
     multi_tensor_adam_param_remainder_fl,
     multi_tensor_l2_norm_fl,
     generic_gemm_fl,
+    scaled_masked_softmax_forward_fl,
+    scaled_masked_softmax_backward_fl,
 )
 
 
@@ -160,6 +162,23 @@ def rmsnorm_bwd(
     def get_fused_attn_backend(self, *args, **kwargs) -> int:
         return NVTE_Fused_Attn_Backend.NVTE_No_Backend
 
+    # Softmax functions
+    def scaled_masked_softmax_forward(
+        self,
+        input: torch.Tensor,
+        mask: torch.Tensor,
+        scale_factor: Union[float, torch.Tensor],
+    ) -> torch.Tensor:
+        return scaled_masked_softmax_forward_fl(input, mask, scale_factor)
+
+    def scaled_masked_softmax_backward(
+        self,
+        output_grad_: torch.Tensor,
+        softmax_results_: torch.Tensor,
+        scale_factor: float,
+    ) -> torch.Tensor:
+        return scaled_masked_softmax_backward_fl(output_grad_, softmax_results_, scale_factor)
+
     # multi-tensor functions
     def multi_tensor_scale(
         self,
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/__init__.py b/transformer_engine/plugin/core/backends/flagos/impl/__init__.py
index f17b38c9e6..d4853b6fdd 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/__init__.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/__init__.py
@@ -6,3 +6,4 @@
 from .rmsnorm import *
 from .fused_adam import *
 from .multi_tensor import *
+from .softmax import *
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/softmax.py b/transformer_engine/plugin/core/backends/flagos/impl/softmax.py
new file mode 100644
index 0000000000..31564b224f
--- /dev/null
+++ b/transformer_engine/plugin/core/backends/flagos/impl/softmax.py
@@ -0,0 +1,61 @@
+import torch
+from typing import Union
+import flag_gems
+
+
+def scaled_masked_softmax_forward_fl(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    scale_factor: Union[float, torch.Tensor],
+) -> torch.Tensor:
+    # Ensure `mask` and 'scale_factor' is on the same device as `input`.
+    if mask.device != input.device:
+        mask = flag_gems.to_copy(mask, device=input.device)
+    if isinstance(scale_factor, torch.Tensor):
+        if scale_factor.device != input.device:
+            scale_factor = flag_gems.to_copy(scale_factor, device=input.device)
+
+    # Keep semantics aligned with TE CUDA scaled_masked_softmax:
+    # - integer/bool mask: masked iff mask == 1, masked logits set to -10000.0
+    # - float mask: treated as additive bias in logit space
+    if mask.dim() == 4 and mask.size(1) == 1 and input.dim() == 4:
+        mask = mask.expand_as(input)
+
+    scaled = flag_gems.mul(input, scale_factor)
+    if mask.is_floating_point():
+        mask_f = flag_gems.to_copy(mask, device=input.device, dtype=scaled.dtype)
+        scaled = flag_gems.add(scaled, mask_f)
+        return flag_gems.softmax(scaled, dim=-1)
+
+    # Avoid using `mask == 1` (torch op) since on some devices it may fall back to CPU,
+    # which would break Triton kernels inside flag_gems.
+    cond = flag_gems.eq_scalar(mask, 1)
+    scaled = flag_gems.masked_fill(scaled, cond, -10000.0)
+    all_masked = flag_gems.all_dim(cond, dim=-1, keepdim=True)
+    out = flag_gems.softmax(scaled, dim=-1)
+    return flag_gems.masked_fill(out, all_masked, 0.0)
+
+
+def scaled_masked_softmax_backward_fl(
+    output_grad_: torch.Tensor,
+    softmax_results_: torch.Tensor,
+    scale_factor: float,
+) -> torch.Tensor:
+    orig_dtype = output_grad_.dtype
+    # Compute in float32 for numerical stability.
+    output_grad_f32 = flag_gems.to_copy(output_grad_, dtype=torch.float32)
+    softmax_output_f32 = flag_gems.to_copy(
+        softmax_results_, dtype=torch.float32, device=output_grad_.device
+    )
+    if isinstance(scale_factor, torch.Tensor):
+        if scale_factor.device != output_grad_.device:
+            scale_factor = flag_gems.to_copy(scale_factor, device=output_grad_.device)
+
+    # term = softmax_output_f32 * output_grad_f32
+    term = flag_gems.mul(softmax_output_f32, output_grad_f32)
+    # sum_term = sum(term, dim=-1, keepdim=True)
+    sum_term = flag_gems.sum_dim(term, dim=[-1], keepdim=True)
+    # grad_softmax = softmax_output_f32 * (output_grad_f32 - sum_term)
+    grad_softmax = flag_gems.mul(softmax_output_f32, flag_gems.sub(output_grad_f32, sum_term))
+    grad_scaled = flag_gems.mul(grad_softmax, scale_factor)
+    return flag_gems.to_copy(grad_scaled, dtype=orig_dtype)
diff --git a/transformer_engine/plugin/core/backends/flagos/register_ops.py b/transformer_engine/plugin/core/backends/flagos/register_ops.py
index d744cdda41..180f5a5d35 100644
--- a/transformer_engine/plugin/core/backends/flagos/register_ops.py
+++ b/transformer_engine/plugin/core/backends/flagos/register_ops.py
@@ -132,6 +132,22 @@ def register_builtins(registry) -> None:
             vendor=None,
             priority=150,
         ),
+        OpImpl(
+            op_name="scaled_masked_softmax_forward",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.scaled_masked_softmax_forward, is_avail),
+            vendor=None,
+            priority=150,
+        ),
+        OpImpl(
+            op_name="scaled_masked_softmax_backward",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.scaled_masked_softmax_backward, is_avail),
+            vendor=None,
+            priority=150,
+        ),
         OpImpl(
             op_name="get_cudnn_version",
             impl_id="default.flagos",

From 1f98511427f325e634ed0ef8a2990f8f79f4eaed Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Thu, 26 Mar 2026 17:22:32 +0800
Subject: [PATCH 40/59] Fix quantizer dtype conversion errors (#54)

- Fix quantizer dtype attr conversion errors for vendor backends
- Polish logger for vendor backend
---
 transformer_engine/__init__.py                |  2 --
 .../plugin/core/backends/vendor/cuda/cuda.py  | 22 ++++++++++++++++++-
 .../core/backends/vendor/hygon/hygon.py       | 20 ++++++++++++++++-
 .../core/backends/vendor/iluvatar/iluvatar.py | 20 ++++++++++++++++-
 .../core/backends/vendor/metax/metax.py       | 20 ++++++++++++++++-
 .../plugin/core/backends/vendor/musa/musa.py  | 20 ++++++++++++++++-
 .../core/backends/vendor/musa/patches.py      |  1 +
 7 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/transformer_engine/__init__.py b/transformer_engine/__init__.py
index c3fb004659..e8bc4f5802 100644
--- a/transformer_engine/__init__.py
+++ b/transformer_engine/__init__.py
@@ -21,9 +21,7 @@
     from .plugin.core.backends.vendor.musa.patches import apply_patch as _musa_apply_patch
 
     _musa_apply_patch()
-    print("[TE-FL] MUSA patches applied")
 except Exception as e:
-    print(f"[TE-FL] MUSA patches not applied: {e}")
     pass
 
 
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index fc1f008f23..4309cc4a2e 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -79,7 +79,6 @@ def try_load_lib(name, search_patterns):
                     return True
         return False
     except Exception as e:
-        print(f"[CUDA] Failed to load CUDA libs: {e}")
         return False
 
 
@@ -90,6 +89,8 @@ def _ensure_cuda_libs():
     global _cuda_libs_loaded
     if not _cuda_libs_loaded:
         _cuda_libs_loaded = _load_cuda_libs()
+        if _cuda_libs_loaded:
+            print(f"[CUDA] Successfully loaded CUDA libs")
     return _cuda_libs_loaded
 
 
@@ -166,6 +167,15 @@ def quantize(
         noop: Optional[torch.Tensor] = None,
     ) -> Any:
         tex = self._get_tex()
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
         return tex.quantize(tensor, quantizer, output, noop)
 
     def dequantize(
@@ -183,6 +193,16 @@ def bgrad_quantize(
         quantizer: Any,
     ) -> List[Any]:
         tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
         return tex.bgrad_quantize(input, quantizer)
 
     def generic_gemm(
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index 2231ad59a4..391d39e09f 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -54,7 +54,6 @@ def _get_sys_extension() -> str:
         spec.loader.exec_module(solib)
         return True
     except Exception as e:
-        print(f"[HYGON] Failed to load hygon libs: {e}")
         return False
 
 
@@ -65,6 +64,8 @@ def _ensure_hygon_libs():
     global _hygon_libs_loaded
     if not _hygon_libs_loaded:
         _hygon_libs_loaded = _load_hygon_libs()
+        if _hygon_libs_loaded:
+            print(f"[HYGON] Successfully loaded HYGON libs")
     return _hygon_libs_loaded
 
 
@@ -145,6 +146,13 @@ def quantize(
         noop: Optional[torch.Tensor] = None,
     ) -> Any:
         tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
         return tex.quantize(tensor, quantizer, output, noop)
 
     def dequantize(
@@ -162,6 +170,16 @@ def bgrad_quantize(
         quantizer: Any,
     ) -> List[Any]:
         tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
         return tex.bgrad_quantize(input, quantizer)
 
     def generic_gemm(
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
index 40c1719851..e14dea9a75 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -77,7 +77,6 @@ def try_load_lib(name, search_patterns):
                     return True
         return False
     except Exception as e:
-        print(f"[ILUVATAR] Failed to load ILUVATAR libs: {e}")
         return False
 
 
@@ -88,6 +87,8 @@ def _ensure_iluvatar_libs():
     global _iluvatar_libs_loaded
     if not _iluvatar_libs_loaded:
         _iluvatar_libs_loaded = _load_iluvatar_libs()
+        if _iluvatar_libs_loaded:
+            print(f"[ILUVATAR] Successfully loaded ILUVATAR libs")
     return _iluvatar_libs_loaded
 
 
@@ -171,6 +172,13 @@ def quantize(
         noop: Optional[torch.Tensor] = None,
     ) -> Any:
         tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
         return tex.quantize(tensor, quantizer, output, noop)
 
     def dequantize(
@@ -188,6 +196,16 @@ def bgrad_quantize(
         quantizer: Any,
     ) -> List[Any]:
         tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
         return tex.bgrad_quantize(input, quantizer)
 
     def generic_gemm(
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index 460ff76db4..3c8663ff1e 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -37,7 +37,6 @@ def get_ext():
                     return True
         return False
     except Exception as e:
-        print(f"[Metax] Failed to load Metax libs: {e}")
         return False
 
 
@@ -48,6 +47,8 @@ def _ensure_metax_libs():
     global _metax_libs_loaded
     if not _metax_libs_loaded:
         _metax_libs_loaded = _load_metax_libs()
+        if _metax_libs_loaded:
+            print(f"[Metax] Successfully loaded Metax libs")
     return _metax_libs_loaded
 
 
@@ -126,6 +127,13 @@ def quantize(
         noop: Optional[torch.Tensor] = None,
     ) -> Any:
         tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
         return tex.quantize(tensor, quantizer, output, noop)
 
     def dequantize(
@@ -143,6 +151,16 @@ def bgrad_quantize(
         quantizer: Any,
     ) -> List[Any]:
         tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
         return tex.bgrad_quantize(input, quantizer)
 
     def generic_gemm(
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/musa.py b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
index 281b091079..cba8c85a79 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/musa.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
@@ -66,7 +66,6 @@ def try_load_lib(name, search_patterns):
 
         return True
     except Exception as e:
-        print(f"[MUSA] Failed to load MUSA libs: {e}")
         return False
 
 
@@ -77,6 +76,8 @@ def _ensure_musa_libs():
     global _musa_libs_loaded
     if not _musa_libs_loaded:
         _musa_libs_loaded = _load_musa_libs()
+        if _musa_libs_loaded:
+            print(f"[MUSA] Successfully loaded MUSA libs")
     return _musa_libs_loaded
 
 
@@ -138,6 +139,13 @@ def quantize(
         noop: Optional[torch.Tensor] = None,
     ) -> Any:
         tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
         return tex.quantize(tensor, quantizer, output, noop)
 
     def dequantize(
@@ -155,6 +163,16 @@ def bgrad_quantize(
         quantizer: Any,
     ) -> List[Any]:
         tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
         return tex.bgrad_quantize(input, quantizer)
 
     def generic_gemm(
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/patches.py b/transformer_engine/plugin/core/backends/vendor/musa/patches.py
index 220c5be03f..8073864d2b 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/patches.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/patches.py
@@ -70,3 +70,4 @@ def apply_patch() -> None:
         except Exception:
             # Best-effort: patching should never crash import/initialization.
             continue
+    print(f"[TE-FL] MUSA backend patches applied")

From 2188137b53abbeff513fbe5c62e294903c2f1aaf Mon Sep 17 00:00:00 2001
From: chai-xiaonan <3072824838@qq.com>
Date: Mon, 30 Mar 2026 12:48:32 +0800
Subject: [PATCH 41/59] apply flagos te_groups_gemm op (#55)

- add ```te_general_grouped_gemm``` op for flagos backend, base on
flag_gems
- support both forward and backward computation, distinguished by
```grad```
---
 .../plugin/core/backends/flagos/flagos.py     |  41 +++++
 .../plugin/core/backends/flagos/impl/gemm.py  | 105 +++++++++++
 .../core/backends/flagos/register_ops.py      |   8 +
 .../plugin/tests/test_te_general_grouped.py   | 169 ++++++++++++++++++
 4 files changed, 323 insertions(+)
 create mode 100644 transformer_engine/plugin/tests/test_te_general_grouped.py

diff --git a/transformer_engine/plugin/core/backends/flagos/flagos.py b/transformer_engine/plugin/core/backends/flagos/flagos.py
index 9c8e5a0091..1083928721 100644
--- a/transformer_engine/plugin/core/backends/flagos/flagos.py
+++ b/transformer_engine/plugin/core/backends/flagos/flagos.py
@@ -19,6 +19,7 @@
     generic_gemm_fl,
     scaled_masked_softmax_forward_fl,
     scaled_masked_softmax_backward_fl,
+    te_general_grouped_gemm_fl,
 )
 
 
@@ -118,6 +119,46 @@ def generic_gemm(
             beta,
         )
 
+    def te_general_grouped_gemm(
+        self,
+        A: List[Any],
+        transa: bool,
+        B: List[Any],
+        transb: bool,
+        D: Optional[List[torch.Tensor]],
+        D_type: DType,
+        m_splits: List[int],
+        bias: List[torch.Tensor],
+        bias_type: DType,
+        single_output: bool,
+        pre_gelu_out: List[torch.Tensor],
+        grad: bool,
+        workspace: List[torch.Tensor],
+        workspaceSizes: int,
+        accumulate: bool,
+        use_split_accumulator: bool,
+        math_sm_count: int,
+    ) -> Optional[List[torch.Tensor]]:
+        return te_general_grouped_gemm_fl(
+            A,
+            transa,
+            B,
+            transb,
+            D,
+            D_type,
+            m_splits,
+            bias,
+            bias_type,
+            single_output,
+            pre_gelu_out,
+            grad,
+            workspace,
+            workspaceSizes,
+            accumulate,
+            use_split_accumulator,
+            math_sm_count,
+        )
+
     # Other granular functions
     def rmsnorm_fwd(
         self,
diff --git a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
index 05aea25092..e190af5c5d 100644
--- a/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
+++ b/transformer_engine/plugin/core/backends/flagos/impl/gemm.py
@@ -10,6 +10,7 @@
 
 __all__ = [
     "generic_gemm_fl",
+    "te_general_grouped_gemm_fl",
 ]
 
 _DTYPE_TO_TORCH = {
@@ -115,3 +116,107 @@ def generic_gemm_fl(
         return D, bias_grad, gelu_input, extra_output_ret
     else:
         return out1, bias_grad, gelu_input, extra_output_ret
+
+
+# This function can represent both forward and backward computations.
+# When grad is False (forward computation), the 'bias' is bias;
+# When grad is True (backward computation/gradient calculation), the 'bias' is grad_bias;
+def te_general_grouped_gemm_fl(
+    B: List[torch.Tensor],
+    transb: bool,
+    A: List[torch.Tensor],
+    transa: bool,
+    D: Optional[List[torch.Tensor]],
+    D_type: Any,
+    m_splits: List[int],
+    bias: List[torch.Tensor],  # bias or grad_bias
+    bias_type: Any,
+    single_output: bool,
+    pre_gelu_out: List[torch.Tensor],
+    grad: bool,
+    workspace: List[torch.Tensor],
+    workspaceSize: int,
+    accumulate: bool,
+    use_split_accumulator: bool,
+    math_sm_count: int,
+) -> Optional[List[torch.Tensor]]:
+    if single_output and D is None:
+        raise ValueError("not implemented, D should be allocated for single output case.")
+
+    num_gemms = len(A)
+    if D is None:
+        D = []
+        for i in range(num_gemms):
+            m = A[i].shape[1] if transa else A[i].shape[0]
+            n = B[i].shape[0] if transb else B[i].shape[1]
+            D.append(torch.empty((m, n), dtype=D[i].dtype, device=A[0].device))
+
+    temp_D = []
+    for i in range(num_gemms):
+        # Handle the special case of zero-element inputs
+        if A[i].numel() == 0 or B[i].numel() == 0:
+            if not single_output:
+                if D[i].numel() != 0 and not accumulate:
+                    flag_gems.copy_(D[i], flag_gems.zeros(D[i].shape))
+            else:
+                out = flag_gems.zeros((A[i].shape[0], B[i].shape[1]))
+            if grad and len(bias) > i and bias[i] is not None and bias[i].numel() != 0:
+                flag_gems.copy_(bias[i], flag_gems.zeros(bias[i].shape))
+            if (
+                len(pre_gelu_out) > i
+                and pre_gelu_out[i] is not None
+                and pre_gelu_out[i].numel() != 0
+            ):
+                flag_gems.copy_(pre_gelu_out[i], flag_gems.zeros(pre_gelu_out[i].shape))
+            continue
+
+        a = A[i].t() if transa else A[i]
+        b = B[i].t() if transb else B[i]
+        # Determine presence of epilogue tensors
+        has_bias = len(bias) > i and bias[i] is not None and bias[i].numel() > 0
+        has_pre_gelu = (
+            len(pre_gelu_out) > i and pre_gelu_out[i] is not None and pre_gelu_out[i].numel() > 0
+        )
+
+        # Forward Pass calculation
+        if not grad:
+            if has_bias:
+                # Fused matrix multiplication and bias addition
+                out = flag_gems.addmm(bias[i], a, b)
+            else:
+                out = flag_gems.mm(a, b)
+
+            # Apply GELU epilogue if pre_gelu_out is provided
+            if has_pre_gelu:
+                flag_gems.copy_(pre_gelu_out[i], out)
+                out = flag_gems.gelu(out)
+        else:
+            out = flag_gems.mm(a, b)
+
+            # Apply dGELU epilogue if requested
+            if has_pre_gelu:
+                out = flag_gems.gelu_backward(out, pre_gelu_out[i])
+
+            # Compute bias gradients if requested
+            if has_bias:
+                bias_grad = flag_gems.sum_dim(out, dim=[0])
+                if accumulate:
+                    flag_gems.add_(bias[i], bias_grad)
+                else:
+                    flag_gems.copy_(bias[i], bias_grad)
+
+        if not single_output:
+            # Store output
+            if accumulate:
+                flag_gems.add_(D[i], out.to(D[i].dtype))
+            else:
+                flag_gems.copy_(D[i], out.to(D[i].dtype))
+        else:
+            temp_D.append(out.to(D[0].dtype))
+
+    if single_output:
+        if temp_D:
+            temp = flag_gems.cat(temp_D, dim=0)
+            flag_gems.copy_(D[0], temp)
+
+    return bias
diff --git a/transformer_engine/plugin/core/backends/flagos/register_ops.py b/transformer_engine/plugin/core/backends/flagos/register_ops.py
index 180f5a5d35..153012c501 100644
--- a/transformer_engine/plugin/core/backends/flagos/register_ops.py
+++ b/transformer_engine/plugin/core/backends/flagos/register_ops.py
@@ -66,6 +66,14 @@ def register_builtins(registry) -> None:
             vendor=None,
             priority=150,
         ),
+        OpImpl(
+            op_name="te_general_grouped_gemm",
+            impl_id="default.flagos",
+            kind=BackendImplKind.DEFAULT,
+            fn=_bind_is_available(backend.te_general_grouped_gemm, is_avail),
+            vendor=None,
+            priority=150,
+        ),
         OpImpl(
             op_name="multi_tensor_scale",
             impl_id="default.flagos",
diff --git a/transformer_engine/plugin/tests/test_te_general_grouped.py b/transformer_engine/plugin/tests/test_te_general_grouped.py
new file mode 100644
index 0000000000..1bc815cc8b
--- /dev/null
+++ b/transformer_engine/plugin/tests/test_te_general_grouped.py
@@ -0,0 +1,169 @@
+import torch
+
+from transformer_engine.plugin.test_utils import (
+    get_available_backends,
+    get_backend,
+    TestCase,
+    generate_random_tensor,
+)
+
+
+class grouped_gemmTests(TestCase):
+    def __init__(self, device="cpu"):
+        super().__init__(
+            "Moe permute Operations",
+            "Test correctness of all moe permute operations across backends",
+        )
+        self.backends = get_available_backends()
+        self.device = device
+
+    def test_grouped_gemm_equivalence(self, grad, has_bias, has_pre_gelu, single_output):
+        print(
+            "\n test te_general_grouped_gemm"
+            f" grad:{grad} has_bias:{has_bias},has_pre_gelu:{has_pre_gelu},single_output:{single_output}"
+        )
+        import transformer_engine_torch_nv as tex
+
+        num_gemms = 2
+        m, k, n = 128, 32, 64
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        dtype = torch.float16
+
+        if dtype == torch.float16:
+            te_dtype = tex.DType.kFloat16
+        elif dtype == torch.float32:
+            te_dtype = tex.DType.kFloat32
+        elif dtype == torch.bfloat16:
+            te_dtype = tex.DType.kBFloat16
+        else:
+            raise ValueError(f"不支持的 dtype: {torch_dtype}")
+
+        torch.manual_seed(42)
+
+        A_list = [torch.randn((k, n), device=device, dtype=dtype) for _ in range(num_gemms)]
+        B_list = [torch.randn((m, k), device=device, dtype=dtype) for _ in range(num_gemms)]
+
+        bias_list_py_bias = [
+            (
+                torch.randn(n, device=device, dtype=dtype)
+                if has_bias
+                else torch.empty(0, device=device, dtype=dtype)
+            )
+            for _ in range(num_gemms)
+        ]
+        bias_list_te = [b.clone() for b in bias_list_py_bias]
+
+        pre_gelu_list_py = [
+            (
+                torch.randn(m, n, device=device, dtype=dtype)
+                if has_pre_gelu
+                else torch.empty(0, device=device, dtype=dtype)
+            )
+            for _ in range(num_gemms)
+        ]
+        pre_gelu_list_te = [p.clone() for p in pre_gelu_list_py]
+
+        if single_output:
+            D_list_py = [torch.empty(m * num_gemms, n, device=device, dtype=dtype)]
+            D_list_te = [torch.empty(m * num_gemms, n, device=device, dtype=dtype)]
+        else:
+            D_list_py = [torch.empty(m, n, device=device, dtype=dtype) for _ in range(num_gemms)]
+            D_list_te = [torch.empty(m, n, device=device, dtype=dtype) for _ in range(num_gemms)]
+        workspace_py = [torch.empty(1024 * 1024, device=device, dtype=torch.uint8)]
+        workspace_te = [torch.empty(1024 * 1024, device=device, dtype=torch.uint8)]
+
+        tex.te_general_grouped_gemm(
+            A_list,
+            False,
+            B_list,
+            False,
+            D_list_te,
+            te_dtype,
+            [],
+            bias_list_te,
+            te_dtype,
+            single_output,
+            pre_gelu_list_te,
+            grad,
+            workspace_te,
+            1024 * 1024,
+            False,
+            False,
+            0,
+        )
+
+        for backend_name in self.backends:
+            backend = get_backend(backend_name)
+            print("backend:", backend)
+            try:
+                bias_list_py = [b.clone() for b in bias_list_py_bias]
+                backend.te_general_grouped_gemm(
+                    A_list,
+                    False,
+                    B_list,
+                    False,
+                    D_list_py,
+                    te_dtype,
+                    [],
+                    bias_list_py,
+                    te_dtype,
+                    single_output,
+                    pre_gelu_list_py,
+                    grad,
+                    workspace_py,
+                    1024 * 1024,
+                    False,
+                    False,
+                    0,
+                )
+
+                for py_d, te_d in zip(D_list_py, D_list_te):
+                    self.assert_close(
+                        py_d, te_d, rtol=1e-3, atol=1e-3, msg="Output D tensors mismatch!"
+                    )
+
+                if not grad and has_pre_gelu:
+                    for py_p, te_p in zip(pre_gelu_list_py, pre_gelu_list_te):
+                        self.assert_close(
+                            py_p, te_p, rtol=1e-3, atol=1e-3, msg="Pre-GELU out tensors mismatch!"
+                        )
+
+                if grad or has_bias:
+                    for py_b, te_b in zip(bias_list_py, bias_list_te):
+                        self.assert_close(
+                            py_b, te_b, rtol=1e-3, atol=1e-3, msg="Bias gradient tensors mismatch!"
+                        )
+                print(f"    ✓ {backend_name}")
+            except NotImplementedError:
+                self.skipped += 1
+                print(f"    ⊘  {backend_name} (not implemented)")
+            except Exception as e:
+                self.failed += 1
+                print(f"    ✗ Test failed: {e}")
+
+    def run_all_tests(self):
+        print("\n" + "=" * 60)
+        print("=" * 60)
+        print(f"Available backends: {', '.join(self.backends)}")
+
+        # gemm tests
+        self.test_grouped_gemm_equivalence(False, False, False, False)
+        self.test_grouped_gemm_equivalence(False, True, False, False)
+        self.test_grouped_gemm_equivalence(False, False, True, False)
+
+        self.test_grouped_gemm_equivalence(False, False, False, True)
+        self.test_grouped_gemm_equivalence(False, True, False, True)
+        self.test_grouped_gemm_equivalence(False, False, True, True)
+        return self.report()
+
+
+def main():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    test_suite = grouped_gemmTests(device=device)
+    success = test_suite.run_all_tests()
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())

From ebcfadc81c84cc717bd118b0f876405528ea515f Mon Sep 17 00:00:00 2001
From: qqjxzxq <114602943+qqjxzxq@users.noreply.github.com>
Date: Thu, 2 Apr 2026 11:32:57 +0800
Subject: [PATCH 42/59] [CICD] support Metax MACA workflow (#48)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Description

This PR implements and integrates the **Metax (MACA)** workflow into
TransformerEngine-FL. It enables automated CI/CD pipelines, functional
training tests, and unit tests specifically optimized for Metax hardware
environments.

**Key updates in this version:** Successful TE compilation on Metax and
alignment with NVIDIA's standard QA workflows.

Fixes # (issue_number_if_applicable)

## Type of change

- [x] New feature (non-breaking change which adds functionality)
- [x] Infra/Build change (changes to CI/CD workflows or build scripts)
- [ ] Documentation change
- [ ] Bug fix
- [ ] Code refactoring

## Changes

### 1. Build & Compilation
- **TE Build Completion**: Successfully completed the compilation and
build process for TransformerEngine on the Metax platform.
- **Workflow Alignment**: Designed the Metax testing workflow based on
NVIDIA's `qa-l0-te-cpp-unittest-pytorch-lint` standard to ensure parity
with upstream quality gates.

### 2. CI/CD Infrastructure & Test Modules
- **Metax Platform Support**: Added `configs/metax.yml` to define
Metax-specific runner labels, images, and device configurations.
- **Verified Workflow Modules**: The following modules have been
implemented and verified on the Metax platform:
    - **pytorch-lint**: Static code analysis and linting.
- **pytorch-debug**: Debug-level build and basic functional
verification.
- **pytorch-unittest**: Core unit testing for Metax-adapted operators.
- **Workflow Modularization**:
- Introduced `configs/all_tests_common.yml` and
`configs/unit_tests_common.yml` for reusable test logic.
- Added `configs/all_tests_metax.yml` as the dedicated entry point for
Metax functional testing.

### 3. Environment & Runtime Fixes
- **Image Management**: Implemented `image-pull-policy: never` and
`--pull never` options to force the use of local registry images
(localhost:5000), optimizing startup time in local cluster environments.
- **Dynamic Resource Scaling**:
- Adapted `torchrun` and training scripts to support dynamic
GPU/Accelerator counts (specifically for C500 clusters).
- Removed hardcoded GPU host configurations to improve portability
across different Metax nodes.

### 4. Cleanup
- Removed legacy CUDA/Ascend specific configurations from the Metax
workflow path to prevent environment contamination.

## Hardware/Environment Verified
- **Platform**: Metax MACA
- **Accelerator**: C500
- **Registry**: Local Registry (localhost:5000)

---

## TODO / Next Steps
- [ ] Integrate the Metax-specific adaptation workflow into the central
platform.
- [ ] Generate and upload comprehensive Benchmark and Performance test
reports.

# Checklist:

- [x] I have read and followed the contributing guidelines.
- [x] The functionality is complete and verified on Metax hardware.
- [x] I have commented my code, particularly in hardware-specific
adaptation areas.
- [x] My changes generate no new warnings.
- [x] I have added/updated tests that prove my feature works on the MACA
platform.
- [x] New and existing unit tests (Lint, Debug, Unittest) pass locally
with Metax environment.

---------

Co-authored-by: 爱洗澡 qq <aixizaoqq@aixizaodeMacBook-Air.local>
Co-authored-by: zhoujiamei <2867770387@qq.com>
Co-authored-by: zhoujiamei <zjm>
Co-authored-by: peiyu <peiyu@jinglong.ai>
---
 .github/configs/ascend.yml                    |  15 +
 .github/configs/cuda.yml                      |  65 ++++
 .github/configs/metax.yml                     |  68 ++++
 .github/configs/template.yml                  |  16 +
 .github/workflows/all_tests_ascend.yml        |  32 ++
 .github/workflows/all_tests_common.yml        | 150 ++++++++
 .github/workflows/all_tests_cuda.yml          |  32 ++
 .github/workflows/all_tests_metax.yml         |  37 ++
 .github/workflows/functional_tests_common.yml | 190 ++++++++++
 .github/workflows/license.yml                 |   3 +-
 .../qa-l0-te-cpp-unittest-pytorch-lint.yml    |   2 +
 .../workflows/qa-l1-te-cpp-pytorch-tests.yml  |   2 +
 .github/workflows/unit_tests_common.yml       | 334 ++++++++++++++++++
 .gitignore                                    |   4 +-
 3rdparty/cudnn-frontend                       |   2 +-
 3rdparty/cutlass                              |   2 +-
 3rdparty/googletest                           |   2 +-
 SECURITY.md                                   |   2 +-
 qa/L0_pytorch_debug_unittest/test.sh          |  70 +++-
 qa/L0_pytorch_unittest/test.sh                | 167 ++++++---
 20 files changed, 1133 insertions(+), 62 deletions(-)
 create mode 100644 .github/configs/ascend.yml
 create mode 100644 .github/configs/cuda.yml
 create mode 100644 .github/configs/metax.yml
 create mode 100644 .github/configs/template.yml
 create mode 100644 .github/workflows/all_tests_ascend.yml
 create mode 100644 .github/workflows/all_tests_common.yml
 create mode 100644 .github/workflows/all_tests_cuda.yml
 create mode 100644 .github/workflows/all_tests_metax.yml
 create mode 100644 .github/workflows/functional_tests_common.yml
 create mode 100644 .github/workflows/unit_tests_common.yml

diff --git a/.github/configs/ascend.yml b/.github/configs/ascend.yml
new file mode 100644
index 0000000000..03fc5acaf5
--- /dev/null
+++ b/.github/configs/ascend.yml
@@ -0,0 +1,15 @@
+# Huawei Ascend NPU configuration
+image: ascend-infer:ubuntu18.04
+labels:
+  - npu
+  - ascend
+docker_options: |
+  --device /dev/davinci0
+  --device /dev/davinci1
+  --device /dev/davinci2
+  --device /dev/davinci3
+  --device /dev/davinci_manager
+  --device /dev/devmm_svm
+  --device /dev/hisi_hdc
+  --volume /usr/local/Ascend/driver:/usr/local/Ascend/driver
+  --volume /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons
\ No newline at end of file
diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
new file mode 100644
index 0000000000..36373513de
--- /dev/null
+++ b/.github/configs/cuda.yml
@@ -0,0 +1,65 @@
+# CUDA Hardware Configuration for TransformerEngine-FL
+# Refactored for BAAI DGX A100 Nodes
+# This file defines environment variables, volumes, and test filters for TE tests.
+
+hardware_name: cuda
+display_name: 'NVIDIA CUDA (A100)'
+
+ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+
+# Runner labels for self-hosted A100 node
+runner_labels:
+  - self-hosted
+  - Linux
+  - X64
+  - nvidia
+  - gpu-8
+
+# Container volumes
+container_volumes:
+  - /home/flagscale_cicd/flask/static:/workspace/report
+  # - .:/opt/transformerengine
+  # - ./ci_logs:/logs
+  # - /home/flagscale_cicd/data:/opt/data
+
+# Container options
+container_options: >-
+  --privileged 
+  --gpus all 
+  --shm-size=500g 
+  --ipc=host 
+  --ulimit memlock=-1 
+  --ulimit stack=67108864 
+  --user root
+
+# Device types
+device_types:
+  - a100
+
+# Build environment variables (platform-specific)
+build_env:
+  TE_FL_SKIP_CUDA: '0'
+  SKIP_CUDA_BUILD: '0'
+  NVTE_WITH_CUDA: '1'
+  NVTE_WITH_MACA: '0'
+  TE_WITH_NCCL: '1'
+  NVTE_FRAMEWORK: pytorch
+  CUDA_HOME: /usr/local/cuda-12.8
+  NVCC: /usr/local/cuda-12.8/bin/nvcc
+
+# Test matrix configuration
+test_matrix:
+  l0_pytorch:
+    path: 'qa/L0_pytorch_unittest/test.sh'
+    ignored_tests:
+      - test_sanity_layernorm_mlp
+      - test_sanity_gpt
+      - test_sanity_bert
+      - test_sanity_T5
+      - test_sanity_amp_and_nvfuser
+      - test_sanity_drop_path
+      - test_layernorm_mlp_accuracy
+      - test_grouped_linear_accuracy
+      - test_gpt_accuracy
+      - test_basic_linear
+      - test_layer_norm
diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
new file mode 100644
index 0000000000..e937189a55
--- /dev/null
+++ b/.github/configs/metax.yml
@@ -0,0 +1,68 @@
+# Metax Hardware Configuration for TE-FL
+# This file defines CI/CD settings for Metax-based testing
+# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
+
+hardware_name: metax
+display_name: 'Metax Tests'
+
+ci_image: localhost:5000/megatron-lm-with-te:v1
+
+runner_labels:
+  - self-hosted
+  - Linux
+  - X64
+  - metax
+  - dev
+
+container_volumes:
+  - /nfs/metax_fs:/nfs/metax_fs
+  - /dev/dri:/dev/dri
+  - /dev/mxcd:/dev/mxcd
+  - /dev/infiniband:/dev/infiniband
+
+container_options: >-
+  --uts=host
+  --ipc=host
+  --privileged=true
+  --group-add video
+  --shm-size=100gb
+  --ulimit memlock=-1
+  --security-opt seccomp=unconfined
+  --security-opt apparmor=unconfined
+  --device=/dev/dri
+  --device=/dev/mxcd
+  --device=/dev/infiniband
+  --user root
+  --ulimit nofile=65535:65535
+  -e PLATFORM=metax
+  -e TORCH_DISTRIBUTED_BACKEND=mccl
+  -e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH
+
+build_env:
+  TE_FL_SKIP_CUDA: '1'
+  NVTE_WITH_MACA: '1'
+  CUDA_HOME: /opt/maca
+  MACA_HOME: /opt/maca
+
+# Device types to run tests on
+device_types:
+  - c500
+
+# Test matrix configuration
+test_matrix:
+  unit:
+    devices:
+      - c500
+    # Ignored test files for unit tests
+    # These files will be skipped when running pytest
+    ignored_tests:
+      # example: tests/unit_tests/test_example.py
+      # - tests/unit_tests/test_inference.py
+      # - tests/unit_tests/test_rl_utils.py
+
+  # functional:
+  #   train:
+  #     - device: c500
+  #       task: train
+  #       model: deepseek
+  #       case: tp2_pp2_ep2
diff --git a/.github/configs/template.yml b/.github/configs/template.yml
new file mode 100644
index 0000000000..c7ec56b3e9
--- /dev/null
+++ b/.github/configs/template.yml
@@ -0,0 +1,16 @@
+# Configuration Template
+# This file describes the structure for hardware-specific configurations.
+#
+# Fields:
+# - image: Docker image to use for the runner
+# - labels: List of labels for the runner
+# - docker_options: Additional Docker options for mounting devices, volumes, etc.
+#
+# Example:
+# image: <docker_image>
+# labels:
+#   - <label1>
+#   - <label2>
+# docker_options: |
+#   --option1 value1
+#   --option2 value2
\ No newline at end of file
diff --git a/.github/workflows/all_tests_ascend.yml b/.github/workflows/all_tests_ascend.yml
new file mode 100644
index 0000000000..04e8f3cba0
--- /dev/null
+++ b/.github/workflows/all_tests_ascend.yml
@@ -0,0 +1,32 @@
+name: ascend_tests
+
+on:
+  # push:
+  #   branches: ["main"]
+  # pull_request:
+  #   branches: ["main"]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run_tests:
+    # Package manager and environment settings are read from .github/configs/ascend.yml
+    uses: ./.github/workflows/all_tests_common.yml
+    with:
+      platform: ascend
+
+  all_tests:
+    needs: run_tests
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Verify workflow status
+        run: |
+          if [ "${{ needs.run_tests.result }}" != "success" ]; then
+            echo "❌ Tests workflow failed"
+            exit 1
+          fi
+          echo "✅ All tests passed!"
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
new file mode 100644
index 0000000000..86a85a2d6a
--- /dev/null
+++ b/.github/workflows/all_tests_common.yml
@@ -0,0 +1,150 @@
+name: Common All Tests
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+        description: Platform name (e.g., cuda, default)
+      setup_commands:
+        required: false
+        type: string
+        default: ''
+
+jobs:
+  checkout_and_config:
+    defaults:
+      run:
+        shell: bash
+    runs-on: ubuntu-latest
+    outputs:
+      ci_image: ${{ steps.config.outputs.ci_image }}
+      runs_on: ${{ steps.config.outputs.runs_on }}
+      container_volumes: ${{ steps.config.outputs.container_volumes }}
+      container_options: ${{ steps.config.outputs.container_options }}
+      device_types: ${{ steps.config.outputs.device_types }}
+      train_test_matrix: ${{ steps.config.outputs.train_test_matrix }}
+      ignored_tests: ${{ steps.config.outputs.ignored_tests }}
+      build_env: ${{ steps.config.outputs.build_env }}
+    steps:
+      - name: Checkout source code
+        uses: actions/checkout@v4
+
+      - name: Check if tests should run
+        id: should_run
+        run: |
+         
+          echo "should_run=true" >> $GITHUB_OUTPUT
+
+      - name: Load platform configuration
+        id: config
+        run: |
+          set -euo pipefail
+
+          PLATFORM="${{ inputs.platform }}"
+          CONFIG_FILE=".github/configs/${PLATFORM}.yml"
+
+          # Install mikefarah/yq (v4) for YAML parsing
+          sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.45.1/yq_linux_amd64
+          sudo chmod +x /usr/local/bin/yq
+          /usr/local/bin/yq --version
+          echo "Loading configuration from $CONFIG_FILE"
+
+          # Read CI image
+          CI_IMAGE=$(yq '.ci_image' "$CONFIG_FILE")
+          echo "ci_image=$CI_IMAGE" >> $GITHUB_OUTPUT
+
+          # Read runner labels and format as JSON array
+          RUNS_ON=$(yq '.runner_labels | tojson(0)' "$CONFIG_FILE")
+          echo "runs_on=$RUNS_ON" >> $GITHUB_OUTPUT
+
+          # Read container volumes and format as JSON array
+          VOLUMES=$(yq '.container_volumes | tojson(0)' "$CONFIG_FILE")
+          echo "container_volumes=$VOLUMES" >> $GITHUB_OUTPUT
+
+          # Read container options
+          OPTIONS=$(yq '.container_options' "$CONFIG_FILE")
+          echo "container_options=$OPTIONS" >> $GITHUB_OUTPUT
+
+          # Read device types
+          DEVICE_TYPES=$(yq '.device_types | tojson(0)' "$CONFIG_FILE")
+          echo "device_types=$DEVICE_TYPES" >> $GITHUB_OUTPUT
+
+          # Read test matrix for training
+          TRAIN_MATRIX=$(yq '.test_matrix.functional.train | tojson(0)' "$CONFIG_FILE")
+          echo "train_test_matrix=$TRAIN_MATRIX" >> $GITHUB_OUTPUT
+
+          # Read ignored tests list from test_matrix.unit (default to empty array if not defined)
+          IGNORED_TESTS=$(yq '.test_matrix.unit.ignored_tests // [] | tojson(0)' "$CONFIG_FILE")
+          echo "ignored_tests=$IGNORED_TESTS" >> $GITHUB_OUTPUT
+
+          # Read build environment variables (default to empty object if not defined)
+          BUILD_ENV=$(yq '.build_env // {} | tojson(0)' "$CONFIG_FILE")
+          echo "build_env=$BUILD_ENV" >> $GITHUB_OUTPUT
+
+  unit_tests:
+    needs: checkout_and_config
+    strategy:
+      fail-fast: false
+      matrix:
+        device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
+    uses: ./.github/workflows/unit_tests_common.yml
+    name: unit_tests
+    with:
+      setup_commands: ${{ inputs.setup_commands }}
+      platform: ${{ inputs.platform }}
+      device: ${{ matrix.device }}
+      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      ignored_tests: ${{ needs.checkout_and_config.outputs.ignored_tests }}
+      build_env: ${{ needs.checkout_and_config.outputs.build_env }}
+
+  # arguments.py not compatible with megatron-core-fl
+  # functional_tests:
+  #   needs:
+  #     - checkout_and_config
+  #   if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null
+  #   uses: ./.github/workflows/functional_tests_common.yml
+  #   with:
+  #     platform: ${{ inputs.platform }}
+  #     test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }}
+  #     image: ${{ needs.checkout_and_config.outputs.ci_image }}
+  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+
+
+  all_tests_complete:
+    defaults:
+      run:
+        shell: bash
+    needs:
+      - checkout_and_config
+      - unit_tests
+      # - functional_tests
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Verify all tests passed
+        run: |
+          # Check all test jobs (skip if not run)
+          failed=false
+
+          if [ "${{ needs.unit_tests.result }}" != "success" ]; then
+            echo "❌ Unit tests failed"
+            failed=true
+          fi
+
+          # if [ "${{ needs.functional_tests.result }}" != "success" ]; then
+          #   echo "❌ Training functional tests failed"
+          #   failed=true
+          # fi
+
+          if [ "$failed" = "true" ]; then
+            exit 1
+          fi
+
+          echo "✅ All tests completed successfully!"
\ No newline at end of file
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
new file mode 100644
index 0000000000..b78ddf35bb
--- /dev/null
+++ b/.github/workflows/all_tests_cuda.yml
@@ -0,0 +1,32 @@
+name: cuda_tests
+
+on:
+  # push:
+  #   branches: ["main"]
+  # pull_request:
+  #   branches: ["main"]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run_tests:
+    # Package manager and environment settings are read from .github/configs/cuda.yml
+    uses: ./.github/workflows/all_tests_common.yml
+    with:
+      platform: cuda
+
+  all_tests:
+    needs: run_tests
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Verify workflow status
+        run: |
+          if [ "${{ needs.run_tests.result }}" != "success" ]; then
+            echo "❌ Tests workflow failed"
+            exit 1
+          fi
+          echo "✅ All tests passed!"
diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml
new file mode 100644
index 0000000000..d3e496c4b2
--- /dev/null
+++ b/.github/workflows/all_tests_metax.yml
@@ -0,0 +1,37 @@
+name: metax_tests
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run_tests:
+    uses: ./.github/workflows/all_tests_common.yml
+    with:
+      platform: metax
+      # Metax Environment Setup
+      setup_commands: |
+        export PATH=/opt/conda/bin:$PATH        
+        export LD_LIBRARY_PATH=/usr/local/maca/lib:/opt/maca/lib:$LD_LIBRARY_PATH        
+        which python3
+        python3 -m pip --version
+
+  all_tests:
+    needs: run_tests
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Verify workflow status
+        run: |
+          if [ "${{ needs.run_tests.result }}" != "success" ]; then
+            echo "❌ Metax Tests workflow failed"
+            exit 1
+          fi
+          echo "✅ All Metax tests passed!"
\ No newline at end of file
diff --git a/.github/workflows/functional_tests_common.yml b/.github/workflows/functional_tests_common.yml
new file mode 100644
index 0000000000..aa6b734778
--- /dev/null
+++ b/.github/workflows/functional_tests_common.yml
@@ -0,0 +1,190 @@
+# Disabled for compatibility issues
+name: Common Functional Tests - Training
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+        description: Platform name (e.g., cuda, default)
+      test_matrix:
+        required: true
+        type: string
+        description: JSON array of test configurations
+      image:
+        required: true
+        type: string
+      runs_on:
+        required: true
+        type: string
+      container_volumes:
+        required: true
+        type: string
+      container_options:
+        required: true
+        type: string
+
+jobs:
+  functional_test_train:
+    defaults:
+      run:
+        shell: bash
+    env:
+      PROJECT_ROOT: ${{ github.workspace }}
+    runs-on: ${{ fromJson(inputs.runs_on) }}
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config: ${{ fromJson(inputs.test_matrix) }}
+    container:
+      image: ${{ inputs.image }}
+      ports:
+        - 80
+      volumes: ${{ fromJson(inputs.container_volumes) }}
+      options: ${{ inputs.container_options }}
+
+    steps:
+      - name: Checkout source code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      
+      # - name: Set safe directory
+      #   run: |
+      #     git config --global --add safe.directory $PROJECT_ROOT
+      ## The above step is commented out because there is no git cli in the container, and it causes the step to fail. The safe directory is set in the next step with a conditional check.
+      - name: Set safe directory
+        run: |
+          command -v git && git config --global --add safe.directory $PROJECT_ROOT || true
+
+      - name: Activate Python environment
+        run: |
+          source /opt/conda/etc/profile.d/conda.sh
+          conda activate base   
+          echo "PATH=$PATH" >> $GITHUB_ENV
+    
+      - name: Setup Python environment
+        env:
+          NVTE_WITH_MACA: '1'
+          NVTE_WITH_CUDA: '0'
+          NVCC: /opt/maca/bin/mcc
+          CUDA_HOME: /opt/maca
+          
+          PATH: /opt/maca/bin:${{ env.PATH }}
+          LD_LIBRARY_PATH: /opt/maca/lib:${{ env.LD_LIBRARY_PATH }}
+        run: |
+          set -euo pipefail
+          cd $PROJECT_ROOT
+          pip install -e . --no-deps --no-build-isolation
+        timeout-minutes: 60
+
+      - name: L0 Pytorch Wheel
+        id: L0_pytoech_wheel
+        # timeout-minutes: 50
+        env:
+          TE_PATH: .
+          RUN_LOG: /logs/pytorch/wheel
+        run: |
+          echo "TE_PATH: ${TE_PATH}"
+          sed -i "s/^cd transformer_engine\/pytorch\s*$/pushd transformer_engine\/pytorch/" qa/L0_pytorch_wheel/test.sh
+          sed -i '44 s/^cd \s*\$TE_PATH\s*$/popd/' qa/L0_pytorch_wheel/test.sh
+
+          cat qa/L0_pytorch_wheel/test.sh
+          # source /opt/miniconda3/etc/profile.d/conda.sh
+          # conda activate flagscale-train
+          pip uninstall -y transformer_engine
+
+          set -euo pipefail
+          cd $PROJECT_ROOT
+
+          PLATFORM='${{ inputs.platform }}'
+          DEVICE='${{ matrix.test_config.device }}'
+          TASK='${{ matrix.test_config.task }}'
+          MODEL='${{ matrix.test_config.model }}'
+          CASE='${{ matrix.test_config.case }}'
+
+          echo "Running functional tests for training"
+          echo "Platform: $PLATFORM"
+          echo "Device: $DEVICE"
+          echo "Task: $TASK"
+          echo "Model: $MODEL"
+          echo "Case: ${CASE:-all}"
+
+          # Set environment variables
+          export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
+          
+          set +e
+          bash qa/L0_pytorch_wheel/test.sh | tee ${RUN_LOG}/pytorch_wheel-${{ github.run_id }}.log
+          exit_code=$?
+          set -e
+
+          if [ $exit_code -eq 0 ]; then
+            echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
+          else
+            echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
+          fi
+
+          echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
+          exit $exit_code
+
+      - name: Upload Installation Logs
+        if: always() && steps.L0_pytoech_wheel.outcome == 'failure'
+        uses: actions/upload-artifact@v4
+        with:
+          name: L0-pytorch-logs-${{ github.run_id }}
+          path: /logs/pytorch/wheel
+          retention-days: 7
+          if-no-files-found: warn
+
+      # - name: Run functional tests
+      #   id: functional_test
+      #   run: |
+      #     set -euo pipefail
+      #     cd $PROJECT_ROOT
+
+      #     PLATFORM='${{ inputs.platform }}'
+      #     DEVICE='${{ matrix.test_config.device }}'
+      #     TASK='${{ matrix.test_config.task }}'
+      #     MODEL='${{ matrix.test_config.model }}'
+      #     CASE='${{ matrix.test_config.case }}'
+
+      #     echo "Running functional tests for training"
+      #     echo "Platform: $PLATFORM"
+      #     echo "Device: $DEVICE"
+      #     echo "Task: $TASK"
+      #     echo "Model: $MODEL"
+      #     echo "Case: ${CASE:-all}"
+
+      #     # Set environment variables
+      #     export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
+
+      #     # Run functional tests via run_tests.sh with explicit platform/device/task/model/case
+      #     set +e
+      #     bash "$PROJECT_ROOT/tests/test_utils/runners/run_tests.sh" \
+      #       --platform "$PLATFORM" \
+      #       --device "$DEVICE" \
+      #       --type functional \
+      #       --task "$TASK" \
+      #       --model "$MODEL" \
+      #       --list "$CASE"
+      #     exit_code=$?
+      #     set -e
+
+      #     if [ $exit_code -eq 0 ]; then
+      #       echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
+      #     else
+      #       echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
+      #     fi
+
+      #     echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
+      #     exit $exit_code
+      #   timeout-minutes: 60
+
+      # - name: Debug - keep container alive on failure
+      #   if: failure()
+      #   run: |
+      #     echo "Container sleeping for 60 minutes for debugging..."
+      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
+      #     sleep 3600
+      #   timeout-minutes: 60
\ No newline at end of file
diff --git a/.github/workflows/license.yml b/.github/workflows/license.yml
index 3a2be6b1be..5a93e92b94 100644
--- a/.github/workflows/license.yml
+++ b/.github/workflows/license.yml
@@ -5,7 +5,8 @@
 # A workflow to trigger the TE license check on GitHub
 name: 'License'
 on:
-  pull_request: [__disabled_do_not_remove__]
+  pull_request: 
+    branches: [ "__disabled_do_not_remove__" ]
   workflow_dispatch:
 jobs:
   check:
diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
index 0ef8622c8a..52299cf411 100644
--- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
+++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
@@ -16,6 +16,8 @@ on:
       - 'transformer_engine/**'
       - 'tests/pytorch/**'
 
+  workflow_dispatch:
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
   cancel-in-progress: true
diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
index d0d15d7cf8..51f071aa3b 100644
--- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
+++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
@@ -26,6 +26,8 @@ on:
       - 'tests/pytorch/attention/**'
       - 'qa/L1_pytorch_onnx_unittest/**'
       - 'tests/pytorch/test_onnx_export.py'
+      
+  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
new file mode 100644
index 0000000000..6bfe8fd311
--- /dev/null
+++ b/.github/workflows/unit_tests_common.yml
@@ -0,0 +1,334 @@
+name: Common Unit Tests
+
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+      device:
+        required: true
+        type: string
+      image:
+        required: true
+        type: string
+      runs_on:
+        required: true
+        type: string
+      container_volumes:
+        required: true
+        type: string
+      container_options:
+        required: true
+        type: string
+      ignored_tests:
+        required: false
+        type: string
+        default: ''
+      # New input for hardware-specific initialization (e.g., conda activate)
+      setup_commands:
+        required: false
+        type: string
+        default: ''
+      # Platform-specific build environment variables (JSON object from config)
+      build_env:
+        required: false
+        type: string
+        default: '{}'
+      # Whether to upload coverage report
+      upload_coverage:
+        description: "Whether to upload coverage report"
+        required: false
+        type: boolean
+        default: true
+
+jobs:
+  # 1. Change Detection
+  detect_changes:
+    runs-on: ubuntu-latest
+    outputs:
+      core: ${{ steps.filter.outputs.core }}
+      qa_l0: ${{ steps.filter.outputs.qa_l0 }}
+    steps:
+      - name: Checkout source code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Detect changed paths
+        id: filter
+        run: |
+          set -euo pipefail
+          BASE_REF="${{ github.event_name == 'pull_request' && format('origin/{0}', github.base_ref) || 'HEAD~1' }}"
+          [ "${{ github.event_name }}" == "pull_request" ] && git fetch origin ${{ github.base_ref }} --depth=1
+          
+          CHANGED_FILES=$(git diff --name-only $BASE_REF...HEAD 2>/dev/null || git diff --name-only $BASE_REF HEAD)
+
+          echo "core=$(echo "$CHANGED_FILES" | grep -qE "^tests/unit_tests/|^megatron/core/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
+          echo "qa_l0=$(echo "$CHANGED_FILES" | grep -qE "^qa/L0_|^transformer_engine/|^tests/pytorch/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
+  
+  # 2. Unified Test Execution
+  unit_test:
+    needs: detect_changes
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ fromJson(inputs.runs_on) }}
+    strategy:
+      fail-fast: false
+      matrix:
+        test_group:
+          - name: pytorch_lint
+            path: "qa/L0_pytorch_lint/test.sh"
+            test_type: "lint"
+          - name: pytorch_debug
+            path: "qa/L0_pytorch_debug_unittest/test.sh"
+            test_type: "debug"
+          - name: pytorch_unittest
+            path: "qa/L0_pytorch_unittest/test.sh"
+            test_type: "unittest"
+
+    name: unit-${{ inputs.device }}-${{ matrix.test_group.name }}
+    container:
+      image: ${{ inputs.image }}
+      volumes: ${{ fromJson(inputs.container_volumes) }}
+      options: --pull never ${{ inputs.container_options }}
+
+    steps:
+      - name: Check if tests should run
+        id: should_run
+        run: |
+          echo "should_run=true" >> $GITHUB_OUTPUT
+          GROUP='${{ matrix.test_group.name }}'
+          # Force run if 'full ci' label exists
+          if [ "${{ contains(github.event.pull_request.labels.*.name, 'full ci') }}" == "true" ]; then 
+            echo "should_run=true" >> $GITHUB_OUTPUT; exit 0
+          fi
+
+          if [[ "$GROUP" == "pytorch_"* ]]; then
+            CHANGED='${{ needs.detect_changes.outputs.qa_l0 }}'
+          else
+            CHANGED='${{ needs.detect_changes.outputs.core }}'
+          fi
+          
+          # For debugging, you can force this to true
+          echo "should_run=true" >> $GITHUB_OUTPUT
+
+      - name: Checkout Source Code
+        if: steps.should_run.outputs.should_run == 'true'
+        uses: actions/checkout@v4
+        with:
+          set-safe-directory: true
+
+      # - name: Activate Python environment
+      #   run: |
+      #     if [[ "$PLATFORM" == "cuda" ]] && [ -f /opt/miniconda3/etc/profile.d/conda.sh ]; then
+      #       source /opt/miniconda3/etc/profile.d/conda.sh
+      #       conda activate flagscale-train
+      #     elif [ -f /opt/conda/etc/profile.d/conda.sh ]; then
+      #       source /opt/conda/etc/profile.d/conda.sh
+      #       conda activate base
+      #     fi
+      #     echo "PATH=$PATH" >> $GITHUB_ENV
+      #     echo "Python: $(which python3) ($(python3 --version 2>&1))"
+      
+      - name: Environment Setup on Cuda
+        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        run: |
+          set -euo pipefail
+
+          echo "===== Step 0: Activate Python environment ====="
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+          echo "PATH=$PATH" >> $GITHUB_ENV
+          echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+          echo "===== Step 1: Remove Existing TransformerEngine ====="
+          pip uninstall transformer_engine transformer_engine_torch -y || true
+
+          echo "===== Step 2: Build & Install TransformerEngine ====="
+          cd $GITHUB_WORKSPACE
+          pip install nvdlfw-inspect --no-deps
+          pip install --no-build-isolation . -v --no-deps
+
+          echo "===== Step 3: Verify Installation ====="
+          python3 tests/pytorch/test_sanity_import.py
+
+          echo "===== Environment Setup Complete ===== "
+
+      - name: Environment Setup on Metax
+        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
+        run: |
+          set -euo pipefail
+
+          echo "===== Step 0: Activate Python environment ====="
+          source /opt/conda/etc/profile.d/conda.sh
+          conda activate base
+          echo "PATH=$PATH" >> $GITHUB_ENV
+          echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+          echo "===== Step 1: Base Environment Setup ====="
+          # Configure MACA toolchain paths
+          export PATH=/opt/maca/bin:$PATH
+          export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
+          service ssh restart
+
+          echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
+          # TransformerEngine expects nvcc, but MACA provides cucc
+          ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
+          which nvcc || true
+
+          echo "===== Step 3: Install Required System Tools ====="
+          # Install essential build tools (avoid modifying Python dependencies)
+          apt-get update -qq && apt-get install -y -qq git cmake ninja-build curl
+
+          echo "===== Step 4: Remove Existing TransformerEngine ====="
+          # Prevent conflicts with preinstalled or incompatible versions
+          python3 -m pip uninstall transformer_engine -y || true
+          python3 -m pip install nvdlfw-inspect --no-deps || true
+
+          # echo "===== Step 5: Install Metax Binary Backend ====="
+          # # Install prebuilt Metax backend (required for MACA operators)
+          # WHL_PATH="/home/muxiuser/transformer_engine_metax-2.9.0-cp312-cp312-linux_x86_64.whl"
+          # if [ ! -f "$WHL_PATH" ]; then
+          #   echo "ERROR: Wheel file not found at $WHL_PATH"
+          #   echo "Please verify volume mount: -v /home/muxiuser:/home/muxiuser"
+          #   exit 1
+          # fi
+
+          # # Use --no-deps to avoid overwriting Metax-optimized PyTorch
+          # python3 -m pip install "$WHL_PATH" --no-deps --force-reinstall
+
+          # echo "===== Step 6: Verify Metax Backend ====="
+          # # Ensure transformer_engine_torch is correctly loaded
+          # python3 - <<'EOF'
+          # import transformer_engine_torch as te
+          # print("Backend loaded successfully:", te)
+          # EOF
+
+          echo "===== Step 7: Install TE-FL Plugin Layer ====="
+          # Install TransformerEngine-FL Python layer (plugin logic)
+          # cd /workspace/TransformerEngine-FL
+          cd $GITHUB_WORKSPACE
+          TE_FL_SKIP_CUDA=1 python3 setup.py install
+
+          echo "===== Step 8: Final Verification ====="
+          # Verify both TE Python API and backend are functional
+          python3 - <<'EOF'
+          import transformer_engine
+          import transformer_engine_torch as te
+          print("transformer_engine:", transformer_engine)
+          print("transformer_engine_torch:", te)
+          EOF
+
+          echo "===== Environment Setup Complete ===== "
+      
+      - name: Execute Tests
+        if: steps.should_run.outputs.should_run == 'true'
+        working-directory: ${{ github.workspace }}
+        run: |
+          set -euo pipefail
+          ${{ inputs.setup_commands }}
+
+          # Load platform-specific environment variables
+          while IFS='=' read -r key value; do
+            [ -n "$key" ] && export "$key=$value"
+          done < <(echo '${{ inputs.build_env }}' | python3 -c "
+          import json, sys
+          env = json.load(sys.stdin)
+          for k, v in env.items():
+              print(f'{k}={v}')
+          ")
+
+          export TE_PATH=$GITHUB_WORKSPACE
+          export TE_LIB_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])")
+          export PYTHONPATH=$GITHUB_WORKSPACE:${PYTHONPATH:-}
+          export PATH=${CUDA_HOME:-/usr/local/cuda}/bin:$PATH
+          export LD_LIBRARY_PATH=${CUDA_HOME:-/usr/local/cuda}/lib:${LD_LIBRARY_PATH:-}
+          
+          # check envs before running tests
+          echo "TE_PATH=$TE_PATH"
+          echo "TE_LIB_PATH=$TE_LIB_PATH"
+          echo "PYTHONPATH=$PYTHONPATH"
+          echo "PATH=$PATH"
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+          
+          # Ensure log directory exists regardless of volume mount state
+          mkdir -p /logs
+
+          # Enable coverage collection for all pytest invocations in test.sh
+          # PYTEST_ADDOPTS is automatically appended to every pytest call
+          if [ "${{ inputs.upload_coverage }}" = "true" ]; then
+            pip3 install pytest-cov 2>/dev/null || true
+            export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
+          fi
+
+          if [[ "${{ matrix.test_group.name }}" == *"lint"* ]]; then
+            export CPP_ONLY=0
+            export PYTHON_ONLY=0
+          elif [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
+            # Fail fast on backend/API mismatch before running the full test group.
+            # Skip for debug group (does not use FP8/optimizer symbols).
+            python3 -c "import sys, importlib; import transformer_engine.common as _te_common; tex = importlib.import_module('transformer_engine_torch'); required=['multi_tensor_scale','multi_tensor_compute_scale_and_scale_inv']; missing=[n for n in required if not hasattr(tex, n)]; print('[TE check] module:', tex); print('[TE check] file:', getattr(tex, '__file__', 'N/A')); print('[TE check] missing:', ', '.join(missing) if missing else 'none'); sys.exit(1 if missing else 0)"
+          fi
+
+          bash ${{ matrix.test_group.path }}
+        timeout-minutes: 60
+
+      - name: Generate Coverage Report
+        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
+        working-directory: ${{ github.workspace }}
+        env:
+          PLATFORM: ${{ inputs.platform }}
+          DEVICE: ${{ inputs.device }}
+        run: |
+          # Install coverage (may already be present)
+          pip3 install coverage pytest-cov 2>/dev/null || true
+
+          # Merge all .coverage* files produced by sub-processes (torchrun spawns workers)
+          python3 -m coverage combine --keep 2>/dev/null || true
+          # Generate JSON coverage report (requires .coverage data from pytest --cov)
+          python3 -m coverage json -o "coverage-${PLATFORM}-${DEVICE}.json" \
+            --include="transformer_engine/*" 2>/dev/null || echo "WARNING: No coverage data found, skipping coverage-${PLATFORM}-${DEVICE}.json"
+        continue-on-error: true
+
+      - name: Upload Coverage Report
+        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
+        uses: actions/upload-artifact@v4
+        continue-on-error: true
+        with:
+          name: coverage-${{ inputs.platform }}-${{ inputs.device }}
+          path: |
+            coverage-${{ inputs.platform }}-${{ inputs.device }}.json
+
+      - name: Check FlagCICD Reachability
+        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
+        id: check_flagcicd
+        continue-on-error: true
+        run: |
+          if curl -sf --max-time 3 --connect-timeout 2 \
+               "http://flagcicd-inner.flagos.net:8000/" -o /dev/null 2>/dev/null; then
+            echo "reachable=true" >> $GITHUB_OUTPUT
+          else
+            echo "reachable=false" >> $GITHUB_OUTPUT
+            echo "INFO: flagcicd-inner.flagos.net unreachable from this runner, skipping report upload"
+          fi
+
+      - name: Upload Coverage Report to FlagCICD
+        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest' && steps.check_flagcicd.outputs.reachable == 'true'
+        uses: flagos-ai/FlagOps/actions/post-pytest-report@v2
+        continue-on-error: true
+        with:
+          backend_url: 'http://flagcicd-inner.flagos.net:8000/metrics/'
+          user_id: '000000000000000000'
+          report_path: 'coverage-${{ inputs.platform }}-${{ inputs.device }}.json'
+          fail_on_error: 'false'
+
+      # - name: Debug - keep container alive on failure
+      #   if: failure()
+      #   run: |
+      #     echo "Container sleeping for 200 minutes for debugging..."
+      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
+      #     sleep 60000
+      #   timeout-minutes: 200
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 1a9a04d72d..8ef9585fd3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,4 +41,6 @@ compile_commands.json
 tensor_dumps/
 artifacts/
 # Auto-generated build configuration (specific to each environment)
-transformer_engine/plugin/core/_build_config.py
\ No newline at end of file
+transformer_engine/plugin/core/_build_config.py
+# Mac OS
+.DS_Store
\ No newline at end of file
diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend
index 0b1577c8c8..f0c638223e 160000
--- a/3rdparty/cudnn-frontend
+++ b/3rdparty/cudnn-frontend
@@ -1 +1 @@
-Subproject commit 0b1577c8c83401237d601d0d0db5210506705396
+Subproject commit f0c638223eac20a9676941a110c9ad9e9842941d
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 57e3cfb47a..73c59c055c 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 57e3cfb47a2d9e0d46eb6335c3dc411498efa198
+Subproject commit 73c59c055c0fec87792470dbf33325158113db5e
diff --git a/3rdparty/googletest b/3rdparty/googletest
index f8d7d77c06..a35bc7693c 160000
--- a/3rdparty/googletest
+++ b/3rdparty/googletest
@@ -1 +1 @@
-Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571
+Subproject commit a35bc7693c117a048152beeb34f6aac354b9423f
diff --git a/SECURITY.md b/SECURITY.md
index 35edb61b01..7a6de0d126 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -20,5 +20,5 @@ To report a potential security vulnerability in any NVIDIA product:
 While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information.
 
 ## NVIDIA Product Security
-
+## test
 For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index 18199258c1..acbb440e70 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -20,18 +20,68 @@ FAIL=0
 # because it is not available on PyPI.
 pip uninstall -y nvdlfw-inspect
 pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
-
 pip install pytest==8.2.1
-pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity.xml $TE_PATH/tests/pytorch/debug/test_sanity.py  --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
-pytest -v -s --junitxml=$XML_LOG_DIR/test_config.xml $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
-pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics.xml $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
-pytest -v -s --junitxml=$XML_LOG_DIR/test_log.xml $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
-NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py -k "not (test_per_tensor_scaling or test_fake_quant or test_statistics_collection or test_statistics_multi_run)" --no-header --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
-pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
 
+run_test_step() {
+    local xml_file=$1
+    local test_path=$2
+    local cmd=$3
+
+
+    if [ "$PLATFORM" = "metax" ]; then
+        case "$test_path" in
+            *"test_numerics.py" | *"test_api_features.py" | *"test_sanity.py")
+                echo "-------------------------------------------------------"
+                echo "[SKIP] Platform MetaX: Ignoring $test_path"
+                echo "-------------------------------------------------------"
+                return 0
+                ;;
+        esac
+    fi
+
+
+    echo "-------------------------------------------------------"
+    echo "[RUN] Executing: $test_path"
+    eval "$cmd" || FAIL=1
+}
+
+
+
+# Step 1: Sanity
+run_test_step "test_sanity.xml" "$TE_PATH/tests/pytorch/debug/test_sanity.py" \
+"pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity.xml $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS"
+
+# Step 2: Config
+run_test_step "test_config.xml" "$TE_PATH/tests/pytorch/debug/test_config.py" \
+"pytest -v -s --junitxml=$XML_LOG_DIR/test_config.xml $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS"
+
+# Step 3: Numerics
+run_test_step "test_numerics.xml" "$TE_PATH/tests/pytorch/debug/test_numerics.py" \
+"pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics.xml $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS"
+
+# Step 4: Log
+run_test_step "test_log.xml" "$TE_PATH/tests/pytorch/debug/test_log.py" \
+"pytest -v -s --junitxml=$XML_LOG_DIR/test_log.xml $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR"
+
+# Step 5: API Features
+run_test_step "test_api_features.xml" "$TE_PATH/tests/pytorch/debug/test_api_features.py" \
+"NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py -k \"not (test_per_tensor_scaling or test_fake_quant or test_statistics_collection or test_statistics_multi_run)\" --no-header --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR"
+
+# Step 6: Performance
+run_test_step "test_perf.xml" "$TE_PATH/tests/pytorch/debug/test_perf.py" \
+"pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR"
+
+
+
+
+# Step 7: Sanity 2
+run_test_step "test_sanity_2.xml" "$TE_PATH/tests/pytorch/test_sanity.py" \
+"NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 \
+pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py -k \"not (test_sanity_grouped_linear or test_inference_mode)\" --no-header"
 
-# standard sanity and numerics tests with initialized debug
-NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py  -k "not (test_sanity_grouped_linear or test_inference_mode)" --no-header || FAIL=1
-NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py -k "not (test_linear_accuracy or test_layernorm_linear_accuracy or test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_transformer_layer_hidden_states_format or test_grouped_gemm)" --no-header || FAIL=1
+# Step 8: Numerics 2
+run_test_step "test_numerics_2.xml" "$TE_PATH/tests/pytorch/test_numerics.py" \
+"NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 \
+pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py -k \"not (test_linear_accuracy or test_layernorm_linear_accuracy or test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_transformer_layer_hidden_states_format or test_grouped_gemm)\" --no-header"
 
 exit $FAIL
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index 9c5d9ac86f..99a1370ac4 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -1,57 +1,132 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
+#!/bin/bash
 
-function error_exit() {
-    echo "Error: $1"
-    exit 1
-}
 
-function test_fail() {
-    RET=1
-    FAILED_CASES="$FAILED_CASES $1"
+: ${TE_PATH:=/opt/transformerengine}
+: ${XML_LOG_DIR:=/logs}
+mkdir -p "$XML_LOG_DIR"
+
+pip install pytest==8.2.1
+FAIL=0
+
+IS_CUDA_BACKEND=$(python3 -c "import torch; print('cuda' if torch.cuda.is_available() else 'cpu')" 2>/dev/null)
+
+test_fail() {
+    FAIL=1
     echo "Error: sub-test failed: $1"
 }
 
-RET=0
-FAILED_CASES=""
 
-set -x
+run_test_step() {
+    local xml_file=$1
+    local test_path=$2
+    local cmd=$3
+    local label=$4
+
+
+    if [ "$PLATFORM" = "metax" ]; then
+        case "$test_path" in
+            *"test_numerics.py" | \
+            *"test_sanity.py" | \
+            *"test_parallel_cross_entropy.py" | \
+            *"test_cuda_graphs.py" | \
+            *"test_fused_rope.py" | \
+            *"test_gqa.py" | \
+            *"test_fused_optimizer.py" | \
+            *"test_multi_tensor.py" | \
+            *"test_cpu_offloading.py" | \
+            *"test_attention.py" | \
+            *"test_kv_cache.py" | \
+            *"test_checkpoint.py" | \
+            *"test_fused_router.py")
+                echo "-------------------------------------------------------"
+                echo "[SKIP] Platform MetaX: Ignoring $label"
+                echo "-------------------------------------------------------"
+                return 0
+                ;;
+        esac
+    fi
+
+    if [[ "$IS_CUDA_BACKEND" == *"cuda"* ]]; then
+        if [[ "$test_path" == *"test_checkpoint.py" || "$test_path" == *"test_cpu_offloading.py" || "$test_path" == *"test_attention.py" ]]; then
+            echo "-------------------------------------------------------"
+            echo "[SKIP] CUDA Backend detected: Ignoring $label"
+            echo "-------------------------------------------------------"
+            return 0
+        fi
+    fi
+
+
+    echo "-------------------------------------------------------"
+    echo "[RUN] Executing: $label"
+
+    eval "$cmd" || test_fail "$label"
+}
+
+
+# Step: Sanity
+run_test_step "pytest_test_sanity.xml" "$TE_PATH/tests/pytorch/test_sanity.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py -k \"not (test_sanity_layernorm_mlp or test_sanity_gpt or test_sanity_bert or test_sanity_T5 or test_sanity_amp_and_nvfuser or test_sanity_drop_path or test_sanity_fused_qkv_params or test_sanity_gradient_accumulation_fusion or test_inference_mode or test_sanity_normalization_amp or test_sanity_layernorm_linear or test_sanity_linear_with_zero_tokens or test_sanity_grouped_linear)\" --no-header" "test_sanity.py"
+
+# Step: Recipe
+run_test_step "pytest_test_recipe.xml" "$TE_PATH/tests/pytorch/test_recipe.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py" "test_recipe.py"
+
+# Step: Deferred Init
+run_test_step "pytest_test_deferred_init.xml" "$TE_PATH/tests/pytorch/test_deferred_init.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py" "test_deferred_init.py"
+
+# Step: Numerics
+run_test_step "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/test_numerics.py" \
+"PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py -k \"not (test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_gpt_cuda_graph or test_transformer_layer_hidden_states_format or test_grouped_gemm or test_noncontiguous or test_gpt_checkpointing or test_gpt_accuracy or test_mha_accuracy or test_linear_accuracy or test_linear_accuracy_delay_wgrad_compute or test_rmsnorm_accuracy or test_layernorm_accuracy or test_layernorm_linear_accuracy)\" --no-header" "test_numerics.py"
+
+# Step: CUDA Graphs
+run_test_step "pytest_test_cuda_graphs.xml" "$TE_PATH/tests/pytorch/test_cuda_graphs.py" \
+"PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py" "test_cuda_graphs.py"
+
+# Step: JIT
+run_test_step "pytest_test_jit.xml" "$TE_PATH/tests/pytorch/test_jit.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py -k \"not (test_torch_dynamo)\"" "test_jit.py"
+
+# Step: Fused Rope
+run_test_step "pytest_test_fused_rope.xml" "$TE_PATH/tests/pytorch/test_fused_rope.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py" "test_fused_rope.py"
+
+# Step: NVFP4 (Directory)
+run_test_step "pytest_test_nvfp4.xml" "$TE_PATH/tests/pytorch/nvfp4" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4" "test_nvfp4"
+
+# Step: Float8 Tensors
+run_test_step "pytest_test_float8tensor.xml" "$TE_PATH/tests/pytorch/test_float8tensor.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8tensor.xml $TE_PATH/tests/pytorch/test_float8tensor.py" "test_float8tensor.py"
+
+# Step: GQA
+run_test_step "pytest_test_gqa.xml" "$TE_PATH/tests/pytorch/test_gqa.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py" "test_gqa.py"
+
+# Step: Fused Optimizer
+run_test_step "pytest_test_fused_optimizer.xml" "$TE_PATH/tests/pytorch/test_fused_optimizer.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py" "test_fused_optimizer.py"
+
+# Step: Parallel Cross Entropy
+run_test_step "pytest_test_parallel_cross_entropy.xml" "$TE_PATH/tests/pytorch/test_parallel_cross_entropy.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py" "test_parallel_cross_entropy.py"
+
+# Step: CPU Offloading
+run_test_step "pytest_test_cpu_offloading.xml" "$TE_PATH/tests/pytorch/test_cpu_offloading.py" \
+"NVTE_FLASH_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py" "test_cpu_offloading.py"
+
+# Step: Attention
+run_test_step "pytest_test_attention.xml" "$TE_PATH/tests/pytorch/attention/test_attention.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py" "test_attention.py"
+
+# Step: Checkpoint
+run_test_step "pytest_test_checkpoint.xml" "$TE_PATH/tests/pytorch/test_checkpoint.py" \
+"NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py" "test_checkpoint.py"
 
-: ${TE_PATH:=/opt/transformerengine}
-: ${XML_LOG_DIR:=/logs}
-mkdir -p "$XML_LOG_DIR"
 
-pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
-
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py -k "not (test_sanity_layernorm_mlp or test_sanity_gpt or test_sanity_bert or test_sanity_T5 or test_sanity_amp_and_nvfuser or test_sanity_drop_path or test_sanity_fused_qkv_params or test_sanity_gradient_accumulation_fusion or test_inference_mode or test_sanity_normalization_amp or test_sanity_layernorm_linear or test_sanity_linear_with_zero_tokens or test_sanity_grouped_linear)" --no-header || test_fail "test_sanity.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py -k "not (test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_gpt_cuda_graph or test_transformer_layer_hidden_states_format or test_grouped_gemm or test_noncontiguous or test_gpt_checkpointing or test_gpt_accuracy or test_mha_accuracy or test_linear_accuracy or test_linear_accuracy_delay_wgrad_compute or test_rmsnorm_accuracy or test_layernorm_accuracy or test_layernorm_linear_accuracy)" --no-header || test_fail "test_numerics.py"
-# PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py -k "not (test_torch_dynamo)" || test_fail "test_jit.py"
-# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4 || test_fail "test_nvfp4"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8tensor.xml $TE_PATH/tests/pytorch/test_float8tensor.py || test_fail "test_float8tensor.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
-# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
-# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
-# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py -k "not (test_basic_linear or test_layer_norm or test_rmsnorm or test_forward_linear_bias_activation or test_backward_add_rmsnorm or test_layernorm_mlp or test_activation or test_clamped_swiglu or test_dropout or test_forward_linear_bias_add or test_forward_linear_scale_add or test_linear)" || test_fail "test_fusible_ops.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py -k "not (test_permutation_index_map or test_permutation_single_case)" || test_fail "test_permutation.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
-# NVTE_FLASH_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py"
-# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py"
-# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/attention/test_kv_cache.py || test_fail "test_kv_cache.py"
-python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py"
-# NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py || test_fail "test_checkpoint.py"
-# python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_router.xml $TE_PATH/tests/pytorch/test_fused_router.py || test_fail "test_fused_router.py"
-
-if [ "$RET" -ne 0 ]; then
-    echo "Error in the following test cases:$FAILED_CASES"
+if [ "$FAIL" -ne 0 ]; then
+    echo "Some tests failed."
     exit 1
 fi
-echo "All tests passed"
+echo "All assigned tests passed (some might have been skipped)."
 exit 0

From 9d1c48a831df46e04e7ef99b6d42b33ad199efff Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Thu, 9 Apr 2026 18:06:26 +0800
Subject: [PATCH 43/59] [CICD] Upload unittest coverage report to FlagCICD
 platform && Access FlagCICD runner (#58)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Description

Simplifies and consolidates the coverage report generation logic in the
CI unittest workflow, reducing redundant steps and dependencies.
Need to test **uploading reports to FlagCICD step** in CI env.

## Type of change

- [x] New feature (non-breaking change which adds functionality)
- [x] Infra/Build change (changes to CI/CD workflows or build scripts)
- [x] Code refactoring
- [ ] Documentation change
- [ ] Bug fix
- [ ] Breaking change

## Changes

- Merged `Generate Coverage Report` into the `Execute Tests` step —
coverage `combine` and `json` generation now run inline after `bash
test.sh`, following the same pattern as Megatron-LM-FL
- Coverage collection is gated on `test_type == 'unittest'` to avoid
running for lint/debug groups, and `pip install` is done only once
- Removed `fetch-depth: 0` from checkout steps (not required for unit
test runs)
- Removed unused/leftover scripts from the repository

## TODO

# Checklist:

- [x] I have read and followed the contributing guidelines.
- [x] The functionality is complete
- [x] I have commented my code, particularly in coverage report
uploading steps
- [x] My changes generate no new warnings
- [x] I have added/updated tests that prove my feature works on Cuda and
Metax platform.
- [x] New and existing unit tests pass locally on Cuda and Metax
platform.
---
 .github/configs/cuda.yml                      |  14 +-
 .github/configs/metax.yml                     |   3 +
 .github/workflows/all_tests_common.yml        |   2 +-
 .github/workflows/all_tests_cuda.yml          |   8 +-
 .../qa-l0-te-cpp-unittest-pytorch-lint.yml    |   2 +-
 .github/workflows/unit_tests_common.yml       | 125 ++++++++++--------
 qa/L0_pytorch_debug_unittest/test.sh          |   2 -
 qa/L0_pytorch_lint/test.sh                    |   2 +-
 8 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index 36373513de..6975fab589 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -8,18 +8,18 @@ display_name: 'NVIDIA CUDA (A100)'
 ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
 
 # Runner labels for self-hosted A100 node
+# runner_labels:
+# - self-hosted
+# - Linux
+# - X64
+# - nvidia
+# - gpu-8
 runner_labels:
-  - self-hosted
-  - Linux
-  - X64
-  - nvidia
-  - gpu-8
+  - nv-8g-cicd-te
 
 # Container volumes
 container_volumes:
   - /home/flagscale_cicd/flask/static:/workspace/report
-  # - .:/opt/transformerengine
-  # - ./ci_logs:/logs
   # - /home/flagscale_cicd/data:/opt/data
 
 # Container options
diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
index e937189a55..e3b10c892d 100644
--- a/.github/configs/metax.yml
+++ b/.github/configs/metax.yml
@@ -6,6 +6,7 @@ hardware_name: metax
 display_name: 'Metax Tests'
 
 ci_image: localhost:5000/megatron-lm-with-te:v1
+# ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
 
 runner_labels:
   - self-hosted
@@ -13,6 +14,8 @@ runner_labels:
   - X64
   - metax
   - dev
+# runner_labels:
+#   - mx-4g-cicd-te
 
 container_volumes:
   - /nfs/metax_fs:/nfs/metax_fs
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index 86a85a2d6a..2165de9b49 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -92,13 +92,13 @@ jobs:
     uses: ./.github/workflows/unit_tests_common.yml
     name: unit_tests
     with:
-      setup_commands: ${{ inputs.setup_commands }}
       platform: ${{ inputs.platform }}
       device: ${{ matrix.device }}
       image: ${{ needs.checkout_and_config.outputs.ci_image }}
       runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
       container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      setup_commands: ${{ inputs.setup_commands }}
       ignored_tests: ${{ needs.checkout_and_config.outputs.ignored_tests }}
       build_env: ${{ needs.checkout_and_config.outputs.build_env }}
 
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index b78ddf35bb..0aa652f64b 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -1,10 +1,10 @@
 name: cuda_tests
 
 on:
-  # push:
-  #   branches: ["main"]
-  # pull_request:
-  #   branches: ["main"]
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
   workflow_dispatch:
 
 concurrency:
diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
index 52299cf411..b026f9aa10 100644
--- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
+++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
@@ -72,7 +72,7 @@ jobs:
 
           # Install Python dependencies with version pinning
           echo "=== Installing Python Dependencies ==="
-          pip install transformers expecttest
+          pip install transformers expecttest nvdlfw-inspect --quiet
 
           # Build and install transformer_engine with verbose output
           echo "=== Building & Installing Transformer Engine ==="
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
index 6bfe8fd311..615f7c9001 100644
--- a/.github/workflows/unit_tests_common.yml
+++ b/.github/workflows/unit_tests_common.yml
@@ -115,23 +115,46 @@ jobs:
           # For debugging, you can force this to true
           echo "should_run=true" >> $GITHUB_OUTPUT
 
-      - name: Checkout Source Code
-        if: steps.should_run.outputs.should_run == 'true'
+      # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
+      - name: Configure Git Safe Directory on Cuda
+        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        run: /usr/bin/git config --global safe.directory '*'
+
+      - name: Checkout Source Code on Cuda (attempt 1)
+        id: checkout1
+        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
         uses: actions/checkout@v4
+        continue-on-error: true
         with:
+          fetch-depth: 0
+          submodules: recursive
           set-safe-directory: true
 
-      # - name: Activate Python environment
-      #   run: |
-      #     if [[ "$PLATFORM" == "cuda" ]] && [ -f /opt/miniconda3/etc/profile.d/conda.sh ]; then
-      #       source /opt/miniconda3/etc/profile.d/conda.sh
-      #       conda activate flagscale-train
-      #     elif [ -f /opt/conda/etc/profile.d/conda.sh ]; then
-      #       source /opt/conda/etc/profile.d/conda.sh
-      #       conda activate base
-      #     fi
-      #     echo "PATH=$PATH" >> $GITHUB_ENV
-      #     echo "Python: $(which python3) ($(python3 --version 2>&1))"
+      - name: Checkout Source Code on Cuda (attempt 2)
+        id: checkout2
+        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 3)
+        id: checkout3
+        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      # Metax no need submodules
+      - name: Checkout Source Code on Metax
+        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       
       - name: Environment Setup on Cuda
         if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
@@ -149,8 +172,10 @@ jobs:
 
           echo "===== Step 2: Build & Install TransformerEngine ====="
           cd $GITHUB_WORKSPACE
-          pip install nvdlfw-inspect --no-deps
-          pip install --no-build-isolation . -v --no-deps
+
+          pip install nvdlfw-inspect --quiet
+          pip install expecttest --quiet
+          pip install . -v --no-deps --no-build-isolation
 
           echo "===== Step 3: Verify Installation ====="
           python3 tests/pytorch/test_sanity_import.py
@@ -186,7 +211,8 @@ jobs:
           echo "===== Step 4: Remove Existing TransformerEngine ====="
           # Prevent conflicts with preinstalled or incompatible versions
           python3 -m pip uninstall transformer_engine -y || true
-          python3 -m pip install nvdlfw-inspect --no-deps || true
+          python3 -m pip install nvdlfw-inspect --quiet
+          python3 -m pip install expecttest --quiet
 
           # echo "===== Step 5: Install Metax Binary Backend ====="
           # # Install prebuilt Metax backend (required for MACA operators)
@@ -229,7 +255,6 @@ jobs:
         working-directory: ${{ github.workspace }}
         run: |
           set -euo pipefail
-          ${{ inputs.setup_commands }}
 
           # Load platform-specific environment variables
           while IFS='=' read -r key value; do
@@ -257,11 +282,15 @@ jobs:
           # Ensure log directory exists regardless of volume mount state
           mkdir -p /logs
 
-          # Enable coverage collection for all pytest invocations in test.sh
-          # PYTEST_ADDOPTS is automatically appended to every pytest call
-          if [ "${{ inputs.upload_coverage }}" = "true" ]; then
-            pip3 install pytest-cov 2>/dev/null || true
-            export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
+          # Coverage setup: install once + configure collection via PYTEST_ADDOPTS
+          COVERAGE_ENABLED=false
+          if [ "${{ inputs.upload_coverage }}" = "true" ] && [ "${{ matrix.test_group.test_type }}" = "unittest" ]; then
+            if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
+              export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
+              COVERAGE_ENABLED=true
+            else
+              echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
+            fi
           fi
 
           if [[ "${{ matrix.test_group.name }}" == *"lint"* ]]; then
@@ -274,55 +303,39 @@ jobs:
           fi
 
           bash ${{ matrix.test_group.path }}
-        timeout-minutes: 60
+          exit_code=$?
+
+          # Combine coverage fragments and generate JSON report
+          if [ "$COVERAGE_ENABLED" = "true" ]; then
+            python3 -m coverage combine --keep 2>/dev/null || true
+            python3 -m coverage json \
+              -o "coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json" \
+              --include="transformer_engine/*" 2>/dev/null \
+              || echo "WARNING: No coverage data found"
+          fi
 
-      - name: Generate Coverage Report
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
-        working-directory: ${{ github.workspace }}
-        env:
-          PLATFORM: ${{ inputs.platform }}
-          DEVICE: ${{ inputs.device }}
-        run: |
-          # Install coverage (may already be present)
-          pip3 install coverage pytest-cov 2>/dev/null || true
-
-          # Merge all .coverage* files produced by sub-processes (torchrun spawns workers)
-          python3 -m coverage combine --keep 2>/dev/null || true
-          # Generate JSON coverage report (requires .coverage data from pytest --cov)
-          python3 -m coverage json -o "coverage-${PLATFORM}-${DEVICE}.json" \
-            --include="transformer_engine/*" 2>/dev/null || echo "WARNING: No coverage data found, skipping coverage-${PLATFORM}-${DEVICE}.json"
-        continue-on-error: true
+          exit $exit_code
+        timeout-minutes: 60
 
       - name: Upload Coverage Report
         if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: actions/upload-artifact@v4
         continue-on-error: true
         with:
-          name: coverage-${{ inputs.platform }}-${{ inputs.device }}
+          name: coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}
           path: |
-            coverage-${{ inputs.platform }}-${{ inputs.device }}.json
-
-      - name: Check FlagCICD Reachability
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
-        id: check_flagcicd
-        continue-on-error: true
-        run: |
-          if curl -sf --max-time 3 --connect-timeout 2 \
-               "http://flagcicd-inner.flagos.net:8000/" -o /dev/null 2>/dev/null; then
-            echo "reachable=true" >> $GITHUB_OUTPUT
-          else
-            echo "reachable=false" >> $GITHUB_OUTPUT
-            echo "INFO: flagcicd-inner.flagos.net unreachable from this runner, skipping report upload"
-          fi
+            coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json
 
       - name: Upload Coverage Report to FlagCICD
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest' && steps.check_flagcicd.outputs.reachable == 'true'
+        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: flagos-ai/FlagOps/actions/post-pytest-report@v2
         continue-on-error: true
+        env:
+          NO_PROXY: "flagcicd-inner.flagos.net"
         with:
           backend_url: 'http://flagcicd-inner.flagos.net:8000/metrics/'
           user_id: '000000000000000000'
-          report_path: 'coverage-${{ inputs.platform }}-${{ inputs.device }}.json'
+          report_path: 'coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json'
           fail_on_error: 'false'
 
       # - name: Debug - keep container alive on failure
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index acbb440e70..5be88dfe4a 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -18,8 +18,6 @@ FAIL=0
 
 # It is not installed as a requirement,
 # because it is not available on PyPI.
-pip uninstall -y nvdlfw-inspect
-pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
 pip install pytest==8.2.1
 
 run_test_step() {
diff --git a/qa/L0_pytorch_lint/test.sh b/qa/L0_pytorch_lint/test.sh
index e2c50c445e..c401f39eb1 100644
--- a/qa/L0_pytorch_lint/test.sh
+++ b/qa/L0_pytorch_lint/test.sh
@@ -6,7 +6,7 @@ set -e
 
 : "${TE_PATH:=/opt/transformerengine}"
 
-pip3 install cpplint==1.6.0 pylint==3.3.1
+pip3 install cpplint==1.6.0 pylint==3.3.4
 if [ -z "${PYTHON_ONLY}" ]
 then
   cd $TE_PATH

From 46b77e4fb5677e72bec202829a724bf025561c02 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Mon, 13 Apr 2026 15:01:43 +0800
Subject: [PATCH 44/59] plugin: sync plugin APIs with upstream csrc changes

Updated plugin OP API layer to match pytorch/csrc/ pybind changes
between base and dev branches. Changes applied to:
- ops.py base class (TEFLBackendBase)
- All 5 vendor backends (cuda, iluvatar, metax, musa, hygon)
- All 5 vendor register_ops.py files
- Scanned flagos/reference backends for changed interfaces (no changes needed)

New APIs added: group_quantize, bgrad_group_quantize, glu, dglu,
te_general_grouped_gemm_for_grouped_tensor, te_general_grouped_gemm_for_discrete_in,
te_general_grouped_gemm_for_discrete_out, nvfp4_data_transpose, swizzle_scales_for_gemm_,
grouped_swizzle_for_gemm, convert_host_pointers_to_tensor,
get_device_pointer_for_data_and_scales, splits_to_offsets,
mxfp8_scaling_compute_partial_amax, mxfp8_scaling_partial_cast,
nvfp4_2d_compute_partial_amax, nvfp4_multi_tensor_compute_partial_amax,
nvfp4_compute_global_scale, nvfp4_compute_per_block_scale, nvfp4_expand_scale_to_fp8,
nvfp4_fused_scale, nvfp4_multi_tensor_fused_scale, nvfp4_2d_partial_cast,
nvfp4_multi_tensor_2d_partial_cast, nvfp4_2d_multi_tensor_transpose,
multi_tensor_scale_tensor, multi_tensor_compute_scale_inv_e8m0

Modified APIs: split_quantize (added disable_bulk_allocation param)
---
 SYNC_POINT.md                                 |   6 +
 transformer_engine/common/__init__.py         |   1 -
 .../plugin/core/backends/vendor/cuda/cuda.py  | 278 +++++++++++++++++-
 .../core/backends/vendor/cuda/register_ops.py | 218 ++++++++++++++
 .../core/backends/vendor/hygon/hygon.py       | 141 ++++++++-
 .../backends/vendor/hygon/register_ops.py     | 216 ++++++++++++++
 .../core/backends/vendor/iluvatar/iluvatar.py | 139 ++++++++-
 .../backends/vendor/iluvatar/register_ops.py  | 218 ++++++++++++++
 .../core/backends/vendor/metax/metax.py       | 141 ++++++++-
 .../backends/vendor/metax/register_ops.py     | 218 ++++++++++++++
 .../plugin/core/backends/vendor/musa/musa.py  | 158 +++++++++-
 .../core/backends/vendor/musa/register_ops.py | 216 ++++++++++++++
 transformer_engine/plugin/core/ops.py         | 242 +++++++++++++++
 transformer_engine/pytorch/permutation.py     |  62 ++--
 .../pytorch/triton/permutation.py             |   8 +-
 15 files changed, 2236 insertions(+), 26 deletions(-)
 create mode 100644 SYNC_POINT.md

diff --git a/SYNC_POINT.md b/SYNC_POINT.md
new file mode 100644
index 0000000000..c321233330
--- /dev/null
+++ b/SYNC_POINT.md
@@ -0,0 +1,6 @@
+# Upstream Sync Point
+- Upstream: Nvidia/TransformerEngine
+- Branch: release_v2.14
+- Commit SHA: f031cf87bd054c7558b887df7bed93975456667f
+- Sync Date: 2025-07-17
+- Synced By: lixianduo
diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index fa0fd54966..bbe55151e8 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -401,7 +401,6 @@ def _load_core_library():
 if "NVTE_PROJECT_BUILDING" not in os.environ or bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
     sanity_checks_for_pypi_installation()
 
-
     # Skip loading CUDA libraries if CUDA build was skipped (FL-only mode)
     if not skip_cuda_build():
         _CUDNN_LIB_CTYPES = _load_cudnn()
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 4309cc4a2e..c9a3457902 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -205,6 +205,40 @@ def bgrad_quantize(
 
         return tex.bgrad_quantize(input, quantizer)
 
+    def group_quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        num_tensors: int,
+        first_dims: List[int],
+    ) -> Any:
+        tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+        return tex.group_quantize(tensor, quantizer, num_tensors, first_dims)
+
+    def bgrad_group_quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        num_tensors: int,
+        first_dims: List[int],
+    ) -> Any:
+        tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+        return tex.bgrad_group_quantize(tensor, quantizer, num_tensors, first_dims)
+
     def generic_gemm(
         self,
         A: Any,
@@ -260,6 +294,11 @@ def generic_gemm(
             beta,
         )
 
+    # GLU #
+    def glu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.glu(input, quantizer)
+
     # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
@@ -313,6 +352,11 @@ def clamped_swiglu(
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
 
+    # Backward of GLU #
+    def dglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dglu(grad, fwd_input, quantizer)
+
     # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
@@ -607,9 +651,10 @@ def split_quantize(
         tensor: torch.Tensor,
         split_sections: List[int],
         quantizer_list: List[Any],
+        disable_bulk_allocation: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.split_quantize(tensor, split_sections, quantizer_list)
+        return tex.split_quantize(tensor, split_sections, quantizer_list, disable_bulk_allocation)
 
     def te_general_grouped_gemm(
         self,
@@ -654,6 +699,18 @@ def te_general_grouped_gemm(
             math_sm_count,
         )
 
+    def te_general_grouped_gemm_for_grouped_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_grouped_tensor(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_in(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_in(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_out(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_out(*args, **kwargs)
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -672,6 +729,55 @@ def swap_first_dims(
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
 
+    def nvfp4_data_transpose(
+        self,
+        input: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.nvfp4_data_transpose(input, out=out)
+
+    def swizzle_scales_for_gemm_(self, tensor: torch.Tensor) -> None:
+        tex = self._get_tex()
+        return tex.swizzle_scales_for_gemm_(tensor)
+
+    def grouped_swizzle_for_gemm(
+        self,
+        tensor: Any,
+        rowwise: bool,
+        columnwise: bool,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.grouped_swizzle_for_gemm(tensor, rowwise, columnwise)
+
+    def convert_host_pointers_to_tensor(
+        self,
+        tensor_lists: List[List[torch.Tensor]],
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.convert_host_pointers_to_tensor(tensor_lists)
+
+    def get_device_pointer_for_data_and_scales(
+        self,
+        data_tensors: List[torch.Tensor],
+        scale_tensors: List[torch.Tensor],
+        swizzle: bool = False,
+        rowwise: bool = True,
+        data_dtype: Any = None,
+    ) -> Any:
+        tex = self._get_tex()
+        return tex.get_device_pointer_for_data_and_scales(
+            data_tensors, scale_tensors, swizzle, rowwise, data_dtype
+        )
+
+    def splits_to_offsets(
+        self,
+        first_dims: List[int],
+        logical_last_dim: int,
+    ) -> torch.Tensor:
+        tex = self._get_tex()
+        return tex.splits_to_offsets(first_dims, logical_last_dim)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -780,6 +886,154 @@ def fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
 
+    # MXFP8 scaling
+    def mxfp8_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
+
+    def mxfp8_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
+        tex = self._get_tex()
+        out_dtype = tex.DType(int(out_dtype)) if out_dtype is not None else None
+        return tex.mxfp8_scaling_partial_cast(
+            inp, out, scale, h, w, start_offset, block_len, out_dtype
+        )
+
+    # NVFP4 2D
+    def nvfp4_2d_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int = 16,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_2d_compute_partial_amax(tensor, amax, h, w, start_offset, block_len)
+
+    def nvfp4_multi_tensor_compute_partial_amax(
+        self,
+        master_weight_list: List[torch.Tensor],
+        partial_amax_list: List[torch.Tensor],
+        global_amax_list: List[torch.Tensor],
+        h_list: List[int],
+        w_list: List[int],
+        start_offset_list: List[int],
+        block_len: int = 16,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_compute_partial_amax(
+            master_weight_list,
+            partial_amax_list,
+            global_amax_list,
+            h_list,
+            w_list,
+            start_offset_list,
+            block_len,
+        )
+
+    def nvfp4_compute_global_scale(
+        self,
+        global_amaxes: torch.Tensor,
+        global_scale_tensor: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_compute_global_scale(global_amaxes, global_scale_tensor)
+
+    def nvfp4_compute_per_block_scale(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_compute_per_block_scale(*args, **kwargs)
+
+    def nvfp4_expand_scale_to_fp8(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_expand_scale_to_fp8(*args, **kwargs)
+
+    def nvfp4_fused_scale(self, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_fused_scale(*args, **kwargs)
+
+    def nvfp4_multi_tensor_fused_scale(
+        self,
+        block_amax_list: List[torch.Tensor],
+        global_amax_list: List[torch.Tensor],
+        per_block_scale_list: List[torch.Tensor],
+        target_scale_list: List[torch.Tensor],
+        target_amax_list: List[torch.Tensor],
+        tile_rows_list: List[int],
+        tile_cols_list: List[int],
+        rows_padded_list: List[int],
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_fused_scale(
+            block_amax_list,
+            global_amax_list,
+            per_block_scale_list,
+            target_scale_list,
+            target_amax_list,
+            tile_rows_list,
+            tile_cols_list,
+            rows_padded_list,
+            block_len,
+        )
+
+    def nvfp4_2d_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        global_scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int = 16,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_2d_partial_cast(
+            inp, out, scale, global_scale, h, w, start_offset, block_len
+        )
+
+    def nvfp4_multi_tensor_2d_partial_cast(self, inp_list, *args, **kwargs) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_2d_partial_cast(inp_list, *args, **kwargs)
+
+    def nvfp4_2d_multi_tensor_transpose(
+        self,
+        rowwise_data_list: List[torch.Tensor],
+        columnwise_data_list: List[torch.Tensor],
+        rowwise_scale_inv_list: List[torch.Tensor],
+        columnwise_scale_inv_list: List[torch.Tensor],
+        M_list: List[int],
+        K_list: List[int],
+    ) -> None:
+        tex = self._get_tex()
+        return tex.nvfp4_2d_multi_tensor_transpose(
+            rowwise_data_list,
+            columnwise_data_list,
+            rowwise_scale_inv_list,
+            columnwise_scale_inv_list,
+            M_list,
+            K_list,
+        )
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -1366,6 +1620,16 @@ def multi_tensor_scale(
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
 
+    def multi_tensor_scale_tensor(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_scale_tensor(chunk_size, noop_flag, tensor_lists, scale)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1584,6 +1848,18 @@ def multi_tensor_compute_scale_and_scale_inv(
             chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
+    def multi_tensor_compute_scale_inv_e8m0(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_compute_scale_inv_e8m0(
+            chunk_size, noop_flag, tensor_lists, block_len
+        )
+
     # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
index ca65c0d384..5fac3e34c4 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/register_ops.py
@@ -105,6 +105,30 @@ def register_builtins(registry) -> None:
             vendor="CUDA",
             priority=100,
         ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_grouped_tensor",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_grouped_tensor, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_in",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_in, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_out",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_out, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Quantization
         OpImpl(
             op_name="quantize",
@@ -130,6 +154,22 @@ def register_builtins(registry) -> None:
             vendor="CUDA",
             priority=100,
         ),
+        OpImpl(
+            op_name="group_quantize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.group_quantize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_group_quantize",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_group_quantize, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         OpImpl(
             op_name="split_quantize",
             impl_id="vendor.cuda",
@@ -139,6 +179,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Forward
+        OpImpl(
+            op_name="glu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.glu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         OpImpl(
             op_name="gelu",
             impl_id="vendor.cuda",
@@ -228,6 +276,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Backward
+        OpImpl(
+            op_name="dglu",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dglu, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         OpImpl(
             op_name="dgelu",
             impl_id="vendor.cuda",
@@ -638,6 +694,54 @@ def register_builtins(registry) -> None:
             vendor="CUDA",
             priority=100,
         ),
+        OpImpl(
+            op_name="nvfp4_data_transpose",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_data_transpose, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swizzle_scales_for_gemm_",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swizzle_scales_for_gemm_, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="grouped_swizzle_for_gemm",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.grouped_swizzle_for_gemm, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_host_pointers_to_tensor",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_host_pointers_to_tensor, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_device_pointer_for_data_and_scales",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_device_pointer_for_data_and_scales, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="splits_to_offsets",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.splits_to_offsets, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         OpImpl(
             op_name="compute_amax",
             impl_id="vendor.cuda",
@@ -670,6 +774,104 @@ def register_builtins(registry) -> None:
             vendor="CUDA",
             priority=100,
         ),
+        # MXFP8 scaling
+        OpImpl(
+            op_name="mxfp8_scaling_compute_partial_amax",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_compute_partial_amax, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="mxfp8_scaling_partial_cast",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_partial_cast, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        # NVFP4 2D
+        OpImpl(
+            op_name="nvfp4_2d_compute_partial_amax",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_compute_partial_amax, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_compute_partial_amax",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_compute_partial_amax, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_global_scale",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_global_scale, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_per_block_scale",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_per_block_scale, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_expand_scale_to_fp8",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_expand_scale_to_fp8, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_fused_scale",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_fused_scale, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_fused_scale",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_fused_scale, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_partial_cast",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_partial_cast, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_2d_partial_cast",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_2d_partial_cast, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_multi_tensor_transpose",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_multi_tensor_transpose, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Padding operations
         OpImpl(
             op_name="fused_multi_row_padding",
@@ -819,6 +1021,14 @@ def register_builtins(registry) -> None:
             vendor="CUDA",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_scale_tensor",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale_tensor, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         OpImpl(
             op_name="multi_tensor_l2norm",
             impl_id="vendor.cuda",
@@ -891,6 +1101,14 @@ def register_builtins(registry) -> None:
             vendor="CUDA",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_inv_e8m0",
+            impl_id="vendor.cuda",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_inv_e8m0, is_avail),
+            vendor="CUDA",
+            priority=100,
+        ),
         # Communication overlap operations
         OpImpl(
             op_name="bulk_overlap_ag_with_external_gemm",
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index 391d39e09f..a08a2bf434 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -182,6 +182,42 @@ def bgrad_quantize(
 
         return tex.bgrad_quantize(input, quantizer)
 
+    def group_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
+        return tex.group_quantize(input, quantizer)
+
+    def bgrad_group_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
+        return tex.bgrad_group_quantize(input, quantizer)
+
     def generic_gemm(
         self,
         A: Any,
@@ -237,6 +273,11 @@ def generic_gemm(
             beta,
         )
 
+    # GLU #
+    def glu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.glu(input, quantizer)
+
     # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
@@ -290,6 +331,11 @@ def clamped_swiglu(
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
 
+    # Backward of GLU #
+    def dglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dglu(grad, fwd_input, quantizer)
+
     # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
@@ -584,9 +630,10 @@ def split_quantize(
         tensor: torch.Tensor,
         split_sections: List[int],
         quantizer_list: List[Any],
+        disable_bulk_allocation: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.split_quantize(tensor, split_sections, quantizer_list)
+        return tex.split_quantize(tensor, split_sections, quantizer_list, disable_bulk_allocation)
 
     def te_general_grouped_gemm(
         self,
@@ -631,6 +678,18 @@ def te_general_grouped_gemm(
             math_sm_count,
         )
 
+    def te_general_grouped_gemm_for_grouped_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_grouped_tensor(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_in(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_in(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_out(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_out(*args, **kwargs)
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -649,6 +708,30 @@ def swap_first_dims(
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
 
+    def nvfp4_data_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_data_transpose(*args, **kwargs)
+
+    def swizzle_scales_for_gemm_(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.swizzle_scales_for_gemm_(*args, **kwargs)
+
+    def grouped_swizzle_for_gemm(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.grouped_swizzle_for_gemm(*args, **kwargs)
+
+    def convert_host_pointers_to_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.convert_host_pointers_to_tensor(*args, **kwargs)
+
+    def get_device_pointer_for_data_and_scales(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.get_device_pointer_for_data_and_scales(*args, **kwargs)
+
+    def splits_to_offsets(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.splits_to_offsets(*args, **kwargs)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -757,6 +840,54 @@ def fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
 
+    def mxfp8_scaling_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_compute_partial_amax(*args, **kwargs)
+
+    def mxfp8_scaling_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_partial_cast(*args, **kwargs)
+
+    def nvfp4_2d_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_multi_tensor_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_compute_global_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_global_scale(*args, **kwargs)
+
+    def nvfp4_compute_per_block_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_per_block_scale(*args, **kwargs)
+
+    def nvfp4_expand_scale_to_fp8(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_expand_scale_to_fp8(*args, **kwargs)
+
+    def nvfp4_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_fused_scale(*args, **kwargs)
+
+    def nvfp4_multi_tensor_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_fused_scale(*args, **kwargs)
+
+    def nvfp4_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_multi_tensor_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_2d_multi_tensor_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_multi_tensor_transpose(*args, **kwargs)
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -1343,6 +1474,10 @@ def multi_tensor_scale(
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
 
+    def multi_tensor_scale_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.multi_tensor_scale_tensor(*args, **kwargs)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1561,6 +1696,10 @@ def multi_tensor_compute_scale_and_scale_inv(
             chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
+    def multi_tensor_compute_scale_inv_e8m0(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.multi_tensor_compute_scale_inv_e8m0(*args, **kwargs)
+
     # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
index 8221285219..2b0bbc8aa0 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/register_ops.py
@@ -105,6 +105,30 @@ def register_builtins(registry) -> None:
             vendor="HYGON",
             priority=100,
         ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_grouped_tensor",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_grouped_tensor, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_in",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_in, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_out",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_out, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Quantization
         OpImpl(
             op_name="quantize",
@@ -130,6 +154,22 @@ def register_builtins(registry) -> None:
             vendor="HYGON",
             priority=100,
         ),
+        OpImpl(
+            op_name="group_quantize",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.group_quantize, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_group_quantize",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_group_quantize, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         OpImpl(
             op_name="split_quantize",
             impl_id="vendor.hygon",
@@ -139,6 +179,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Forward
+        OpImpl(
+            op_name="glu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.glu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         OpImpl(
             op_name="gelu",
             impl_id="vendor.hygon",
@@ -228,6 +276,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Backward
+        OpImpl(
+            op_name="dglu",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dglu, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         OpImpl(
             op_name="dgelu",
             impl_id="vendor.hygon",
@@ -614,6 +670,54 @@ def register_builtins(registry) -> None:
             vendor="HYGON",
             priority=100,
         ),
+        OpImpl(
+            op_name="nvfp4_data_transpose",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_data_transpose, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swizzle_scales_for_gemm_",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swizzle_scales_for_gemm_, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="grouped_swizzle_for_gemm",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.grouped_swizzle_for_gemm, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_host_pointers_to_tensor",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_host_pointers_to_tensor, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_device_pointer_for_data_and_scales",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_device_pointer_for_data_and_scales, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="splits_to_offsets",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.splits_to_offsets, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         OpImpl(
             op_name="compute_amax",
             impl_id="vendor.hygon",
@@ -646,6 +750,102 @@ def register_builtins(registry) -> None:
             vendor="HYGON",
             priority=100,
         ),
+        OpImpl(
+            op_name="mxfp8_scaling_compute_partial_amax",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_compute_partial_amax, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="mxfp8_scaling_partial_cast",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_partial_cast, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_compute_partial_amax",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_compute_partial_amax, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_compute_partial_amax",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_compute_partial_amax, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_global_scale",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_global_scale, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_per_block_scale",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_per_block_scale, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_expand_scale_to_fp8",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_expand_scale_to_fp8, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_fused_scale",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_fused_scale, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_fused_scale",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_fused_scale, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_partial_cast",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_partial_cast, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_2d_partial_cast",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_2d_partial_cast, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_multi_tensor_transpose",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_multi_tensor_transpose, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Padding operations
         OpImpl(
             op_name="fused_multi_row_padding",
@@ -755,6 +955,14 @@ def register_builtins(registry) -> None:
             vendor="HYGON",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_scale_tensor",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale_tensor, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         OpImpl(
             op_name="multi_tensor_l2norm",
             impl_id="vendor.hygon",
@@ -827,6 +1035,14 @@ def register_builtins(registry) -> None:
             vendor="HYGON",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_inv_e8m0",
+            impl_id="vendor.hygon",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_inv_e8m0, is_avail),
+            vendor="HYGON",
+            priority=100,
+        ),
         # Communication overlap operations
         OpImpl(
             op_name="bulk_overlap_ag_with_external_gemm",
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
index e14dea9a75..305d9bb977 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -208,6 +208,42 @@ def bgrad_quantize(
 
         return tex.bgrad_quantize(input, quantizer)
 
+    def group_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
+        return tex.group_quantize(input, quantizer)
+
+    def bgrad_group_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
+        return tex.bgrad_group_quantize(input, quantizer)
+
     def generic_gemm(
         self,
         A: Any,
@@ -264,6 +300,10 @@ def generic_gemm(
         )
 
     # GELU and variants #
+    def glu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.glu(input, quantizer)
+
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
@@ -317,6 +357,10 @@ def clamped_swiglu(
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
 
     # Backward of GELU and variants #
+    def dglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dglu(grad, fwd_input, quantizer)
+
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
@@ -610,9 +654,10 @@ def split_quantize(
         tensor: torch.Tensor,
         split_sections: List[int],
         quantizer_list: List[Any],
+        disable_bulk_allocation: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.split_quantize(tensor, split_sections, quantizer_list)
+        return tex.split_quantize(tensor, split_sections, quantizer_list, disable_bulk_allocation)
 
     def te_general_grouped_gemm(
         self,
@@ -657,6 +702,18 @@ def te_general_grouped_gemm(
             math_sm_count,
         )
 
+    def te_general_grouped_gemm_for_grouped_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_grouped_tensor(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_in(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_in(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_out(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_out(*args, **kwargs)
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -675,6 +732,30 @@ def swap_first_dims(
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
 
+    def nvfp4_data_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_data_transpose(*args, **kwargs)
+
+    def swizzle_scales_for_gemm_(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.swizzle_scales_for_gemm_(*args, **kwargs)
+
+    def grouped_swizzle_for_gemm(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.grouped_swizzle_for_gemm(*args, **kwargs)
+
+    def convert_host_pointers_to_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.convert_host_pointers_to_tensor(*args, **kwargs)
+
+    def get_device_pointer_for_data_and_scales(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.get_device_pointer_for_data_and_scales(*args, **kwargs)
+
+    def splits_to_offsets(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.splits_to_offsets(*args, **kwargs)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -783,6 +864,54 @@ def fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
 
+    def mxfp8_scaling_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_compute_partial_amax(*args, **kwargs)
+
+    def mxfp8_scaling_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_partial_cast(*args, **kwargs)
+
+    def nvfp4_2d_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_multi_tensor_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_compute_global_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_global_scale(*args, **kwargs)
+
+    def nvfp4_compute_per_block_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_per_block_scale(*args, **kwargs)
+
+    def nvfp4_expand_scale_to_fp8(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_expand_scale_to_fp8(*args, **kwargs)
+
+    def nvfp4_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_fused_scale(*args, **kwargs)
+
+    def nvfp4_multi_tensor_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_fused_scale(*args, **kwargs)
+
+    def nvfp4_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_multi_tensor_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_2d_multi_tensor_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_multi_tensor_transpose(*args, **kwargs)
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -1369,6 +1498,10 @@ def multi_tensor_scale(
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
 
+    def multi_tensor_scale_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.multi_tensor_scale_tensor(*args, **kwargs)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1587,6 +1720,10 @@ def multi_tensor_compute_scale_and_scale_inv(
             chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
+    def multi_tensor_compute_scale_inv_e8m0(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.multi_tensor_compute_scale_inv_e8m0(*args, **kwargs)
+
     # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
index f41724e3e2..001f6129d8 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/register_ops.py
@@ -105,6 +105,30 @@ def register_builtins(registry) -> None:
             vendor="Iluvatar",
             priority=100,
         ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_grouped_tensor",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_grouped_tensor, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_in",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_in, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_out",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_out, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Quantization
         OpImpl(
             op_name="quantize",
@@ -130,6 +154,22 @@ def register_builtins(registry) -> None:
             vendor="Iluvatar",
             priority=100,
         ),
+        OpImpl(
+            op_name="group_quantize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.group_quantize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_group_quantize",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_group_quantize, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         OpImpl(
             op_name="split_quantize",
             impl_id="vendor.iluvatar",
@@ -139,6 +179,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Forward
+        OpImpl(
+            op_name="glu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.glu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         OpImpl(
             op_name="gelu",
             impl_id="vendor.iluvatar",
@@ -228,6 +276,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Backward
+        OpImpl(
+            op_name="dglu",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dglu, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         OpImpl(
             op_name="dgelu",
             impl_id="vendor.iluvatar",
@@ -638,6 +694,54 @@ def register_builtins(registry) -> None:
             vendor="Iluvatar",
             priority=100,
         ),
+        OpImpl(
+            op_name="nvfp4_data_transpose",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_data_transpose, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swizzle_scales_for_gemm_",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swizzle_scales_for_gemm_, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="grouped_swizzle_for_gemm",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.grouped_swizzle_for_gemm, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_host_pointers_to_tensor",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_host_pointers_to_tensor, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_device_pointer_for_data_and_scales",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_device_pointer_for_data_and_scales, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="splits_to_offsets",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.splits_to_offsets, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         OpImpl(
             op_name="compute_amax",
             impl_id="vendor.iluvatar",
@@ -670,6 +774,104 @@ def register_builtins(registry) -> None:
             vendor="Iluvatar",
             priority=100,
         ),
+        # MXFP8 scaling operations
+        OpImpl(
+            op_name="mxfp8_scaling_compute_partial_amax",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_compute_partial_amax, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="mxfp8_scaling_partial_cast",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_partial_cast, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        # NVFP4 operations
+        OpImpl(
+            op_name="nvfp4_2d_compute_partial_amax",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_compute_partial_amax, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_compute_partial_amax",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_compute_partial_amax, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_global_scale",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_global_scale, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_per_block_scale",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_per_block_scale, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_expand_scale_to_fp8",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_expand_scale_to_fp8, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_fused_scale",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_fused_scale, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_fused_scale",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_fused_scale, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_partial_cast",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_partial_cast, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_2d_partial_cast",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_2d_partial_cast, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_multi_tensor_transpose",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_multi_tensor_transpose, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Padding operations
         OpImpl(
             op_name="fused_multi_row_padding",
@@ -819,6 +1021,14 @@ def register_builtins(registry) -> None:
             vendor="Iluvatar",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_scale_tensor",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale_tensor, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         OpImpl(
             op_name="multi_tensor_l2norm",
             impl_id="vendor.iluvatar",
@@ -891,6 +1101,14 @@ def register_builtins(registry) -> None:
             vendor="Iluvatar",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_inv_e8m0",
+            impl_id="vendor.iluvatar",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_inv_e8m0, is_avail),
+            vendor="Iluvatar",
+            priority=100,
+        ),
         # Communication overlap operations
         OpImpl(
             op_name="bulk_overlap_ag_with_external_gemm",
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index 3c8663ff1e..9b1884102a 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -163,6 +163,42 @@ def bgrad_quantize(
 
         return tex.bgrad_quantize(input, quantizer)
 
+    def group_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
+        return tex.group_quantize(input, quantizer)
+
+    def bgrad_group_quantize(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> List[Any]:
+        tex = self._get_tex()
+
+        # Normalize quantizer.dtype to this backend's `tex.DType`.
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+
+        return tex.bgrad_group_quantize(input, quantizer)
+
     def generic_gemm(
         self,
         A: Any,
@@ -219,6 +255,10 @@ def generic_gemm(
         )
 
     # GELU and variants #
+    def glu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.glu(input, quantizer)
+
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.gelu(input, quantizer)
@@ -272,6 +312,10 @@ def clamped_swiglu(
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
 
     # Backward of GELU and variants #
+    def dglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dglu(grad, fwd_input, quantizer)
+
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
         return tex.dgelu(grad, fwd_input, quantizer)
@@ -565,9 +609,10 @@ def split_quantize(
         tensor: torch.Tensor,
         split_sections: List[int],
         quantizer_list: List[Any],
+        disable_bulk_allocation: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.split_quantize(tensor, split_sections, quantizer_list)
+        return tex.split_quantize(tensor, split_sections, quantizer_list, disable_bulk_allocation)
 
     def te_general_grouped_gemm(
         self,
@@ -612,6 +657,18 @@ def te_general_grouped_gemm(
             math_sm_count,
         )
 
+    def te_general_grouped_gemm_for_grouped_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_grouped_tensor(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_in(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_in(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_out(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_out(*args, **kwargs)
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -630,6 +687,30 @@ def swap_first_dims(
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
 
+    def nvfp4_data_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_data_transpose(*args, **kwargs)
+
+    def swizzle_scales_for_gemm_(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.swizzle_scales_for_gemm_(*args, **kwargs)
+
+    def grouped_swizzle_for_gemm(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.grouped_swizzle_for_gemm(*args, **kwargs)
+
+    def convert_host_pointers_to_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.convert_host_pointers_to_tensor(*args, **kwargs)
+
+    def get_device_pointer_for_data_and_scales(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.get_device_pointer_for_data_and_scales(*args, **kwargs)
+
+    def splits_to_offsets(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.splits_to_offsets(*args, **kwargs)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -738,6 +819,56 @@ def fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
 
+    # MXFP8 ops
+    def mxfp8_scaling_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_compute_partial_amax(*args, **kwargs)
+
+    def mxfp8_scaling_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_partial_cast(*args, **kwargs)
+
+    # NVFP4 ops
+    def nvfp4_2d_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_multi_tensor_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_compute_global_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_global_scale(*args, **kwargs)
+
+    def nvfp4_compute_per_block_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_per_block_scale(*args, **kwargs)
+
+    def nvfp4_expand_scale_to_fp8(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_expand_scale_to_fp8(*args, **kwargs)
+
+    def nvfp4_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_fused_scale(*args, **kwargs)
+
+    def nvfp4_multi_tensor_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_fused_scale(*args, **kwargs)
+
+    def nvfp4_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_multi_tensor_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_2d_multi_tensor_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_multi_tensor_transpose(*args, **kwargs)
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -1324,6 +1455,10 @@ def multi_tensor_scale(
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
 
+    def multi_tensor_scale_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.multi_tensor_scale_tensor(*args, **kwargs)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1542,6 +1677,10 @@ def multi_tensor_compute_scale_and_scale_inv(
             chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
+    def multi_tensor_compute_scale_inv_e8m0(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.multi_tensor_compute_scale_inv_e8m0(*args, **kwargs)
+
     # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
index fd6c0cdafd..cfe3a175ff 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/register_ops.py
@@ -105,6 +105,30 @@ def register_builtins(registry) -> None:
             vendor="METAX",
             priority=100,
         ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_grouped_tensor",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_grouped_tensor, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_in",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_in, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_out",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_out, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Quantization
         OpImpl(
             op_name="quantize",
@@ -130,6 +154,22 @@ def register_builtins(registry) -> None:
             vendor="METAX",
             priority=100,
         ),
+        OpImpl(
+            op_name="group_quantize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.group_quantize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_group_quantize",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_group_quantize, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         OpImpl(
             op_name="split_quantize",
             impl_id="vendor.metax",
@@ -139,6 +179,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Forward
+        OpImpl(
+            op_name="glu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.glu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         OpImpl(
             op_name="gelu",
             impl_id="vendor.metax",
@@ -228,6 +276,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Backward
+        OpImpl(
+            op_name="dglu",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dglu, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         OpImpl(
             op_name="dgelu",
             impl_id="vendor.metax",
@@ -638,6 +694,54 @@ def register_builtins(registry) -> None:
             vendor="METAX",
             priority=100,
         ),
+        OpImpl(
+            op_name="nvfp4_data_transpose",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_data_transpose, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swizzle_scales_for_gemm_",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swizzle_scales_for_gemm_, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="grouped_swizzle_for_gemm",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.grouped_swizzle_for_gemm, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_host_pointers_to_tensor",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_host_pointers_to_tensor, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_device_pointer_for_data_and_scales",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_device_pointer_for_data_and_scales, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="splits_to_offsets",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.splits_to_offsets, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         OpImpl(
             op_name="compute_amax",
             impl_id="vendor.metax",
@@ -670,6 +774,104 @@ def register_builtins(registry) -> None:
             vendor="METAX",
             priority=100,
         ),
+        # MXFP8 ops
+        OpImpl(
+            op_name="mxfp8_scaling_compute_partial_amax",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_compute_partial_amax, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="mxfp8_scaling_partial_cast",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_partial_cast, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        # NVFP4 ops
+        OpImpl(
+            op_name="nvfp4_2d_compute_partial_amax",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_compute_partial_amax, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_compute_partial_amax",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_compute_partial_amax, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_global_scale",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_global_scale, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_per_block_scale",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_per_block_scale, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_expand_scale_to_fp8",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_expand_scale_to_fp8, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_fused_scale",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_fused_scale, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_fused_scale",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_fused_scale, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_partial_cast",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_partial_cast, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_2d_partial_cast",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_2d_partial_cast, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_multi_tensor_transpose",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_multi_tensor_transpose, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Padding operations
         OpImpl(
             op_name="fused_multi_row_padding",
@@ -819,6 +1021,14 @@ def register_builtins(registry) -> None:
             vendor="METAX",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_scale_tensor",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale_tensor, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         OpImpl(
             op_name="multi_tensor_l2norm",
             impl_id="vendor.metax",
@@ -891,6 +1101,14 @@ def register_builtins(registry) -> None:
             vendor="METAX",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_inv_e8m0",
+            impl_id="vendor.metax",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_inv_e8m0, is_avail),
+            vendor="METAX",
+            priority=100,
+        ),
         # Communication overlap operations
         OpImpl(
             op_name="bulk_overlap_ag_with_external_gemm",
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/musa.py b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
index cba8c85a79..89962f3bc6 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/musa.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
@@ -175,6 +175,40 @@ def bgrad_quantize(
 
         return tex.bgrad_quantize(input, quantizer)
 
+    def group_quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        num_tensors: int,
+        first_dims: List[int],
+    ) -> Any:
+        tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+        return tex.group_quantize(tensor, quantizer, num_tensors, first_dims)
+
+    def bgrad_group_quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        num_tensors: int,
+        first_dims: List[int],
+    ) -> Any:
+        tex = self._get_tex()
+        try:
+            if quantizer is not None and hasattr(quantizer, "dtype") and hasattr(tex, "DType"):
+                qdtype = quantizer.dtype
+                if qdtype is not None:
+                    quantizer.dtype = tex.DType(int(qdtype))
+        except Exception:
+            pass
+        return tex.bgrad_group_quantize(tensor, quantizer, num_tensors, first_dims)
+
     def generic_gemm(
         self,
         A: Any,
@@ -230,6 +264,11 @@ def generic_gemm(
             beta,
         )
 
+    # GLU #
+    def glu(self, input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.glu(input, quantizer)
+
     # GELU and variants #
     def gelu(self, input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
@@ -283,6 +322,11 @@ def clamped_swiglu(
         tex = self._get_tex()
         return tex.clamped_swiglu(input, quantizer, limit, alpha)
 
+    # Backward of GLU #
+    def dglu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
+        tex = self._get_tex()
+        return tex.dglu(grad, fwd_input, quantizer)
+
     # Backward of GELU and variants #
     def dgelu(self, grad: torch.Tensor, fwd_input: torch.Tensor, quantizer: Any) -> Any:
         tex = self._get_tex()
@@ -577,9 +621,10 @@ def split_quantize(
         tensor: torch.Tensor,
         split_sections: List[int],
         quantizer_list: List[Any],
+        disable_bulk_allocation: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
-        return tex.split_quantize(tensor, split_sections, quantizer_list)
+        return tex.split_quantize(tensor, split_sections, quantizer_list, disable_bulk_allocation)
 
     def te_general_grouped_gemm(
         self,
@@ -624,6 +669,18 @@ def te_general_grouped_gemm(
             math_sm_count,
         )
 
+    def te_general_grouped_gemm_for_grouped_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_grouped_tensor(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_in(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_in(*args, **kwargs)
+
+    def te_general_grouped_gemm_for_discrete_out(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.te_general_grouped_gemm_for_discrete_out(*args, **kwargs)
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -642,6 +699,30 @@ def swap_first_dims(
         tex = self._get_tex()
         return tex.swap_first_dims(tensor, out)
 
+    def nvfp4_data_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_data_transpose(*args, **kwargs)
+
+    def swizzle_scales_for_gemm_(self, tensor: torch.Tensor) -> None:
+        tex = self._get_tex()
+        return tex.swizzle_scales_for_gemm_(tensor)
+
+    def grouped_swizzle_for_gemm(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.grouped_swizzle_for_gemm(*args, **kwargs)
+
+    def convert_host_pointers_to_tensor(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.convert_host_pointers_to_tensor(*args, **kwargs)
+
+    def get_device_pointer_for_data_and_scales(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.get_device_pointer_for_data_and_scales(*args, **kwargs)
+
+    def splits_to_offsets(self, first_dims, logical_last_dim):
+        tex = self._get_tex()
+        return tex.splits_to_offsets(first_dims, logical_last_dim)
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -750,6 +831,59 @@ def fp8_block_scaling_partial_cast(
             inp, out, scale, h, w, start_offset, block_len, out_dtype
         )
 
+    # MXFP8 scaling
+    def mxfp8_scaling_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.mxfp8_scaling_compute_partial_amax(*args, **kwargs)
+
+    def mxfp8_scaling_partial_cast(self, inp, out, scale, h, w, start_offset, block_len, out_dtype):
+        tex = self._get_tex()
+        out_dtype = tex.DType(int(out_dtype)) if out_dtype is not None else None
+        return tex.mxfp8_scaling_partial_cast(
+            inp, out, scale, h, w, start_offset, block_len, out_dtype
+        )
+
+    # NVFP4 2D
+    def nvfp4_2d_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_multi_tensor_compute_partial_amax(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_compute_partial_amax(*args, **kwargs)
+
+    def nvfp4_compute_global_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_global_scale(*args, **kwargs)
+
+    def nvfp4_compute_per_block_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_compute_per_block_scale(*args, **kwargs)
+
+    def nvfp4_expand_scale_to_fp8(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_expand_scale_to_fp8(*args, **kwargs)
+
+    def nvfp4_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_fused_scale(*args, **kwargs)
+
+    def nvfp4_multi_tensor_fused_scale(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_fused_scale(*args, **kwargs)
+
+    def nvfp4_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_multi_tensor_2d_partial_cast(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_multi_tensor_2d_partial_cast(*args, **kwargs)
+
+    def nvfp4_2d_multi_tensor_transpose(self, *args, **kwargs):
+        tex = self._get_tex()
+        return tex.nvfp4_2d_multi_tensor_transpose(*args, **kwargs)
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -1336,6 +1470,16 @@ def multi_tensor_scale(
         tex = self._get_tex()
         return tex.multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale)
 
+    def multi_tensor_scale_tensor(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_scale_tensor(chunk_size, noop_flag, tensor_lists, scale)
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1554,6 +1698,18 @@ def multi_tensor_compute_scale_and_scale_inv(
             chunk_size, noop_flag, tensor_lists, max_fp8, force_pow_2_scales, epsilon
         )
 
+    def multi_tensor_compute_scale_inv_e8m0(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        block_len: int,
+    ) -> None:
+        tex = self._get_tex()
+        return tex.multi_tensor_compute_scale_inv_e8m0(
+            chunk_size, noop_flag, tensor_lists, block_len
+        )
+
     # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/register_ops.py b/transformer_engine/plugin/core/backends/vendor/musa/register_ops.py
index 7027188369..cb3e3b7d29 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/register_ops.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/register_ops.py
@@ -105,6 +105,30 @@ def register_builtins(registry) -> None:
             vendor="MUSA",
             priority=100,
         ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_grouped_tensor",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_grouped_tensor, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_in",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_in, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="te_general_grouped_gemm_for_discrete_out",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.te_general_grouped_gemm_for_discrete_out, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         # Quantization
         OpImpl(
             op_name="quantize",
@@ -130,6 +154,22 @@ def register_builtins(registry) -> None:
             vendor="MUSA",
             priority=100,
         ),
+        OpImpl(
+            op_name="group_quantize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.group_quantize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="bgrad_group_quantize",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.bgrad_group_quantize, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         OpImpl(
             op_name="split_quantize",
             impl_id="vendor.musa",
@@ -139,6 +179,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Forward
+        OpImpl(
+            op_name="glu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.glu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         OpImpl(
             op_name="gelu",
             impl_id="vendor.musa",
@@ -228,6 +276,14 @@ def register_builtins(registry) -> None:
             priority=100,
         ),
         # Activations - Backward
+        OpImpl(
+            op_name="dglu",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.dglu, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         OpImpl(
             op_name="dgelu",
             impl_id="vendor.musa",
@@ -638,6 +694,54 @@ def register_builtins(registry) -> None:
             vendor="MUSA",
             priority=100,
         ),
+        OpImpl(
+            op_name="nvfp4_data_transpose",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_data_transpose, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="swizzle_scales_for_gemm_",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.swizzle_scales_for_gemm_, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="grouped_swizzle_for_gemm",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.grouped_swizzle_for_gemm, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="convert_host_pointers_to_tensor",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.convert_host_pointers_to_tensor, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="get_device_pointer_for_data_and_scales",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.get_device_pointer_for_data_and_scales, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="splits_to_offsets",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.splits_to_offsets, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         OpImpl(
             op_name="compute_amax",
             impl_id="vendor.musa",
@@ -670,6 +774,102 @@ def register_builtins(registry) -> None:
             vendor="MUSA",
             priority=100,
         ),
+        OpImpl(
+            op_name="mxfp8_scaling_compute_partial_amax",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_compute_partial_amax, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="mxfp8_scaling_partial_cast",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.mxfp8_scaling_partial_cast, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_compute_partial_amax",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_compute_partial_amax, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_compute_partial_amax",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_compute_partial_amax, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_global_scale",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_global_scale, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_compute_per_block_scale",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_compute_per_block_scale, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_expand_scale_to_fp8",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_expand_scale_to_fp8, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_fused_scale",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_fused_scale, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_fused_scale",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_fused_scale, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_partial_cast",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_partial_cast, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_multi_tensor_2d_partial_cast",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_multi_tensor_2d_partial_cast, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
+        OpImpl(
+            op_name="nvfp4_2d_multi_tensor_transpose",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.nvfp4_2d_multi_tensor_transpose, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         # Padding operations
         OpImpl(
             op_name="fused_multi_row_padding",
@@ -819,6 +1019,14 @@ def register_builtins(registry) -> None:
             vendor="MUSA",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_scale_tensor",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_scale_tensor, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         OpImpl(
             op_name="multi_tensor_l2norm",
             impl_id="vendor.musa",
@@ -891,6 +1099,14 @@ def register_builtins(registry) -> None:
             vendor="MUSA",
             priority=100,
         ),
+        OpImpl(
+            op_name="multi_tensor_compute_scale_inv_e8m0",
+            impl_id="vendor.musa",
+            kind=BackendImplKind.VENDOR,
+            fn=_bind_is_available(backend.multi_tensor_compute_scale_inv_e8m0, is_avail),
+            vendor="MUSA",
+            priority=100,
+        ),
         # Communication overlap operations
         OpImpl(
             op_name="bulk_overlap_ag_with_external_gemm",
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 7e39bef7a3..81fb91b75d 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -442,6 +442,14 @@ def generic_gemm(
     ) -> List[Any]:
         raise NotImplementedError
 
+    # GLU #
+    def glu(
+        self,
+        input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
     # GELU and variants #
     def gelu(
         self,
@@ -524,6 +532,15 @@ def clamped_swiglu(
     ) -> Any:
         raise NotImplementedError
 
+    # Backward of GLU #
+    def dglu(
+        self,
+        grad: torch.Tensor,
+        fwd_input: torch.Tensor,
+        quantizer: Any,
+    ) -> Any:
+        raise NotImplementedError
+
     # Backward of GELU and variants #
     def dgelu(
         self,
@@ -834,11 +851,30 @@ def multi_tensor_quantize(
     ) -> List[Any]:
         raise NotImplementedError
 
+    def group_quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        num_tensors: int,
+        first_dims: List[int],
+    ) -> Any:
+        raise NotImplementedError
+
+    def bgrad_group_quantize(
+        self,
+        tensor: torch.Tensor,
+        quantizer: Any,
+        num_tensors: int,
+        first_dims: List[int],
+    ) -> Any:
+        raise NotImplementedError
+
     def split_quantize(
         self,
         tensor: torch.Tensor,
         split_sections: List[int],
         quantizer_list: List[Any],
+        disable_bulk_allocation: bool = False,
     ) -> List[Any]:
         raise NotImplementedError
 
@@ -864,6 +900,27 @@ def te_general_grouped_gemm(
     ) -> Optional[List[torch.Tensor]]:
         raise NotImplementedError
 
+    def te_general_grouped_gemm_for_grouped_tensor(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional[List[torch.Tensor]]:
+        raise NotImplementedError
+
+    def te_general_grouped_gemm_for_discrete_in(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional[List[torch.Tensor]]:
+        raise NotImplementedError
+
+    def te_general_grouped_gemm_for_discrete_out(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional[List[torch.Tensor]]:
+        raise NotImplementedError
+
     def fp8_transpose(
         self,
         input: torch.Tensor,
@@ -879,6 +936,50 @@ def swap_first_dims(
     ) -> torch.Tensor:
         raise NotImplementedError
 
+    def nvfp4_data_transpose(
+        self,
+        input: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def swizzle_scales_for_gemm_(
+        self,
+        tensor: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError
+
+    def grouped_swizzle_for_gemm(
+        self,
+        tensor: Any,
+        rowwise: bool,
+        columnwise: bool,
+    ) -> None:
+        raise NotImplementedError
+
+    def convert_host_pointers_to_tensor(
+        self,
+        tensor_lists: List[List[torch.Tensor]],
+    ) -> Any:
+        raise NotImplementedError
+
+    def get_device_pointer_for_data_and_scales(
+        self,
+        data_tensors: List[torch.Tensor],
+        scale_tensors: List[torch.Tensor],
+        swizzle: bool = False,
+        rowwise: bool = True,
+        data_dtype: Any = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def splits_to_offsets(
+        self,
+        first_dims: List[int],
+        logical_last_dim: int,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
     def get_fused_attn_backend(
         self,
         is_training: bool,
@@ -943,6 +1044,129 @@ def fp8_block_scaling_partial_cast(
     ) -> None:
         raise NotImplementedError
 
+    # MXFP8 scaling
+    def mxfp8_scaling_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+    ) -> None:
+        raise NotImplementedError
+
+    def mxfp8_scaling_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int,
+        out_dtype: DType,
+    ) -> None:
+        raise NotImplementedError
+
+    # NVFP4 2D
+    def nvfp4_2d_compute_partial_amax(
+        self,
+        tensor: torch.Tensor,
+        amax: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int = 16,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_multi_tensor_compute_partial_amax(
+        self,
+        master_weight_list: List[torch.Tensor],
+        partial_amax_list: List[torch.Tensor],
+        global_amax_list: List[torch.Tensor],
+        h_list: List[int],
+        w_list: List[int],
+        start_offset_list: List[int],
+        block_len: int = 16,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_compute_global_scale(
+        self,
+        global_amaxes: torch.Tensor,
+        global_scale_tensor: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_compute_per_block_scale(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_expand_scale_to_fp8(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_fused_scale(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_multi_tensor_fused_scale(
+        self,
+        block_amax_list: List[torch.Tensor],
+        global_amax_list: List[torch.Tensor],
+        per_block_scale_list: List[torch.Tensor],
+        target_scale_list: List[torch.Tensor],
+        target_amax_list: List[torch.Tensor],
+        tile_rows_list: List[int],
+        tile_cols_list: List[int],
+        rows_padded_list: List[int],
+        block_len: int,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_2d_partial_cast(
+        self,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        scale: torch.Tensor,
+        global_scale: torch.Tensor,
+        h: int,
+        w: int,
+        start_offset: int,
+        block_len: int = 16,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_multi_tensor_2d_partial_cast(
+        self,
+        inp_list: List[torch.Tensor],
+        *args,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+
+    def nvfp4_2d_multi_tensor_transpose(
+        self,
+        rowwise_data_list: List[torch.Tensor],
+        columnwise_data_list: List[torch.Tensor],
+        rowwise_scale_inv_list: List[torch.Tensor],
+        columnwise_scale_inv_list: List[torch.Tensor],
+        M_list: List[int],
+        K_list: List[int],
+    ) -> None:
+        raise NotImplementedError
+
     def fused_multi_row_padding(
         self,
         input: torch.Tensor,
@@ -1329,6 +1553,15 @@ def multi_tensor_scale(
     ) -> None:
         raise NotImplementedError
 
+    def multi_tensor_scale_tensor(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        scale: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError
+
     def multi_tensor_l2norm(
         self,
         chunk_size: int,
@@ -1458,6 +1691,15 @@ def multi_tensor_compute_scale_and_scale_inv(
     ) -> None:
         raise NotImplementedError
 
+    def multi_tensor_compute_scale_inv_e8m0(
+        self,
+        chunk_size: int,
+        noop_flag: torch.Tensor,
+        tensor_lists: List[List[torch.Tensor]],
+        block_len: int,
+    ) -> None:
+        raise NotImplementedError
+
     # Comm+GEMM Overlap
     def bulk_overlap_ag_with_external_gemm(
         self,
diff --git a/transformer_engine/pytorch/permutation.py b/transformer_engine/pytorch/permutation.py
index 2899b0e724..b103fc6992 100644
--- a/transformer_engine/pytorch/permutation.py
+++ b/transformer_engine/pytorch/permutation.py
@@ -45,9 +45,13 @@ def forward(
 
         # Device check
         if inp.device.type != te_device_type():
-            raise ValueError(f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}.")
+            raise ValueError(
+                f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}."
+            )
         if index.device.type != te_device_type():
-            raise ValueError(f"index must be a {te_device_type()} tensor, but got tensor on {index.device}.")
+            raise ValueError(
+                f"index must be a {te_device_type()} tensor, but got tensor on {index.device}."
+            )
         # Shape check
         if inp.size(0) != index.size(0):
             raise ValueError(
@@ -128,7 +132,9 @@ def forward(
         # None probs check
         if probs is not None:
             if probs.device.type != te_device_type():
-                raise ValueError(f"probs must be a {te_device_type()} tensor, but got tensor on {probs.device}.")
+                raise ValueError(
+                    f"probs must be a {te_device_type()} tensor, but got tensor on {probs.device}."
+                )
 
             if probs.dtype != torch.float32:
                 warnings.warn(
@@ -146,10 +152,13 @@ def forward(
 
         # Device check
         if inp.device.type != te_device_type():
-            raise ValueError(f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}.")
+            raise ValueError(
+                f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}."
+            )
         if row_id_map.device.type != te_device_type():
             raise ValueError(
-                f"row_id_map must be a {te_device_type()} tensor, but got tensor on {row_id_map.device}."
+                f"row_id_map must be a {te_device_type()} tensor, but got tensor on"
+                f" {row_id_map.device}."
             )
 
         # Data type check
@@ -212,18 +221,24 @@ def forward(
             return inp, torch.tensor([], device=inp.device), torch.tensor([], device=inp.device)
 
         if inp.device.type != te_device_type():
-            raise ValueError(f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}.")
+            raise ValueError(
+                f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}."
+            )
         if routing_map.device.type != te_device_type():
             raise ValueError(
-                f"routing_map must be a {te_device_type()} tensor, but got tensor on {routing_map.device}."
+                f"routing_map must be a {te_device_type()} tensor, but got tensor on"
+                f" {routing_map.device}."
             )
         if probs is not None:
             if probs.device.type != te_device_type():
-                raise ValueError(f"probs must be a {te_device_type()} tensor, but got tensor on {probs.device}.")
+                raise ValueError(
+                    f"probs must be a {te_device_type()} tensor, but got tensor on {probs.device}."
+                )
         if pad_offsets is not None:
             if pad_offsets.device.type != te_device_type():
                 raise ValueError(
-                    f"pad_offsets must be a {te_device_type()} tensor, but got tensor on {pad_offsets.device}."
+                    f"pad_offsets must be a {te_device_type()} tensor, but got tensor on"
+                    f" {pad_offsets.device}."
                 )
 
         if inp.size(0) != routing_map.size(0):
@@ -400,21 +415,26 @@ def forward(
         if with_probs:
             if merging_probs.device.type != te_device_type():
                 raise ValueError(
-                    "merging_probs must be a " + te_device_type() + " tensor, but got tensor on "
-                    f"{merging_probs.device}."
+                    "merging_probs must be a "
+                    + te_device_type()
+                    + f" tensor, but got tensor on {merging_probs.device}."
                 )
 
         # Device check
         if inp.device.type != te_device_type():
-            raise ValueError(f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}.")
+            raise ValueError(
+                f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}."
+            )
         if row_id_map.device.type != te_device_type():
             raise ValueError(
-                f"row_id_map must be a {te_device_type()} tensor, but got tensor on {row_id_map.device}."
+                f"row_id_map must be a {te_device_type()} tensor, but got tensor on"
+                f" {row_id_map.device}."
             )
         if pad_offsets is not None:
             if pad_offsets.device.type != te_device_type():
                 raise ValueError(
-                    f"pad_offsets must be a {te_device_type()} tensor, but got tensor on {pad_offsets.device}."
+                    f"pad_offsets must be a {te_device_type()} tensor, but got tensor on"
+                    f" {pad_offsets.device}."
                 )
 
         if isinstance(inp, QuantizedTensor):
@@ -780,18 +800,24 @@ def forward(
             return inp, probs
 
         if inp.device.type != te_device_type():
-            raise ValueError(f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}.")
+            raise ValueError(
+                f"inp must be a {te_device_type()} tensor, but got tensor on {inp.device}."
+            )
         if split_sizes.device.type != te_device_type():
             raise ValueError(
-                f"split_sizes must be a {te_device_type()} tensor, but got tensor on {split_sizes.device}."
+                f"split_sizes must be a {te_device_type()} tensor, but got tensor on"
+                f" {split_sizes.device}."
             )
         if sorted_idxs.device.type != te_device_type():
             raise ValueError(
-                f"sorted_idxs must be a {te_device_type()} tensor, but got tensor on {sorted_idxs.device}."
+                f"sorted_idxs must be a {te_device_type()} tensor, but got tensor on"
+                f" {sorted_idxs.device}."
             )
         if probs is not None:
             if probs.device.type != te_device_type():
-                raise ValueError(f"probs must be a {te_device_type()} tensor, but got tensor on {probs.device}.")
+                raise ValueError(
+                    f"probs must be a {te_device_type()} tensor, but got tensor on {probs.device}."
+                )
 
         num_tokens, hidden_size = inp.shape
         num_splits = split_sizes.size(0)
diff --git a/transformer_engine/pytorch/triton/permutation.py b/transformer_engine/pytorch/triton/permutation.py
index 70fbc73331..554879236c 100644
--- a/transformer_engine/pytorch/triton/permutation.py
+++ b/transformer_engine/pytorch/triton/permutation.py
@@ -166,7 +166,9 @@ def permute_with_mask_map(
     alloc = torch.zeros if pad_offsets is not None else torch.empty
     output = alloc((num_out_tokens, hidden_size), dtype=inp.dtype, device=te_device_type())
     permuted_probs = (
-        alloc((num_out_tokens,), dtype=probs.dtype, device=te_device_type()) if probs is not None else None
+        alloc((num_out_tokens,), dtype=probs.dtype, device=te_device_type())
+        if probs is not None
+        else None
     )
     permuted_scale = (
         alloc((num_out_tokens, scale_hidden_dim), dtype=scale.dtype, device=te_device_type())
@@ -329,7 +331,9 @@ def unpermute_with_mask_map_bwd_with_merging_probs(
     # by the kernel. This matches the behavior of Fp8Unpadding.backward which zeros
     # out the padding slots.
     alloc = torch.zeros if pad_offsets is not None else torch.empty
-    act_grad = alloc((num_out_tokens, hidden_size), dtype=fwd_output_grad.dtype, device=te_device_type())
+    act_grad = alloc(
+        (num_out_tokens, hidden_size), dtype=fwd_output_grad.dtype, device=te_device_type()
+    )
     merging_probs_grad = torch.empty(
         (num_tokens, num_experts), dtype=merging_probs.dtype, device=te_device_type()
     )

From 3230b42671b1a556fabba0e0a4c0a77b31d6db32 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Mon, 13 Apr 2026 15:31:41 +0800
Subject: [PATCH 45/59] patch: normalize new upstream 'cuda' string hardcoding
 to te_device_type()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scanned Python-layer diff (base..dev, excluding csrc) for newly introduced
hardcoded 'cuda' device strings. Replaced 11 instances across 7 files:
- device=torch.device('cuda') → device=torch.device(te_device_type()): 3
- device='cuda' → device=te_device_type(): 1
- .device.type == 'cuda' → .device.type == te_device_type(): 2
- get_autocast_dtype('cuda') → get_autocast_dtype(te_device_type()): 5
Skipped 10 intentional default parameter values and docstrings.
torch.cuda.* API calls left as-is (handled by vendor patches.py at runtime).
---
 transformer_engine/pytorch/cpu_offload.py                  | 5 +++--
 .../pytorch/custom_recipes/quantization_current_scaling.py | 3 ++-
 transformer_engine/pytorch/ops/basic/grouped_linear.py     | 3 ++-
 transformer_engine/pytorch/ops/basic/swiglu.py             | 7 ++++---
 .../pytorch/ops/fused/forward_grouped_mlp.py               | 3 ++-
 .../pytorch/tensor/float8_blockwise_tensor.py              | 2 +-
 transformer_engine/pytorch/tensor/grouped_tensor.py        | 3 ++-
 .../pytorch/tensor/storage/grouped_tensor_storage.py       | 3 ++-
 8 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/transformer_engine/pytorch/cpu_offload.py b/transformer_engine/pytorch/cpu_offload.py
index d0b314a64f..f1d1de64b1 100644
--- a/transformer_engine/pytorch/cpu_offload.py
+++ b/transformer_engine/pytorch/cpu_offload.py
@@ -16,6 +16,7 @@
 from transformer_engine.debug.pytorch.debug_state import TEDebugState
 import transformer_engine.pytorch as te
 import transformer_engine.pytorch.cpu_offload_v1 as v1_code_path
+from transformer_engine import te_device_type
 from .quantized_tensor import (
     restore_from_saved,
     prepare_for_saving,
@@ -345,7 +346,7 @@ def start_reload(self):
             # cannot move tensors from pool of one stream to another without
             # calling cudaFree and cudaMalloc again.
 
-            reloaded_tensor = torch.empty_like(tensor, device=torch.device("cuda"))
+            reloaded_tensor = torch.empty_like(tensor, device=torch.device(te_device_type()))
             self.offload_stream.wait_stream(torch.cuda.current_stream())
 
             with torch.cuda.stream(self.offload_stream):
@@ -461,7 +462,7 @@ def _check_if_offload(self, t: torch.Tensor) -> bool:
             not isinstance(t, torch.nn.Parameter)
             and not getattr(t, "_TE_do_not_offload", False)
             and not isinstance(t, torch._subclasses.FakeTensor)
-            and t.device.type == "cuda"
+            and t.device.type == te_device_type()
         ):
             if not t.is_contiguous() and not getattr(t, "offload_base_tensor", False):
                 warnings.warn(
diff --git a/transformer_engine/pytorch/custom_recipes/quantization_current_scaling.py b/transformer_engine/pytorch/custom_recipes/quantization_current_scaling.py
index 8580cf4a33..c11c0e34fa 100644
--- a/transformer_engine/pytorch/custom_recipes/quantization_current_scaling.py
+++ b/transformer_engine/pytorch/custom_recipes/quantization_current_scaling.py
@@ -10,6 +10,7 @@
 
 import torch
 
+from transformer_engine import te_device_type
 from transformer_engine.pytorch.custom_recipes import quantization
 from transformer_engine.pytorch.custom_recipes import utils
 from transformer_engine.pytorch.quantized_tensor import QuantizedTensorStorage, Quantizer
@@ -498,7 +499,7 @@ def make_empty(
 
         # Canonicalize tensor attributes
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
         # Allocate quantized data
         qx = torch.empty(shape, dtype=self.dtype, device=device)
diff --git a/transformer_engine/pytorch/ops/basic/grouped_linear.py b/transformer_engine/pytorch/ops/basic/grouped_linear.py
index f26a337a4d..c9acbb7a53 100644
--- a/transformer_engine/pytorch/ops/basic/grouped_linear.py
+++ b/transformer_engine/pytorch/ops/basic/grouped_linear.py
@@ -13,6 +13,7 @@
 
 import torch
 
+from transformer_engine import te_device_type
 import transformer_engine_torch as tex
 from ...cpp_extensions import general_grouped_gemm
 from ...distributed import CudaRNGStatesTracker
@@ -690,7 +691,7 @@ def fuser_forward(
 
         # Get autocast dtype if needed
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = weight_param.dtype
 
diff --git a/transformer_engine/pytorch/ops/basic/swiglu.py b/transformer_engine/pytorch/ops/basic/swiglu.py
index b4427df41a..caecc03b30 100644
--- a/transformer_engine/pytorch/ops/basic/swiglu.py
+++ b/transformer_engine/pytorch/ops/basic/swiglu.py
@@ -10,6 +10,7 @@
 
 import torch
 
+from transformer_engine import te_device_type
 import transformer_engine_torch as tex
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...tensor import Float8CurrentScalingQuantizer, Quantizer
@@ -90,7 +91,7 @@ def op_forward(
         # Compute dtype
         dtype: torch.dtype
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = input_.dtype
         if dtype not in (torch.float32, torch.float16, torch.bfloat16):
@@ -242,7 +243,7 @@ def op_forward(
         # Compute dtype
         dtype: torch.dtype
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = input_.dtype
         if dtype not in (torch.float32, torch.float16, torch.bfloat16):
@@ -400,7 +401,7 @@ def fuser_forward(
 
         # Determine compute dtype
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         elif isinstance(input_, torch.Tensor):
             dtype = input_.dtype
         else:
diff --git a/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py b/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
index c5ce2b148d..c096d229af 100644
--- a/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
+++ b/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
@@ -13,6 +13,7 @@
 
 import torch
 
+from transformer_engine import te_device_type
 import transformer_engine_torch as tex
 from ...quantization import Recipe
 from ...tensor import Quantizer
@@ -157,7 +158,7 @@ def fuser_forward(
         fc2_weight_param = fc2_op.weight if fc2_op.single_grouped_weight else fc2_op.weight0
         device = fc1_weight_param.device
         if torch.is_autocast_enabled():
-            dtype = torch.get_autocast_dtype("cuda")
+            dtype = torch.get_autocast_dtype(te_device_type())
         else:
             dtype = fc1_weight_param.dtype
 
diff --git a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
index 351ec1ed88..dfcae153e6 100644
--- a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
+++ b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
@@ -220,7 +220,7 @@ def make_empty(
             device = torch.device(te_device_type())
 
         tensor_kwargs = {
-            "device": torch.device("cuda") if device is None else device,
+            "device": torch.device(te_device_type()) if device is None else device,
             "pin_memory": pin_memory,
         }
 
diff --git a/transformer_engine/pytorch/tensor/grouped_tensor.py b/transformer_engine/pytorch/tensor/grouped_tensor.py
index ab0c7484fc..ffd179b6fe 100644
--- a/transformer_engine/pytorch/tensor/grouped_tensor.py
+++ b/transformer_engine/pytorch/tensor/grouped_tensor.py
@@ -12,6 +12,7 @@
 
 from ..quantized_tensor import QuantizedTensorStorage, Quantizer
 from .storage.grouped_tensor_storage import GroupedTensorStorage
+from transformer_engine import te_device_type
 
 
 def _stride_from_shape(shape: Tuple[int, ...]) -> Tuple[int, ...]:
@@ -128,7 +129,7 @@ def __new__(
                 device = maybe_tensor.device
                 break
         if device is None:
-            device = torch.device("cuda")
+            device = torch.device(te_device_type())
 
         # Match QuantizedTensor __new__: accept externally-computed stride to
         # avoid Python-side stride computation overhead for C++ construction.
diff --git a/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py b/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py
index ff1c78f695..93ed175989 100644
--- a/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py
+++ b/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py
@@ -18,6 +18,7 @@
 from .mxfp8_tensor_storage import MXFP8TensorStorage
 from .float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage
 from .nvfp4_tensor_storage import NVFP4TensorStorage
+from transformer_engine import te_device_type
 
 
 class GroupedTensorStorage:
@@ -563,7 +564,7 @@ def make_grouped_tensor(
             # TODO(ksivaman): Single kernel + remove the host offset calculation.
             tensor_offsets = GroupedTensorStorage.make_tensor_offsets(first_dims, logical_last_dim)
             if (
-                first_dims.device.type == "cuda"
+                first_dims.device.type == te_device_type()
                 and torch.cuda.is_available()
                 and torch.cuda.is_current_stream_capturing()
             ):

From 0ebf5258a8a6d5ae0018dcdb482ea9c65005f86b Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Mon, 13 Apr 2026 16:34:20 +0800
Subject: [PATCH 46/59] fix: update stale references in fork code to match
 upstream renames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scanned fork-specific code (new in merge vs dev) for references to
functions, classes, and file paths that upstream renamed or relocated
between base and dev. Fixed 6 stale reference(s):
- _load_cudnn() → _load_cuda_library("cudnn")
- _load_nvrtc() → _load_cuda_library("nvrtc")
- _load_curand() → _load_cuda_library("curand")
- _load_nvidia_cuda_library("cublas"/"cuda_runtime") → _load_cuda_library_from_python()
- tensor.quantized_tensor → quantized_tensor (pytorch/utils.py)
- tensor.quantized_tensor → quantized_tensor (flagos backends.py)
---
 transformer_engine/common/__init__.py         | 21 ++++++++++++++-----
 .../dot_product_attention/backends.py         |  2 +-
 transformer_engine/pytorch/utils.py           |  2 +-
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index bbe55151e8..fe933e0191 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -403,11 +403,22 @@ def _load_core_library():
 
     # Skip loading CUDA libraries if CUDA build was skipped (FL-only mode)
     if not skip_cuda_build():
-        _CUDNN_LIB_CTYPES = _load_cudnn()
-        _NVRTC_LIB_CTYPES = _load_nvrtc()
-        _CURAND_LIB_CTYPES = _load_curand()
-        _CUBLAS_LIB_CTYPES = _load_nvidia_cuda_library("cublas")
-        _CUDART_LIB_CTYPES = _load_nvidia_cuda_library("cuda_runtime")
+        # `_load_cuda_library` is used for packages that must be loaded
+        # during runtime. Both system and pypi packages are searched
+        # and an error is thrown if not found.
+        _, _CUDNN_LIB_CTYPES = _load_cuda_library("cudnn")
+        system_nvrtc, _NVRTC_LIB_CTYPES = _load_cuda_library("nvrtc")
+        system_curand, _CURAND_LIB_CTYPES = _load_cuda_library("curand")
+
+        # This additional step is necessary to be able to install TE wheels
+        # and import TE (without any guards) in an environment where the cuda
+        # toolkit might be absent without being guarded
+        load_libs_for_no_ctk = not system_nvrtc and not system_curand
+        if load_libs_for_no_ctk:
+            _CUBLAS_LIB_CTYPES = _load_cuda_library_from_python("cublas", strict=True)
+            _CUDART_LIB_CTYPES = _load_cuda_library_from_python("cudart", strict=True)
+            _CUDNN_ALL_LIB_CTYPES = _load_cuda_library_from_python("cudnn", strict=True)
+
         _TE_LIB_CTYPES = _load_core_library()
 
         # Needed to find the correct headers for NVRTC kernels.
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index f967dc54d8..490e2d3a90 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -15,7 +15,7 @@
 )
 from transformer_engine.pytorch.utils import nvtx_range_push, nvtx_range_pop
 
-from transformer_engine.pytorch.tensor.quantized_tensor import (
+from transformer_engine.pytorch.quantized_tensor import (
     prepare_for_saving,
     restore_from_saved,
 )
diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py
index 757463801f..4e3109cf89 100644
--- a/transformer_engine/pytorch/utils.py
+++ b/transformer_engine/pytorch/utils.py
@@ -15,7 +15,7 @@
 from transformer_engine import te_device_type
 
 from .torch_version import torch_version
-from .tensor.quantized_tensor import Quantizer
+from .quantized_tensor import Quantizer
 from ..debug.pytorch.debug_quantization import DebugQuantizedTensor
 
 

From 3c86a95b691e257a92cee5047af8d1b8e4593cb0 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Mon, 13 Apr 2026 22:01:53 +0800
Subject: [PATCH 47/59] plugin: sync plugin APIs with upstream csrc changes

Updated plugin OP API layer to match pytorch/csrc/ pybind changes
between base and dev branches. Changes applied to:
- ops.py base class (TEFLBackendBase): added cuda_graph, deterministic to get_fused_attn_backend
- ops.py FlashAttentionBase: added num_splits to forward/_forward_impl signatures
- All vendor FlashAttention subclasses (cuda, hygon, metax, musa, kunlunxin)
- All 5 vendor backends get_fused_attn_backend (cuda, iluvatar, metax, musa, hygon)
- Reference and flagos backends updated for both APIs
- Verified get_attention_backend/AttentionParams pass-through (no changes needed)
See /tmp/plugin_api_changes.log for details.
---
 .../flagos/attention/dot_product_attention/backends.py     | 1 +
 .../plugin/core/backends/reference/flash_attention.py      | 1 +
 .../plugin/core/backends/reference/reference.py            | 2 ++
 .../plugin/core/backends/vendor/cuda/cuda.py               | 4 ++++
 .../plugin/core/backends/vendor/cuda/flash_attention.py    | 2 ++
 .../plugin/core/backends/vendor/hygon/flash_attention.py   | 2 ++
 .../plugin/core/backends/vendor/hygon/hygon.py             | 4 ++++
 .../plugin/core/backends/vendor/iluvatar/iluvatar.py       | 4 ++++
 .../core/backends/vendor/kunlunxin/flash_attention.py      | 1 +
 .../plugin/core/backends/vendor/metax/flash_attention.py   | 2 ++
 .../plugin/core/backends/vendor/metax/metax.py             | 4 ++++
 .../plugin/core/backends/vendor/musa/flash_attention.py    | 2 ++
 .../plugin/core/backends/vendor/musa/musa.py               | 4 ++++
 transformer_engine/plugin/core/ops.py                      | 7 +++++++
 14 files changed, 40 insertions(+)

diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index 490e2d3a90..d466ad9ac8 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -278,6 +278,7 @@ def _forward_impl(
         inference_params: Optional[InferenceParams] = None,
         flash_attention_backend: Optional[PkgVersion] = PkgVersion("0"),
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         assert all(
             x.dtype in [torch.float16, torch.bfloat16] or isinstance(x, Float8Tensor)
diff --git a/transformer_engine/plugin/core/backends/reference/flash_attention.py b/transformer_engine/plugin/core/backends/reference/flash_attention.py
index 9a8b9e932b..10a730ac52 100644
--- a/transformer_engine/plugin/core/backends/reference/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/reference/flash_attention.py
@@ -223,6 +223,7 @@ def _forward_impl(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         """Flash Attention implementation using PyTorch's scaled_dot_product_attention.
 
diff --git a/transformer_engine/plugin/core/backends/reference/reference.py b/transformer_engine/plugin/core/backends/reference/reference.py
index 9755d85373..7f52c41677 100644
--- a/transformer_engine/plugin/core/backends/reference/reference.py
+++ b/transformer_engine/plugin/core/backends/reference/reference.py
@@ -447,6 +447,8 @@ def get_fused_attn_backend(
         _window_size_left: int,
         _window_size_right: int,
         _return_max_logit: bool,
+        _cuda_graph: bool = False,
+        _deterministic: bool = False,
     ) -> NVTE_Fused_Attn_Backend:
         return NVTE_Fused_Attn_Backend.NVTE_No_Backend
 
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index c9a3457902..88f2a554f6 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -797,6 +797,8 @@ def get_fused_attn_backend(
         window_size_left: int,
         window_size_right: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
+        deterministic: bool = False,
     ) -> NVTE_Fused_Attn_Backend:
         tex = self._get_tex()
 
@@ -829,6 +831,8 @@ def get_fused_attn_backend(
             window_size_left,
             window_size_right,
             return_max_logit,
+            cuda_graph,
+            deterministic,
         )
         return NVTE_Fused_Attn_Backend(result)
 
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
index 4137ce1b4c..23295e51a5 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/flash_attention.py
@@ -97,6 +97,7 @@ def _forward_impl(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         # Ensure native flash attention is initialized
         self._ensure_native_flash_attn()
@@ -124,4 +125,5 @@ def _forward_impl(
             inference_params=inference_params,
             flash_attention_backend=flash_attention_backend,
             fp8_output=fp8_output,
+            num_splits=num_splits,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
index cad4a13f35..eb2fbd4584 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/flash_attention.py
@@ -97,6 +97,7 @@ def _forward_impl(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         # Ensure native flash attention is initialized
         self._ensure_native_flash_attn()
@@ -124,4 +125,5 @@ def _forward_impl(
             inference_params=inference_params,
             flash_attention_backend=flash_attention_backend,
             fp8_output=fp8_output,
+            num_splits=num_splits,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index a08a2bf434..5171cbc0c0 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -751,6 +751,8 @@ def get_fused_attn_backend(
         window_size_left: int,
         window_size_right: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
+        deterministic: bool = False,
     ) -> NVTE_Fused_Attn_Backend:
         tex = self._get_tex()
 
@@ -783,6 +785,8 @@ def get_fused_attn_backend(
             window_size_left,
             window_size_right,
             return_max_logit,
+            cuda_graph,
+            deterministic,
         )
         return NVTE_Fused_Attn_Backend(result)
 
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
index 305d9bb977..4f89cb386e 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -775,6 +775,8 @@ def get_fused_attn_backend(
         window_size_left: int,
         window_size_right: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
+        deterministic: bool = False,
     ) -> NVTE_Fused_Attn_Backend:
         tex = self._get_tex()
 
@@ -807,6 +809,8 @@ def get_fused_attn_backend(
             window_size_left,
             window_size_right,
             return_max_logit,
+            cuda_graph,
+            deterministic,
         )
         return NVTE_Fused_Attn_Backend(result)
 
diff --git a/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
index 7135566e95..9beb5403ed 100644
--- a/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/kunlunxin/flash_attention.py
@@ -211,6 +211,7 @@ def _forward_impl(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         """Flash Attention implementation using PyTorch's scaled_dot_product_attention."""
         if fp8:
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
index 49fdf56dde..30d6c488ae 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/flash_attention.py
@@ -97,6 +97,7 @@ def _forward_impl(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         # Ensure metax flash attention is initialized
         self._ensure_metax_flash_attn()
@@ -124,4 +125,5 @@ def _forward_impl(
             inference_params=inference_params,
             flash_attention_backend=flash_attention_backend,
             fp8_output=fp8_output,
+            num_splits=num_splits,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index 9b1884102a..d21b7d315e 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -730,6 +730,8 @@ def get_fused_attn_backend(
         window_size_left: int,
         window_size_right: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
+        deterministic: bool = False,
     ) -> NVTE_Fused_Attn_Backend:
         tex = self._get_tex()
 
@@ -762,6 +764,8 @@ def get_fused_attn_backend(
             window_size_left,
             window_size_right,
             return_max_logit,
+            cuda_graph,
+            deterministic,
         )
         return NVTE_Fused_Attn_Backend(result)
 
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py b/transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py
index 1ef37407d4..cd03e82414 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/flash_attention.py
@@ -97,6 +97,7 @@ def _forward_impl(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         # Ensure musa flash attention is initialized
         self._ensure_musa_flash_attn()
@@ -124,4 +125,5 @@ def _forward_impl(
             inference_params=inference_params,
             flash_attention_backend=flash_attention_backend,
             fp8_output=fp8_output,
+            num_splits=num_splits,
         )
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/musa.py b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
index 89962f3bc6..c24d50ae76 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/musa.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
@@ -742,6 +742,8 @@ def get_fused_attn_backend(
         window_size_left: int,
         window_size_right: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
+        deterministic: bool = False,
     ) -> NVTE_Fused_Attn_Backend:
         tex = self._get_tex()
 
@@ -774,6 +776,8 @@ def get_fused_attn_backend(
             window_size_left,
             window_size_right,
             return_max_logit,
+            cuda_graph,
+            deterministic,
         )
         return NVTE_Fused_Attn_Backend(result)
 
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 81fb91b75d..3767f2ab33 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -253,6 +253,7 @@ def _forward_impl(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         """
         Actual forward implementation - subclasses must implement this.
@@ -285,6 +286,7 @@ def forward(
         inference_params: Optional[Any] = None,
         flash_attention_backend: Optional[Any] = None,
         fp8_output: bool = False,
+        num_splits: Optional[int] = 1,
     ) -> torch.Tensor:
         """
         Forward pass with automatic fallback support and caching.
@@ -314,6 +316,7 @@ def forward(
                 inference_params=inference_params,
                 flash_attention_backend=flash_attention_backend,
                 fp8_output=fp8_output,
+                num_splits=num_splits,
             )
 
         def call_impl_fn(impl_class):
@@ -341,6 +344,7 @@ def call_impl_fn(impl_class):
                     inference_params=inference_params,
                     flash_attention_backend=flash_attention_backend,
                     fp8_output=fp8_output,
+                    num_splits=num_splits,
                 )
             else:
                 fallback_instance = impl_class(**self._init_params)
@@ -369,6 +373,7 @@ def call_impl_fn(impl_class):
                     inference_params=inference_params,
                     flash_attention_backend=flash_attention_backend,
                     fp8_output=fp8_output,
+                    num_splits=num_splits,
                 )
 
         return self._manager.call_with_custom_impl(
@@ -999,6 +1004,8 @@ def get_fused_attn_backend(
         window_size_left: int,
         window_size_right: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
+        deterministic: bool = False,
     ) -> NVTE_Fused_Attn_Backend:
         raise NotImplementedError
 

From cc03ca31618e57143cb01a0b5c2a4f5111d1a209 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Tue, 14 Apr 2026 17:10:03 +0800
Subject: [PATCH 48/59] fix(stage9): add bottom_right_diagonal and cuda_graph
 params to fused_attn_fwd/bwd

Found during batch validation combo 2/9
(te_fl_prefer=vendor, attention_backend=fused, attempt 1).
Error: CUDABackend.fused_attn_fwd() takes 29 positional arguments but 31 were given
Root cause: upstream merge added bottom_right_diagonal and cuda_graph params to the
caller (cpp_extensions/fused_attn.py) but the plugin backend signatures were not updated.
Fix: added both params to ops.py base class, CUDA backend, and all vendor backends
(musa, iluvatar, hygon, metax) for both fused_attn_fwd and fused_attn_bwd.
---
 flagscale_validation_logs/batch_results.json         | 12 ++++++++++++
 .../plugin/core/backends/vendor/cuda/cuda.py         |  8 ++++++++
 .../plugin/core/backends/vendor/hygon/hygon.py       |  8 ++++++++
 .../plugin/core/backends/vendor/iluvatar/iluvatar.py |  8 ++++++++
 .../plugin/core/backends/vendor/metax/metax.py       |  8 ++++++++
 .../plugin/core/backends/vendor/musa/musa.py         |  8 ++++++++
 transformer_engine/plugin/core/ops.py                |  4 ++++
 7 files changed, 56 insertions(+)
 create mode 100644 flagscale_validation_logs/batch_results.json

diff --git a/flagscale_validation_logs/batch_results.json b/flagscale_validation_logs/batch_results.json
new file mode 100644
index 0000000000..7f847e8bbf
--- /dev/null
+++ b/flagscale_validation_logs/batch_results.json
@@ -0,0 +1,12 @@
+[
+  {
+    "combo_index": 1,
+    "te_fl_prefer": "vendor",
+    "attention_backend": "flash",
+    "status": "PASSED",
+    "final_loss": "1.202661E+01",
+    "steps_completed": 20,
+    "attempts": 1,
+    "fallback_errors": 0
+  }
+]
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 88f2a554f6..0386a4adb1 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -1088,6 +1088,7 @@ def fused_attn_fwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
         Q: Any,
@@ -1105,6 +1106,7 @@ def fused_attn_fwd(
         rng_gen: Optional[torch.Generator],
         rng_elts_per_thread: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -1129,6 +1131,7 @@ def fused_attn_fwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             cu_seqlens_q,
             cu_seqlens_kv,
             Q,
@@ -1146,6 +1149,7 @@ def fused_attn_fwd(
             rng_gen,
             rng_elts_per_thread,
             return_max_logit,
+            cuda_graph,
         )
 
     def fused_attn_bwd(
@@ -1160,6 +1164,7 @@ def fused_attn_bwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         deterministic: bool,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
@@ -1176,6 +1181,7 @@ def fused_attn_bwd(
         s_quantizer: Any,
         dp_quantizer: Any,
         dqkv_quantizer: Any,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -1200,6 +1206,7 @@ def fused_attn_bwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             deterministic,
             cu_seqlens_q,
             cu_seqlens_kv,
@@ -1216,6 +1223,7 @@ def fused_attn_bwd(
             s_quantizer,
             dp_quantizer,
             dqkv_quantizer,
+            cuda_graph,
         )
 
     def copy_to_kv_cache(
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index 5171cbc0c0..5d9e9779ee 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -942,6 +942,7 @@ def fused_attn_fwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
         Q: Any,
@@ -959,6 +960,7 @@ def fused_attn_fwd(
         rng_gen: Optional[torch.Generator],
         rng_elts_per_thread: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -983,6 +985,7 @@ def fused_attn_fwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             cu_seqlens_q,
             cu_seqlens_kv,
             Q,
@@ -1000,6 +1003,7 @@ def fused_attn_fwd(
             rng_gen,
             rng_elts_per_thread,
             return_max_logit,
+            cuda_graph,
         )
 
     def fused_attn_bwd(
@@ -1014,6 +1018,7 @@ def fused_attn_bwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         deterministic: bool,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
@@ -1030,6 +1035,7 @@ def fused_attn_bwd(
         s_quantizer: Any,
         dp_quantizer: Any,
         dqkv_quantizer: Any,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -1054,6 +1060,7 @@ def fused_attn_bwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             deterministic,
             cu_seqlens_q,
             cu_seqlens_kv,
@@ -1070,6 +1077,7 @@ def fused_attn_bwd(
             s_quantizer,
             dp_quantizer,
             dqkv_quantizer,
+            cuda_graph,
         )
 
     def copy_to_kv_cache(
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
index 4f89cb386e..79b891f5b9 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -966,6 +966,7 @@ def fused_attn_fwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
         Q: Any,
@@ -983,6 +984,7 @@ def fused_attn_fwd(
         rng_gen: Optional[torch.Generator],
         rng_elts_per_thread: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -1007,6 +1009,7 @@ def fused_attn_fwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             cu_seqlens_q,
             cu_seqlens_kv,
             Q,
@@ -1024,6 +1027,7 @@ def fused_attn_fwd(
             rng_gen,
             rng_elts_per_thread,
             return_max_logit,
+            cuda_graph,
         )
 
     def fused_attn_bwd(
@@ -1038,6 +1042,7 @@ def fused_attn_bwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         deterministic: bool,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
@@ -1054,6 +1059,7 @@ def fused_attn_bwd(
         s_quantizer: Any,
         dp_quantizer: Any,
         dqkv_quantizer: Any,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -1078,6 +1084,7 @@ def fused_attn_bwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             deterministic,
             cu_seqlens_q,
             cu_seqlens_kv,
@@ -1094,6 +1101,7 @@ def fused_attn_bwd(
             s_quantizer,
             dp_quantizer,
             dqkv_quantizer,
+            cuda_graph,
         )
 
     def copy_to_kv_cache(
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index d21b7d315e..725899c72b 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -923,6 +923,7 @@ def fused_attn_fwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
         Q: Any,
@@ -940,6 +941,7 @@ def fused_attn_fwd(
         rng_gen: Optional[torch.Generator],
         rng_elts_per_thread: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -964,6 +966,7 @@ def fused_attn_fwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             cu_seqlens_q,
             cu_seqlens_kv,
             Q,
@@ -981,6 +984,7 @@ def fused_attn_fwd(
             rng_gen,
             rng_elts_per_thread,
             return_max_logit,
+            cuda_graph,
         )
 
     def fused_attn_bwd(
@@ -995,6 +999,7 @@ def fused_attn_bwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         deterministic: bool,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
@@ -1011,6 +1016,7 @@ def fused_attn_bwd(
         s_quantizer: Any,
         dp_quantizer: Any,
         dqkv_quantizer: Any,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -1035,6 +1041,7 @@ def fused_attn_bwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             deterministic,
             cu_seqlens_q,
             cu_seqlens_kv,
@@ -1051,6 +1058,7 @@ def fused_attn_bwd(
             s_quantizer,
             dp_quantizer,
             dqkv_quantizer,
+            cuda_graph,
         )
 
     def copy_to_kv_cache(
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/musa.py b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
index c24d50ae76..77a374ad97 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/musa.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
@@ -938,6 +938,7 @@ def fused_attn_fwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
         Q: Any,
@@ -955,6 +956,7 @@ def fused_attn_fwd(
         rng_gen: Optional[torch.Generator],
         rng_elts_per_thread: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -979,6 +981,7 @@ def fused_attn_fwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             cu_seqlens_q,
             cu_seqlens_kv,
             Q,
@@ -996,6 +999,7 @@ def fused_attn_fwd(
             rng_gen,
             rng_elts_per_thread,
             return_max_logit,
+            cuda_graph,
         )
 
     def fused_attn_bwd(
@@ -1010,6 +1014,7 @@ def fused_attn_bwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         deterministic: bool,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
@@ -1026,6 +1031,7 @@ def fused_attn_bwd(
         s_quantizer: Any,
         dp_quantizer: Any,
         dqkv_quantizer: Any,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         tex = self._get_tex()
 
@@ -1050,6 +1056,7 @@ def fused_attn_bwd(
             attn_mask_type,
             softmax_type,
             window_size,
+            bottom_right_diagonal,
             deterministic,
             cu_seqlens_q,
             cu_seqlens_kv,
@@ -1066,6 +1073,7 @@ def fused_attn_bwd(
             s_quantizer,
             dp_quantizer,
             dqkv_quantizer,
+            cuda_graph,
         )
 
     def copy_to_kv_cache(
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index 3767f2ab33..c3a9f8f176 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -1220,6 +1220,7 @@ def fused_attn_fwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
         Q: Any,
@@ -1237,6 +1238,7 @@ def fused_attn_fwd(
         rng_gen: Optional[torch.Generator],
         rng_elts_per_thread: int,
         return_max_logit: bool,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         raise NotImplementedError
 
@@ -1252,6 +1254,7 @@ def fused_attn_bwd(
         attn_mask_type: NVTE_Mask_Type,
         softmax_type: NVTE_Softmax_Type,
         window_size: List[int],
+        bottom_right_diagonal: Optional[bool],
         deterministic: bool,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_kv: torch.Tensor,
@@ -1268,6 +1271,7 @@ def fused_attn_bwd(
         s_quantizer: Any,
         dp_quantizer: Any,
         dqkv_quantizer: Any,
+        cuda_graph: bool = False,
     ) -> List[Any]:
         raise NotImplementedError
 

From 4db46cee744aaf274d4053315cca1d70923f561b Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Tue, 14 Apr 2026 17:42:17 +0800
Subject: [PATCH 49/59] fix(stage9): replace stale CPUOffloadEnabled with
 is_cpu_offload_enabled() in flagos backend

Found during batch validation combo 4/9
(te_fl_prefer=flagos, attention_backend=flash, attempt 1).
Error: Cached implementation 'default.flagos' failed for op 'get_flash_attention_class':
cannot import name 'CPUOffloadEnabled' from 'transformer_engine.pytorch.cpu_offload'
Root cause: upstream removed CPUOffloadEnabled from cpu_offload.py (v2 API),
replacing it with is_cpu_offload_enabled() function.
Fix: updated flagos backend to use the new function.
---
 flagscale_validation_logs/batch_results.json  | 20 +++++++++++++++++++
 .../dot_product_attention/backends.py         |  4 ++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/flagscale_validation_logs/batch_results.json b/flagscale_validation_logs/batch_results.json
index 7f847e8bbf..19321fd655 100644
--- a/flagscale_validation_logs/batch_results.json
+++ b/flagscale_validation_logs/batch_results.json
@@ -8,5 +8,25 @@
     "steps_completed": 20,
     "attempts": 1,
     "fallback_errors": 0
+  },
+  {
+    "combo_index": 2,
+    "te_fl_prefer": "vendor",
+    "attention_backend": "fused",
+    "status": "PASSED",
+    "final_loss": "1.230189E+01",
+    "steps_completed": 20,
+    "attempts": 2,
+    "fallback_errors": 0
+  },
+  {
+    "combo_index": 3,
+    "te_fl_prefer": "vendor",
+    "attention_backend": "unfused",
+    "status": "PASSED",
+    "final_loss": "1.669888E+01",
+    "steps_completed": 20,
+    "attempts": 1,
+    "fallback_errors": 0
   }
 ]
\ No newline at end of file
diff --git a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
index d466ad9ac8..1b0e72b6f7 100644
--- a/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
+++ b/transformer_engine/plugin/core/backends/flagos/attention/dot_product_attention/backends.py
@@ -99,11 +99,11 @@ def forward(
         ctx.nominal_dtype = out_nominal_dtype
 
         from transformer_engine.pytorch.cpu_offload import (
-            CPUOffloadEnabled,
+            is_cpu_offload_enabled,
             mark_activation_offload,
         )
 
-        if CPUOffloadEnabled:
+        if is_cpu_offload_enabled():
             tensor_list = [q, k, v, out]
 
             mark_activation_offload(*tensor_list)

From 8fa8199ad46341ca9b039d67dbad72171085aa59 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Wed, 15 Apr 2026 10:12:04 +0800
Subject: [PATCH 50/59] Final Polish

---
 3rdparty/cutlass                             |  2 +-
 3rdparty/googletest                          |  2 +-
 SECURITY.md                                  |  2 +-
 SYNC_POINT.md                                |  2 +-
 flagscale_validation_logs/batch_results.json | 32 --------------------
 5 files changed, 4 insertions(+), 36 deletions(-)
 delete mode 100644 flagscale_validation_logs/batch_results.json

diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 73c59c055c..57e3cfb47a 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 73c59c055c0fec87792470dbf33325158113db5e
+Subproject commit 57e3cfb47a2d9e0d46eb6335c3dc411498efa198
diff --git a/3rdparty/googletest b/3rdparty/googletest
index a35bc7693c..f8d7d77c06 160000
--- a/3rdparty/googletest
+++ b/3rdparty/googletest
@@ -1 +1 @@
-Subproject commit a35bc7693c117a048152beeb34f6aac354b9423f
+Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571
diff --git a/SECURITY.md b/SECURITY.md
index 7a6de0d126..35edb61b01 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -20,5 +20,5 @@ To report a potential security vulnerability in any NVIDIA product:
 While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information.
 
 ## NVIDIA Product Security
-## test
+
 For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security
diff --git a/SYNC_POINT.md b/SYNC_POINT.md
index c321233330..7e7cfce0d5 100644
--- a/SYNC_POINT.md
+++ b/SYNC_POINT.md
@@ -2,5 +2,5 @@
 - Upstream: Nvidia/TransformerEngine
 - Branch: release_v2.14
 - Commit SHA: f031cf87bd054c7558b887df7bed93975456667f
-- Sync Date: 2025-07-17
+- Sync Date: 2026-04-10
 - Synced By: lixianduo
diff --git a/flagscale_validation_logs/batch_results.json b/flagscale_validation_logs/batch_results.json
deleted file mode 100644
index 19321fd655..0000000000
--- a/flagscale_validation_logs/batch_results.json
+++ /dev/null
@@ -1,32 +0,0 @@
-[
-  {
-    "combo_index": 1,
-    "te_fl_prefer": "vendor",
-    "attention_backend": "flash",
-    "status": "PASSED",
-    "final_loss": "1.202661E+01",
-    "steps_completed": 20,
-    "attempts": 1,
-    "fallback_errors": 0
-  },
-  {
-    "combo_index": 2,
-    "te_fl_prefer": "vendor",
-    "attention_backend": "fused",
-    "status": "PASSED",
-    "final_loss": "1.230189E+01",
-    "steps_completed": 20,
-    "attempts": 2,
-    "fallback_errors": 0
-  },
-  {
-    "combo_index": 3,
-    "te_fl_prefer": "vendor",
-    "attention_backend": "unfused",
-    "status": "PASSED",
-    "final_loss": "1.669888E+01",
-    "steps_completed": 20,
-    "attempts": 1,
-    "fallback_errors": 0
-  }
-]
\ No newline at end of file

From d7e9e7ba67c3deaef286bb645d7e0316beeea573 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Fri, 24 Apr 2026 18:04:58 +0800
Subject: [PATCH 51/59] [CICD] Refactor workflows, Add integration_tests,
 Switch to FlagCICD metax runner (#60)

## Description

Refactors CI/CD workflows to support both CUDA (NVIDIA A100) and Metax
(C500) platforms, removes obsolete workflows, and fixes several
platform-specific test failures. Add functional testing, and log
reporting, with significant workflow simplification, and Metax platform
use BAAI runner configs.

---

## Type of change

- [x] New feature (non-breaking change which adds functionality)
- [x] Infra/Build change (changes to CI/CD workflows or build scripts)
- [x] Code refactoring
- [x] Bug fix
- [ ] Documentation change
- [ ] Breaking change

---

### Changes

- **Workflow cleanup**: Removed 7 obsolete workflows; extracted lint
into a standalone reusable `lint_common.yml` (runs in parallel); add
`integration_tests_common.yml`
- **Platform refactoring**: Added per-platform setup scripts
(`setup_cuda.sh` / `setup_metax.sh`); switched Metax config to BAAI
online environment; removed unsupported test types (JAX distributed)
from Metax matrix
- **Bug fixes**:
- Metax: skip incompatible distributed test files (`test_numerics`,
`test_torch_fsdp2`, etc.) to prevent `torchrun` SIGSEGV
- Metax: replace `nvidia-smi`-only FP8 detection with platform-aware
check
- CUDA: fix `libcudart` load failure when runtime is pip-installed (add
proper fallback chain in `_load_cudart()` and `try_load_lib`)

---

## Checklist

- [x] I have read and followed the contributing guidelines
- [x] The functionality is complete
- [x] I have commented my code, particularly in CI workflow setup steps
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added/updated tests that prove my feature works on CUDA and
Metax platform
- [x] New and existing unit tests pass locally on CUDA and Metax
platform

---------

Co-authored-by: qqjxzxq <1376782660@qq.com>
Co-authored-by: HermiaHuan <3081497279@qq.com>
---
 .github/configs/cuda.yml                      |  25 +-
 .github/configs/metax.yml                     |  47 ++--
 .github/scripts/setup_cuda.sh                 |  25 ++
 .github/scripts/setup_metax.sh                |  50 ++++
 .github/workflows/all_tests_common.yml        | 123 ++++++----
 .github/workflows/all_tests_cuda.yml          |   2 +
 .github/workflows/all_tests_metax.yml         |   9 +-
 .github/workflows/build.yml                   |  47 +++-
 .github/workflows/functional_tests_common.yml | 190 ---------------
 .../workflows/integration_tests_common.yml    | 134 +++++++++++
 .../qa-l0-te-cpp-unittest-pytorch-lint.yml    |  18 +-
 .../workflows/qa-l1-te-cpp-pytorch-tests.yml  |  51 ++--
 .../qa-l3-te-pytorch-fa-versions-test.yml     |  13 +-
 .github/workflows/te-plugin-tests.yml         |   4 +-
 .github/workflows/unit_tests_common.yml       | 220 +++---------------
 3rdparty/cudnn-frontend                       |   2 +-
 3rdparty/googletest                           |   2 +-
 qa/L0_pytorch_debug_unittest/README.rst       |  26 +++
 qa/L0_pytorch_debug_unittest/test.sh          |  38 +--
 qa/L0_pytorch_unittest/test.sh                |   2 -
 qa/L1_pytorch_distributed_unittest/test.sh    | 125 +++++++++-
 qa/L1_pytorch_mcore_integration/test.sh       | 150 +++++++++---
 qa/L1_pytorch_mcore_integration/test_bak.sh   |  79 +++++++
 .../plugin/core/backends/vendor/cuda/cuda.py  |  39 +++-
 24 files changed, 838 insertions(+), 583 deletions(-)
 create mode 100755 .github/scripts/setup_cuda.sh
 create mode 100755 .github/scripts/setup_metax.sh
 delete mode 100644 .github/workflows/functional_tests_common.yml
 create mode 100644 .github/workflows/integration_tests_common.yml
 create mode 100644 qa/L0_pytorch_debug_unittest/README.rst
 create mode 100644 qa/L1_pytorch_mcore_integration/test_bak.sh

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index 6975fab589..1c77fe6c25 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -1,26 +1,28 @@
 # CUDA Hardware Configuration for TransformerEngine-FL
-# Refactored for BAAI DGX A100 Nodes
+# Refactored for A100 Nodes
 # This file defines environment variables, volumes, and test filters for TE tests.
 
 hardware_name: cuda
 display_name: 'NVIDIA CUDA (A100)'
 
+# CI image for online env
 ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
 
 # Runner labels for self-hosted A100 node
 # runner_labels:
-# - self-hosted
-# - Linux
-# - X64
-# - nvidia
-# - gpu-8
+#   - self-hosted
+#   - Linux
+#   - X64
+#   - nvidia
+#   - gpu-8
+
+# Runner labels for online env
 runner_labels:
   - nv-8g-cicd-te
 
 # Container volumes
 container_volumes:
   - /home/flagscale_cicd/flask/static:/workspace/report
-  # - /home/flagscale_cicd/data:/opt/data
 
 # Container options
 container_options: >-
@@ -32,9 +34,8 @@ container_options: >-
   --ulimit stack=67108864 
   --user root
 
-# Device types
-device_types:
-  - a100
+# Platform-specific environment setup script
+setup_script: .github/scripts/setup_cuda.sh
 
 # Build environment variables (platform-specific)
 build_env:
@@ -47,6 +48,10 @@ build_env:
   CUDA_HOME: /usr/local/cuda-12.8
   NVCC: /usr/local/cuda-12.8/bin/nvcc
 
+# Device types to run tests on
+device_types:
+  - a100
+
 # Test matrix configuration
 test_matrix:
   l0_pytorch:
diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
index e3b10c892d..00b4e1df34 100644
--- a/.github/configs/metax.yml
+++ b/.github/configs/metax.yml
@@ -1,28 +1,33 @@
 # Metax Hardware Configuration for TE-FL
 # This file defines CI/CD settings for Metax-based testing
-# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
+# This file defines environment variables, volumes, and test filters for TE tests.
 
 hardware_name: metax
 display_name: 'Metax Tests'
 
-ci_image: localhost:5000/megatron-lm-with-te:v1
-# ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
+# CI image for Metax dev env
+# ci_image: localhost:5000/megatron-lm-with-te:v1
 
-runner_labels:
-  - self-hosted
-  - Linux
-  - X64
-  - metax
-  - dev
+# CI image for online env
+ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
+
+# Runner labels for self-hosted Metax node
 # runner_labels:
-#   - mx-4g-cicd-te
+#   - self-hosted
+#   - Linux
+#   - X64
+#   - metax
+#   - dev
+
+# Runner labels for online env
+runner_labels:
+  - mx-4g-cicd-te
 
+# Container volumes
 container_volumes:
   - /nfs/metax_fs:/nfs/metax_fs
-  - /dev/dri:/dev/dri
-  - /dev/mxcd:/dev/mxcd
-  - /dev/infiniband:/dev/infiniband
 
+# Container options
 container_options: >-
   --uts=host
   --ipc=host
@@ -30,17 +35,16 @@ container_options: >-
   --group-add video
   --shm-size=100gb
   --ulimit memlock=-1
-  --security-opt seccomp=unconfined
-  --security-opt apparmor=unconfined
-  --device=/dev/dri
-  --device=/dev/mxcd
-  --device=/dev/infiniband
   --user root
   --ulimit nofile=65535:65535
   -e PLATFORM=metax
   -e TORCH_DISTRIBUTED_BACKEND=mccl
   -e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH
 
+# Platform-specific environment setup script
+setup_script: .github/scripts/setup_metax.sh
+
+# Build environment variables (platform-specific)
 build_env:
   TE_FL_SKIP_CUDA: '1'
   NVTE_WITH_MACA: '1'
@@ -62,10 +66,3 @@ test_matrix:
       # example: tests/unit_tests/test_example.py
       # - tests/unit_tests/test_inference.py
       # - tests/unit_tests/test_rl_utils.py
-
-  # functional:
-  #   train:
-  #     - device: c500
-  #       task: train
-  #       model: deepseek
-  #       case: tp2_pp2_ep2
diff --git a/.github/scripts/setup_cuda.sh b/.github/scripts/setup_cuda.sh
new file mode 100755
index 0000000000..f9e289c6d0
--- /dev/null
+++ b/.github/scripts/setup_cuda.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# CUDA Platform Environment Setup Script
+# Called by unit_tests_common.yml for CUDA platforms (A100, H100, etc.)
+set -euo pipefail
+
+echo "===== Step 0: Activate Python environment ====="
+source /opt/miniconda3/etc/profile.d/conda.sh
+conda activate flagscale-train
+echo "PATH=$PATH" >> $GITHUB_ENV
+echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+echo "===== Step 1: Remove Existing TransformerEngine ====="
+pip uninstall transformer_engine transformer_engine_torch -y || true
+
+echo "===== Step 2: Build & Install TransformerEngine ====="
+cd $GITHUB_WORKSPACE
+
+pip install nvdlfw-inspect --quiet
+pip install expecttest --quiet
+pip install . -v --no-deps --no-build-isolation
+
+echo "===== Step 3: Verify Installation ====="
+python3 tests/pytorch/test_sanity_import.py
+
+echo "===== Environment Setup Complete ====="
diff --git a/.github/scripts/setup_metax.sh b/.github/scripts/setup_metax.sh
new file mode 100755
index 0000000000..a2d0b0a4cf
--- /dev/null
+++ b/.github/scripts/setup_metax.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Metax Platform Environment Setup Script
+# Called by unit_tests_common.yml for Metax platforms (C500, etc.)
+set -euo pipefail
+
+echo "===== Step 0: Activate Python environment ====="
+source /opt/conda/etc/profile.d/conda.sh
+conda activate base
+echo "PATH=$PATH" >> $GITHUB_ENV
+echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+echo "===== Step 1: Base Environment Setup ====="
+# Configure MACA toolchain paths
+export PATH=/opt/maca/bin:$PATH
+export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
+service ssh restart
+
+echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
+# TransformerEngine expects nvcc, but MACA provides cucc
+ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
+which nvcc || true
+
+echo "===== Step 3: Install Required System Tools ====="
+# Use apt to install git, curl
+sed -i 's|http://mirrors.aliyun.com/ubuntu|http://archive.ubuntu.com/ubuntu|g' /etc/apt/sources.list
+apt-get update -qq || true
+apt-get install -y -qq git curl
+# Install cmake and ninja via pip (more reliable than apt in this env)
+python3 -m pip install cmake ninja torch --no-cache-dir
+
+echo "===== Step 4: Remove Existing TransformerEngine ====="
+# Prevent conflicts with preinstalled or incompatible versions
+python3 -m pip uninstall transformer_engine -y || true
+python3 -m pip install nvdlfw-inspect --no-deps || true
+
+echo "===== Step 5: Install TE-FL Plugin Layer ====="
+# Install TransformerEngine-FL Python layer (plugin logic)
+cd $GITHUB_WORKSPACE
+TE_FL_SKIP_CUDA=1 python3 setup.py install
+
+echo "===== Step 6: Final Verification ====="
+# Verify both TE Python API and backend are functional
+python3 - <<'EOF'
+import transformer_engine
+import transformer_engine_torch as te
+print("transformer_engine:", transformer_engine)
+print("transformer_engine_torch:", te)
+EOF
+
+echo "===== Environment Setup Complete ====="
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index 2165de9b49..606a0d3e86 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -7,13 +7,20 @@ on:
         required: true
         type: string
         description: Platform name (e.g., cuda, default)
-      setup_commands:
+      run_unit_tests:
         required: false
-        type: string
-        default: ''
+        type: boolean
+        default: true
+        description: Whether to run unit tests in this workflow
+      run_integration_tests:
+        required: false
+        type: boolean
+        default: true
+        description: Whether to run integration tests in this workflow
 
 jobs:
   checkout_and_config:
+    name: checkout_and_config
     defaults:
       run:
         shell: bash
@@ -24,19 +31,12 @@ jobs:
       container_volumes: ${{ steps.config.outputs.container_volumes }}
       container_options: ${{ steps.config.outputs.container_options }}
       device_types: ${{ steps.config.outputs.device_types }}
-      train_test_matrix: ${{ steps.config.outputs.train_test_matrix }}
-      ignored_tests: ${{ steps.config.outputs.ignored_tests }}
+      setup_script: ${{ steps.config.outputs.setup_script }}
       build_env: ${{ steps.config.outputs.build_env }}
     steps:
       - name: Checkout source code
         uses: actions/checkout@v4
 
-      - name: Check if tests should run
-        id: should_run
-        run: |
-         
-          echo "should_run=true" >> $GITHUB_OUTPUT
-
       - name: Load platform configuration
         id: config
         run: |
@@ -71,26 +71,24 @@ jobs:
           DEVICE_TYPES=$(yq '.device_types | tojson(0)' "$CONFIG_FILE")
           echo "device_types=$DEVICE_TYPES" >> $GITHUB_OUTPUT
 
-          # Read test matrix for training
-          TRAIN_MATRIX=$(yq '.test_matrix.functional.train | tojson(0)' "$CONFIG_FILE")
-          echo "train_test_matrix=$TRAIN_MATRIX" >> $GITHUB_OUTPUT
-
-          # Read ignored tests list from test_matrix.unit (default to empty array if not defined)
-          IGNORED_TESTS=$(yq '.test_matrix.unit.ignored_tests // [] | tojson(0)' "$CONFIG_FILE")
-          echo "ignored_tests=$IGNORED_TESTS" >> $GITHUB_OUTPUT
+          # Read setup script path
+          SETUP_SCRIPT=$(yq '.setup_script // ""' "$CONFIG_FILE")
+          echo "setup_script=$SETUP_SCRIPT" >> $GITHUB_OUTPUT
 
           # Read build environment variables (default to empty object if not defined)
           BUILD_ENV=$(yq '.build_env // {} | tojson(0)' "$CONFIG_FILE")
           echo "build_env=$BUILD_ENV" >> $GITHUB_OUTPUT
 
   unit_tests:
-    needs: checkout_and_config
+    name: unit_tests
+    if: inputs.run_unit_tests
+    needs: 
+      - checkout_and_config
     strategy:
       fail-fast: false
       matrix:
         device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
     uses: ./.github/workflows/unit_tests_common.yml
-    name: unit_tests
     with:
       platform: ${{ inputs.platform }}
       device: ${{ matrix.device }}
@@ -98,24 +96,61 @@ jobs:
       runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
       container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
-      setup_commands: ${{ inputs.setup_commands }}
-      ignored_tests: ${{ needs.checkout_and_config.outputs.ignored_tests }}
+      setup_script: ${{ needs.checkout_and_config.outputs.setup_script }}
       build_env: ${{ needs.checkout_and_config.outputs.build_env }}
 
-  # arguments.py not compatible with megatron-core-fl
-  # functional_tests:
-  #   needs:
-  #     - checkout_and_config
-  #   if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null
-  #   uses: ./.github/workflows/functional_tests_common.yml
-  #   with:
-  #     platform: ${{ inputs.platform }}
-  #     test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }}
-  #     image: ${{ needs.checkout_and_config.outputs.ci_image }}
-  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
-  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
-  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+  unit_tests_complete:
+    name: unit_tests_complete
+    needs: 
+      - unit_tests
+    runs-on: ubuntu-latest
+    if: always() && inputs.run_unit_tests
+    steps:
+      - name: Check unit tests result
+        run: |
+          if [ "${{ needs.unit_tests.result }}" != "success" ] && \
+             [ "${{ needs.unit_tests.result }}" != "skipped" ]; then
+            echo "❌ Unit tests failed: ${{ needs.unit_tests.result }}"
+            exit 1
+          fi
+          echo "✅ Unit tests passed"
 
+  integration_tests:
+    name: integration_tests
+    if: inputs.run_integration_tests
+    needs:
+      - checkout_and_config
+      - unit_tests_complete
+    strategy:
+      fail-fast: false
+      matrix:
+        device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
+    uses: ./.github/workflows/integration_tests_common.yml
+    with:
+      platform: ${{ inputs.platform }}
+      device: ${{ matrix.device }}
+      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      setup_script: ${{ needs.checkout_and_config.outputs.setup_script }}
+      build_env: ${{ needs.checkout_and_config.outputs.build_env }}
+
+  integration_tests_complete:
+    name: integration_tests_complete
+    if: always() && inputs.run_integration_tests
+    needs: 
+      - integration_tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check integration tests result
+        run: |
+          if [ "${{ needs.integration_tests.result }}" != "success" ] && \
+             [ "${{ needs.integration_tests.result }}" != "skipped" ]; then
+            echo "❌ Integration tests failed: ${{ needs.integration_tests.result }}"
+            exit 1
+          fi
+          echo "✅ Integration tests passed"
 
   all_tests_complete:
     defaults:
@@ -123,8 +158,8 @@ jobs:
         shell: bash
     needs:
       - checkout_and_config
-      - unit_tests
-      # - functional_tests
+      - unit_tests_complete
+      - integration_tests_complete
     runs-on: ubuntu-latest
     if: always()
     steps:
@@ -133,15 +168,17 @@ jobs:
           # Check all test jobs (skip if not run)
           failed=false
 
-          if [ "${{ needs.unit_tests.result }}" != "success" ]; then
-            echo "❌ Unit tests failed"
+          if [ "${{ needs.unit_tests_complete.result }}" != "success" ] && \
+            [ "${{ needs.unit_tests_complete.result }}" != "skipped" ]; then
+            echo "❌ Unit tests failed or cancelled: ${{ needs.unit_tests_complete.result }}"
             failed=true
           fi
 
-          # if [ "${{ needs.functional_tests.result }}" != "success" ]; then
-          #   echo "❌ Training functional tests failed"
-          #   failed=true
-          # fi
+          if [ "${{ needs.integration_tests_complete.result }}" != "success" ] && \
+            [ "${{ needs.integration_tests_complete.result }}" != "skipped" ]; then
+            echo "❌ Integration tests failed or cancelled: ${{ needs.integration_tests_complete.result }}"
+            failed=true
+          fi
 
           if [ "$failed" = "true" ]; then
             exit 1
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index 0aa652f64b..cc7ade9f50 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -17,6 +17,8 @@ jobs:
     uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: cuda
+      run_unit_tests: true
+      run_integration_tests: true
 
   all_tests:
     needs: run_tests
diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml
index d3e496c4b2..0af545e291 100644
--- a/.github/workflows/all_tests_metax.yml
+++ b/.github/workflows/all_tests_metax.yml
@@ -13,15 +13,12 @@ concurrency:
 
 jobs:
   run_tests:
+    # Package manager and environment settings are read from .github/configs/metax.yml
     uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: metax
-      # Metax Environment Setup
-      setup_commands: |
-        export PATH=/opt/conda/bin:$PATH        
-        export LD_LIBRARY_PATH=/usr/local/maca/lib:/opt/maca/lib:$LD_LIBRARY_PATH        
-        which python3
-        python3 -m pip --version
+      run_unit_tests: true
+      run_integration_tests: true
 
   all_tests:
     needs: run_tests
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6c9c967950..2ef6d1893d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -3,6 +3,7 @@
 # See LICENSE for license information.
 
 # A workflow to trigger TE build on GitHub
+
 name: 'Build'
 on:
   pull_request:
@@ -10,28 +11,56 @@ on:
 jobs:
   pytorch:
     name: 'PyTorch'
-    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    runs-on: [ nv-8g-cicd-te ]
     defaults:
       run:
         shell: bash
     container:
       image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
-      options: --user root
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host 
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull never
     steps:
+      - name: Configure Git Safe Directory on Cuda
+        run: /usr/bin/git config --global safe.directory '*'
+
       - name: 'Checkout'
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
+          fetch-depth: 0
           submodules: recursive
-      - name: 'Build'
-        run:
+          set-safe-directory: true
+
+      - name: 'Setup Environment'
+        run: |
           source /opt/miniconda3/etc/profile.d/conda.sh
           conda activate flagscale-train
-          pip install --no-build-isolation . -v --no-deps
+          echo "PATH=$PATH" >> $GITHUB_ENV
+
+      - name: 'Build'
+        run: |
+          pip uninstall transformer_engine transformer_engine_torch -y || true
+          echo "GITHUB_WORKSPACE=$GITHUB_WORKSPACE"
+          cd $GITHUB_WORKSPACE
+          pip install nvdlfw-inspect
+          pip install expecttest
+          pip install . -v --no-deps --no-build-isolation
         env:
           NVTE_FRAMEWORK: pytorch
-          TE_WITH_NCCL: 1
+          TE_WITH_NCCL: '1'
+          NVTE_WITH_CUDA: '1'
+          CUDA_HOME: /usr/local/cuda-12.8
+          NVCC: /usr/local/cuda-12.8/bin/nvcc
+
       - name: 'Sanity check'
         run: 
-          source /opt/miniconda3/etc/profile.d/conda.sh
-          conda activate flagscale-train
           python3 tests/pytorch/test_sanity_import.py
diff --git a/.github/workflows/functional_tests_common.yml b/.github/workflows/functional_tests_common.yml
deleted file mode 100644
index aa6b734778..0000000000
--- a/.github/workflows/functional_tests_common.yml
+++ /dev/null
@@ -1,190 +0,0 @@
-# Disabled for compatibility issues
-name: Common Functional Tests - Training
-
-on:
-  workflow_call:
-    inputs:
-      platform:
-        required: true
-        type: string
-        description: Platform name (e.g., cuda, default)
-      test_matrix:
-        required: true
-        type: string
-        description: JSON array of test configurations
-      image:
-        required: true
-        type: string
-      runs_on:
-        required: true
-        type: string
-      container_volumes:
-        required: true
-        type: string
-      container_options:
-        required: true
-        type: string
-
-jobs:
-  functional_test_train:
-    defaults:
-      run:
-        shell: bash
-    env:
-      PROJECT_ROOT: ${{ github.workspace }}
-    runs-on: ${{ fromJson(inputs.runs_on) }}
-    strategy:
-      fail-fast: false
-      matrix:
-        test_config: ${{ fromJson(inputs.test_matrix) }}
-    container:
-      image: ${{ inputs.image }}
-      ports:
-        - 80
-      volumes: ${{ fromJson(inputs.container_volumes) }}
-      options: ${{ inputs.container_options }}
-
-    steps:
-      - name: Checkout source code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-      
-      # - name: Set safe directory
-      #   run: |
-      #     git config --global --add safe.directory $PROJECT_ROOT
-      ## The above step is commented out because there is no git cli in the container, and it causes the step to fail. The safe directory is set in the next step with a conditional check.
-      - name: Set safe directory
-        run: |
-          command -v git && git config --global --add safe.directory $PROJECT_ROOT || true
-
-      - name: Activate Python environment
-        run: |
-          source /opt/conda/etc/profile.d/conda.sh
-          conda activate base   
-          echo "PATH=$PATH" >> $GITHUB_ENV
-    
-      - name: Setup Python environment
-        env:
-          NVTE_WITH_MACA: '1'
-          NVTE_WITH_CUDA: '0'
-          NVCC: /opt/maca/bin/mcc
-          CUDA_HOME: /opt/maca
-          
-          PATH: /opt/maca/bin:${{ env.PATH }}
-          LD_LIBRARY_PATH: /opt/maca/lib:${{ env.LD_LIBRARY_PATH }}
-        run: |
-          set -euo pipefail
-          cd $PROJECT_ROOT
-          pip install -e . --no-deps --no-build-isolation
-        timeout-minutes: 60
-
-      - name: L0 Pytorch Wheel
-        id: L0_pytoech_wheel
-        # timeout-minutes: 50
-        env:
-          TE_PATH: .
-          RUN_LOG: /logs/pytorch/wheel
-        run: |
-          echo "TE_PATH: ${TE_PATH}"
-          sed -i "s/^cd transformer_engine\/pytorch\s*$/pushd transformer_engine\/pytorch/" qa/L0_pytorch_wheel/test.sh
-          sed -i '44 s/^cd \s*\$TE_PATH\s*$/popd/' qa/L0_pytorch_wheel/test.sh
-
-          cat qa/L0_pytorch_wheel/test.sh
-          # source /opt/miniconda3/etc/profile.d/conda.sh
-          # conda activate flagscale-train
-          pip uninstall -y transformer_engine
-
-          set -euo pipefail
-          cd $PROJECT_ROOT
-
-          PLATFORM='${{ inputs.platform }}'
-          DEVICE='${{ matrix.test_config.device }}'
-          TASK='${{ matrix.test_config.task }}'
-          MODEL='${{ matrix.test_config.model }}'
-          CASE='${{ matrix.test_config.case }}'
-
-          echo "Running functional tests for training"
-          echo "Platform: $PLATFORM"
-          echo "Device: $DEVICE"
-          echo "Task: $TASK"
-          echo "Model: $MODEL"
-          echo "Case: ${CASE:-all}"
-
-          # Set environment variables
-          export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
-          
-          set +e
-          bash qa/L0_pytorch_wheel/test.sh | tee ${RUN_LOG}/pytorch_wheel-${{ github.run_id }}.log
-          exit_code=$?
-          set -e
-
-          if [ $exit_code -eq 0 ]; then
-            echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
-          else
-            echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
-          fi
-
-          echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-          exit $exit_code
-
-      - name: Upload Installation Logs
-        if: always() && steps.L0_pytoech_wheel.outcome == 'failure'
-        uses: actions/upload-artifact@v4
-        with:
-          name: L0-pytorch-logs-${{ github.run_id }}
-          path: /logs/pytorch/wheel
-          retention-days: 7
-          if-no-files-found: warn
-
-      # - name: Run functional tests
-      #   id: functional_test
-      #   run: |
-      #     set -euo pipefail
-      #     cd $PROJECT_ROOT
-
-      #     PLATFORM='${{ inputs.platform }}'
-      #     DEVICE='${{ matrix.test_config.device }}'
-      #     TASK='${{ matrix.test_config.task }}'
-      #     MODEL='${{ matrix.test_config.model }}'
-      #     CASE='${{ matrix.test_config.case }}'
-
-      #     echo "Running functional tests for training"
-      #     echo "Platform: $PLATFORM"
-      #     echo "Device: $DEVICE"
-      #     echo "Task: $TASK"
-      #     echo "Model: $MODEL"
-      #     echo "Case: ${CASE:-all}"
-
-      #     # Set environment variables
-      #     export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
-
-      #     # Run functional tests via run_tests.sh with explicit platform/device/task/model/case
-      #     set +e
-      #     bash "$PROJECT_ROOT/tests/test_utils/runners/run_tests.sh" \
-      #       --platform "$PLATFORM" \
-      #       --device "$DEVICE" \
-      #       --type functional \
-      #       --task "$TASK" \
-      #       --model "$MODEL" \
-      #       --list "$CASE"
-      #     exit_code=$?
-      #     set -e
-
-      #     if [ $exit_code -eq 0 ]; then
-      #       echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
-      #     else
-      #       echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
-      #     fi
-
-      #     echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-      #     exit $exit_code
-      #   timeout-minutes: 60
-
-      # - name: Debug - keep container alive on failure
-      #   if: failure()
-      #   run: |
-      #     echo "Container sleeping for 60 minutes for debugging..."
-      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
-      #     sleep 3600
-      #   timeout-minutes: 60
\ No newline at end of file
diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
new file mode 100644
index 0000000000..25f18c866d
--- /dev/null
+++ b/.github/workflows/integration_tests_common.yml
@@ -0,0 +1,134 @@
+name: Common Integration Tests
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+      device:
+        required: true
+        type: string
+      image:
+        required: true
+        type: string
+      runs_on:
+        required: true
+        type: string
+      container_volumes:
+        required: true
+        type: string
+      container_options:
+        required: true
+        type: string
+      # Platform-specific environment setup script path (from platform config)
+      setup_script:
+        required: false
+        type: string
+        default: ''
+      # Platform-specific build environment variables (JSON object from config)
+      build_env:
+        required: false
+        type: string
+        default: '{}'
+
+jobs:
+  integration_test:
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ fromJson(inputs.runs_on) }}
+    strategy:
+      fail-fast: false
+      matrix:
+        test_group:
+          - name: pytorch_mcore_integration
+            path: "qa/L1_pytorch_mcore_integration/test.sh"
+            test_type: "integration"
+    name: integration-${{ inputs.device }}-${{ matrix.test_group.name }}
+    container:
+      image: ${{ inputs.image }}
+      volumes: ${{ fromJson(inputs.container_volumes) }}
+      options: --pull never ${{ inputs.container_options }}
+
+    steps:
+      # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
+      - name: Configure Git Safe Directory on Cuda
+        if: inputs.platform == 'cuda'
+        run: /usr/bin/git config --global safe.directory '*'
+
+      - name: Checkout Source Code on Cuda (attempt 1)
+        id: checkout1
+        if: inputs.platform == 'cuda'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 2)
+        id: checkout2
+        if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 3)
+        id: checkout3
+        if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      # Metax requires to clean vscode-remote-container
+      - name: Configure Clean Git Env on Metax
+        if: inputs.platform == 'metax'
+        run: |
+          git config --global --unset-all credential.helper 2>/dev/null || true
+          git config --system --unset-all credential.helper 2>/dev/null || true
+
+      # Metax no need submodules
+      - name: Checkout Source Code on Metax
+        if: inputs.platform == 'metax'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      
+      - name: Environment Setup
+        if: inputs.setup_script != ''
+        run: |
+          bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
+
+      - name: Execute Tests
+        env:
+          TE_PATH: ${{ github.workspace }}
+          TE_FL_PREFER: vendor
+          MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git
+          MCORE_REF: main
+        run: |
+          set -euo pipefail
+
+          # Activate conda environment
+          if ${{inputs.platform == 'metax'}}; then
+            source /opt/conda/etc/profile.d/conda.sh
+            conda activate base
+          else
+            source /opt/miniconda3/etc/profile.d/conda.sh
+            conda activate flagscale-train
+          fi
+          echo "PATH=$PATH" >> $GITHUB_ENV
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
+          # python3 --version
+          # pip list | grep -E "regex|six|torch" || true
+
+          bash ${{ matrix.test_group.path }}
+        timeout-minutes: 30
+        
\ No newline at end of file
diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
index b026f9aa10..f214990581 100644
--- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
+++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
@@ -2,21 +2,11 @@ name: QA L0 - Core Unit & Lint Tests
 
 on:
   push:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
-      - 'qa/L0_pytorch_lint/**'
-      - 'transformer_engine/**'
-      - 'tests/pytorch/**'
+    branches:
+      - __disabled_do_not_remove__
   pull_request:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
-      - 'qa/L0_pytorch_lint/**'
-      - 'transformer_engine/**'
-      - 'tests/pytorch/**'
-
-  workflow_dispatch:
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
index 51f071aa3b..32a13813ff 100644
--- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
+++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
@@ -2,32 +2,11 @@ name: QA L1 - Comprehensive Integration Tests
 
 on:
   push:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
-      - 'qa/L1_cpp_distributed/**'
-      - 'tests/cpp_distributed/**'
-      - 'qa/L1_pytorch_thunder_integration/**'
-      - 'qa/L1_pytorch_distributed_unittest/**'
-      - 'tests/pytorch/distributed/**'
-      - 'tests/pytorch/attention/**'
-      - 'qa/L1_pytorch_onnx_unittest/**'
-      - 'tests/pytorch/test_onnx_export.py'
-
+    branches:
+      - __disabled_do_not_remove__
   pull_request:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
-      - 'qa/L1_cpp_distributed/**'
-      - 'tests/cpp_distributed/**'
-      - 'qa/L1_pytorch_thunder_integration/**'
-      - 'qa/L1_pytorch_distributed_unittest/**'
-      - 'tests/pytorch/distributed/**'
-      - 'tests/pytorch/attention/**'
-      - 'qa/L1_pytorch_onnx_unittest/**'
-      - 'tests/pytorch/test_onnx_export.py'
-      
-  workflow_dispatch:
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
@@ -57,8 +36,8 @@ jobs:
       - name: Checkout Code
         uses: actions/checkout@v6.0.1
         with:
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
-          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name || github.repository }}
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || github.ref_name }}
           ssh-strict: true
           ssh-user: git
           persist-credentials: true
@@ -166,3 +145,21 @@ jobs:
           echo "=== Running L1 PyTorch ONNX Unit Tests ==="
           bash ./qa/L1_pytorch_onnx_unittest/test.sh
         # timeout-minutes: 30
+
+            
+      - name: Run L1 PyTorch Megatron-FL MCore Integration Test
+        env:
+          TE_PATH: .
+          TE_FL_PREFER: vendor
+          MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git
+          MCORE_REF: main
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
+          bash ./qa/L1_pytorch_mcore_integration/test.sh
+        timeout-minutes: 30
diff --git a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
index 9a881dd2d9..bb3e0a73fe 100644
--- a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
+++ b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
@@ -3,16 +3,11 @@ name: QA L3 - Attention Tests
 
 on:
   push:
-    branches: __disable__
-    paths:
-      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
-      - 'tests/pytorch/attention/test_attention.py'
-
+    branches: 
+      - __disabled_do_not_remove__
   pull_request:
-    branches: __disable__
-    paths:
-      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
-      - 'tests/pytorch/attention/test_attention.py'
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/te-plugin-tests.yml b/.github/workflows/te-plugin-tests.yml
index f487673444..9b640fcce8 100644
--- a/.github/workflows/te-plugin-tests.yml
+++ b/.github/workflows/te-plugin-tests.yml
@@ -18,7 +18,7 @@ concurrency:
 
 jobs:
   run-plugin-tests:
-    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    runs-on: [ nv-8g-cicd-te ]
     defaults:
       run:
         shell: bash
@@ -35,7 +35,7 @@ jobs:
         --ulimit stack=67108864 
         --ulimit nofile=65535:65535 
         --user root
-        --pull always
+        --pull never
     steps:
       - name: Checkout Code
         uses: actions/checkout@v6.0.1
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
index 615f7c9001..10a070d9df 100644
--- a/.github/workflows/unit_tests_common.yml
+++ b/.github/workflows/unit_tests_common.yml
@@ -1,6 +1,5 @@
 name: Common Unit Tests
 
-
 on:
   workflow_call:
     inputs:
@@ -22,12 +21,8 @@ on:
       container_options:
         required: true
         type: string
-      ignored_tests:
-        required: false
-        type: string
-        default: ''
-      # New input for hardware-specific initialization (e.g., conda activate)
-      setup_commands:
+      # Platform-specific environment setup script path (from platform config)
+      setup_script:
         required: false
         type: string
         default: ''
@@ -36,41 +31,9 @@ on:
         required: false
         type: string
         default: '{}'
-      # Whether to upload coverage report
-      upload_coverage:
-        description: "Whether to upload coverage report"
-        required: false
-        type: boolean
-        default: true
 
 jobs:
-  # 1. Change Detection
-  detect_changes:
-    runs-on: ubuntu-latest
-    outputs:
-      core: ${{ steps.filter.outputs.core }}
-      qa_l0: ${{ steps.filter.outputs.qa_l0 }}
-    steps:
-      - name: Checkout source code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Detect changed paths
-        id: filter
-        run: |
-          set -euo pipefail
-          BASE_REF="${{ github.event_name == 'pull_request' && format('origin/{0}', github.base_ref) || 'HEAD~1' }}"
-          [ "${{ github.event_name }}" == "pull_request" ] && git fetch origin ${{ github.base_ref }} --depth=1
-          
-          CHANGED_FILES=$(git diff --name-only $BASE_REF...HEAD 2>/dev/null || git diff --name-only $BASE_REF HEAD)
-
-          echo "core=$(echo "$CHANGED_FILES" | grep -qE "^tests/unit_tests/|^megatron/core/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
-          echo "qa_l0=$(echo "$CHANGED_FILES" | grep -qE "^qa/L0_|^transformer_engine/|^tests/pytorch/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
-  
-  # 2. Unified Test Execution
   unit_test:
-    needs: detect_changes
     defaults:
       run:
         shell: bash
@@ -79,16 +42,15 @@ jobs:
       fail-fast: false
       matrix:
         test_group:
-          - name: pytorch_lint
-            path: "qa/L0_pytorch_lint/test.sh"
-            test_type: "lint"
           - name: pytorch_debug
             path: "qa/L0_pytorch_debug_unittest/test.sh"
             test_type: "debug"
           - name: pytorch_unittest
             path: "qa/L0_pytorch_unittest/test.sh"
             test_type: "unittest"
-
+          - name: pytorch_distributed_unittest
+            path: "qa/L1_pytorch_distributed_unittest/test.sh"
+            test_type: "unittest"
     name: unit-${{ inputs.device }}-${{ matrix.test_group.name }}
     container:
       image: ${{ inputs.image }}
@@ -96,33 +58,14 @@ jobs:
       options: --pull never ${{ inputs.container_options }}
 
     steps:
-      - name: Check if tests should run
-        id: should_run
-        run: |
-          echo "should_run=true" >> $GITHUB_OUTPUT
-          GROUP='${{ matrix.test_group.name }}'
-          # Force run if 'full ci' label exists
-          if [ "${{ contains(github.event.pull_request.labels.*.name, 'full ci') }}" == "true" ]; then 
-            echo "should_run=true" >> $GITHUB_OUTPUT; exit 0
-          fi
-
-          if [[ "$GROUP" == "pytorch_"* ]]; then
-            CHANGED='${{ needs.detect_changes.outputs.qa_l0 }}'
-          else
-            CHANGED='${{ needs.detect_changes.outputs.core }}'
-          fi
-          
-          # For debugging, you can force this to true
-          echo "should_run=true" >> $GITHUB_OUTPUT
-
       # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
       - name: Configure Git Safe Directory on Cuda
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        if: inputs.platform == 'cuda'
         run: /usr/bin/git config --global safe.directory '*'
 
       - name: Checkout Source Code on Cuda (attempt 1)
         id: checkout1
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        if: inputs.platform == 'cuda'
         uses: actions/checkout@v4
         continue-on-error: true
         with:
@@ -132,7 +75,7 @@ jobs:
 
       - name: Checkout Source Code on Cuda (attempt 2)
         id: checkout2
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
         uses: actions/checkout@v4
         continue-on-error: true
         with:
@@ -142,116 +85,33 @@ jobs:
 
       - name: Checkout Source Code on Cuda (attempt 3)
         id: checkout3
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
           submodules: recursive
           set-safe-directory: true
 
+      # Metax requires to clean vscode-remote-container
+      - name: Configure Clean Git Env on Metax
+        if: inputs.platform == 'metax'
+        run: |
+          git config --global --unset-all credential.helper 2>/dev/null || true
+          git config --system --unset-all credential.helper 2>/dev/null || true
+
       # Metax no need submodules
       - name: Checkout Source Code on Metax
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
+        if: inputs.platform == 'metax'
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
       
-      - name: Environment Setup on Cuda
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+      - name: Environment Setup
+        if: inputs.setup_script != ''
         run: |
-          set -euo pipefail
-
-          echo "===== Step 0: Activate Python environment ====="
-          source /opt/miniconda3/etc/profile.d/conda.sh
-          conda activate flagscale-train
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "Python: $(which python3) ($(python3 --version 2>&1))"
-
-          echo "===== Step 1: Remove Existing TransformerEngine ====="
-          pip uninstall transformer_engine transformer_engine_torch -y || true
-
-          echo "===== Step 2: Build & Install TransformerEngine ====="
-          cd $GITHUB_WORKSPACE
-
-          pip install nvdlfw-inspect --quiet
-          pip install expecttest --quiet
-          pip install . -v --no-deps --no-build-isolation
-
-          echo "===== Step 3: Verify Installation ====="
-          python3 tests/pytorch/test_sanity_import.py
-
-          echo "===== Environment Setup Complete ===== "
-
-      - name: Environment Setup on Metax
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
-        run: |
-          set -euo pipefail
-
-          echo "===== Step 0: Activate Python environment ====="
-          source /opt/conda/etc/profile.d/conda.sh
-          conda activate base
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "Python: $(which python3) ($(python3 --version 2>&1))"
-
-          echo "===== Step 1: Base Environment Setup ====="
-          # Configure MACA toolchain paths
-          export PATH=/opt/maca/bin:$PATH
-          export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
-          service ssh restart
-
-          echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
-          # TransformerEngine expects nvcc, but MACA provides cucc
-          ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
-          which nvcc || true
-
-          echo "===== Step 3: Install Required System Tools ====="
-          # Install essential build tools (avoid modifying Python dependencies)
-          apt-get update -qq && apt-get install -y -qq git cmake ninja-build curl
-
-          echo "===== Step 4: Remove Existing TransformerEngine ====="
-          # Prevent conflicts with preinstalled or incompatible versions
-          python3 -m pip uninstall transformer_engine -y || true
-          python3 -m pip install nvdlfw-inspect --quiet
-          python3 -m pip install expecttest --quiet
-
-          # echo "===== Step 5: Install Metax Binary Backend ====="
-          # # Install prebuilt Metax backend (required for MACA operators)
-          # WHL_PATH="/home/muxiuser/transformer_engine_metax-2.9.0-cp312-cp312-linux_x86_64.whl"
-          # if [ ! -f "$WHL_PATH" ]; then
-          #   echo "ERROR: Wheel file not found at $WHL_PATH"
-          #   echo "Please verify volume mount: -v /home/muxiuser:/home/muxiuser"
-          #   exit 1
-          # fi
-
-          # # Use --no-deps to avoid overwriting Metax-optimized PyTorch
-          # python3 -m pip install "$WHL_PATH" --no-deps --force-reinstall
-
-          # echo "===== Step 6: Verify Metax Backend ====="
-          # # Ensure transformer_engine_torch is correctly loaded
-          # python3 - <<'EOF'
-          # import transformer_engine_torch as te
-          # print("Backend loaded successfully:", te)
-          # EOF
-
-          echo "===== Step 7: Install TE-FL Plugin Layer ====="
-          # Install TransformerEngine-FL Python layer (plugin logic)
-          # cd /workspace/TransformerEngine-FL
-          cd $GITHUB_WORKSPACE
-          TE_FL_SKIP_CUDA=1 python3 setup.py install
-
-          echo "===== Step 8: Final Verification ====="
-          # Verify both TE Python API and backend are functional
-          python3 - <<'EOF'
-          import transformer_engine
-          import transformer_engine_torch as te
-          print("transformer_engine:", transformer_engine)
-          print("transformer_engine_torch:", te)
-          EOF
-
-          echo "===== Environment Setup Complete ===== "
+          bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
       
       - name: Execute Tests
-        if: steps.should_run.outputs.should_run == 'true'
         working-directory: ${{ github.workspace }}
         run: |
           set -euo pipefail
@@ -265,6 +125,16 @@ jobs:
           for k, v in env.items():
               print(f'{k}={v}')
           ")
+          
+          # Activate conda environment
+          if ${{inputs.platform == 'metax'}}; then
+            source /opt/conda/etc/profile.d/conda.sh
+            conda activate base
+          else
+            source /opt/miniconda3/etc/profile.d/conda.sh
+            conda activate flagscale-train
+          fi
+          echo "PATH=$PATH" >> $GITHUB_ENV
 
           export TE_PATH=$GITHUB_WORKSPACE
           export TE_LIB_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])")
@@ -284,19 +154,14 @@ jobs:
 
           # Coverage setup: install once + configure collection via PYTEST_ADDOPTS
           COVERAGE_ENABLED=false
-          if [ "${{ inputs.upload_coverage }}" = "true" ] && [ "${{ matrix.test_group.test_type }}" = "unittest" ]; then
-            if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
-              export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
-              COVERAGE_ENABLED=true
-            else
-              echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
-            fi
+          if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
+            export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
+            COVERAGE_ENABLED=true
+          else
+            echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
           fi
 
-          if [[ "${{ matrix.test_group.name }}" == *"lint"* ]]; then
-            export CPP_ONLY=0
-            export PYTHON_ONLY=0
-          elif [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
+          if [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
             # Fail fast on backend/API mismatch before running the full test group.
             # Skip for debug group (does not use FP8/optimizer symbols).
             python3 -c "import sys, importlib; import transformer_engine.common as _te_common; tex = importlib.import_module('transformer_engine_torch'); required=['multi_tensor_scale','multi_tensor_compute_scale_and_scale_inv']; missing=[n for n in required if not hasattr(tex, n)]; print('[TE check] module:', tex); print('[TE check] file:', getattr(tex, '__file__', 'N/A')); print('[TE check] missing:', ', '.join(missing) if missing else 'none'); sys.exit(1 if missing else 0)"
@@ -313,12 +178,10 @@ jobs:
               --include="transformer_engine/*" 2>/dev/null \
               || echo "WARNING: No coverage data found"
           fi
-
           exit $exit_code
         timeout-minutes: 60
 
       - name: Upload Coverage Report
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: actions/upload-artifact@v4
         continue-on-error: true
         with:
@@ -327,7 +190,6 @@ jobs:
             coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json
 
       - name: Upload Coverage Report to FlagCICD
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: flagos-ai/FlagOps/actions/post-pytest-report@v2
         continue-on-error: true
         env:
@@ -336,12 +198,4 @@ jobs:
           backend_url: 'http://flagcicd-inner.flagos.net:8000/metrics/'
           user_id: '000000000000000000'
           report_path: 'coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json'
-          fail_on_error: 'false'
-
-      # - name: Debug - keep container alive on failure
-      #   if: failure()
-      #   run: |
-      #     echo "Container sleeping for 200 minutes for debugging..."
-      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
-      #     sleep 60000
-      #   timeout-minutes: 200
\ No newline at end of file
+          fail_on_error: 'false'
\ No newline at end of file
diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend
index f0c638223e..7500fd8427 160000
--- a/3rdparty/cudnn-frontend
+++ b/3rdparty/cudnn-frontend
@@ -1 +1 @@
-Subproject commit f0c638223eac20a9676941a110c9ad9e9842941d
+Subproject commit 7500fd8427a24a76fadac9f2108106fd22c62737
diff --git a/3rdparty/googletest b/3rdparty/googletest
index a35bc7693c..94be250af7 160000
--- a/3rdparty/googletest
+++ b/3rdparty/googletest
@@ -1 +1 @@
-Subproject commit a35bc7693c117a048152beeb34f6aac354b9423f
+Subproject commit 94be250af7e14c58dcbf476972d2d7141551ff67
diff --git a/qa/L0_pytorch_debug_unittest/README.rst b/qa/L0_pytorch_debug_unittest/README.rst
new file mode 100644
index 0000000000..2ba6e9fb0c
--- /dev/null
+++ b/qa/L0_pytorch_debug_unittest/README.rst
@@ -0,0 +1,26 @@
+L0 PyTorch Debug Unittest
+=========================
+
+This directory contains the L0 PyTorch debug unittest runner.
+
+MetaX ignore rules
+------------------
+
+MetaX-specific ignored tests are maintained in one place in ``test.sh`` through
+the ``METAX_IGNORED_TESTS`` list.
+
+The main execution flow only calls a helper to decide whether a test should be
+skipped, instead of embedding platform-specific matching rules directly in the
+main logic.
+
+This keeps the script easier to maintain and makes it simpler to add new
+ignored cases later if needed.
+
+How to extend
+-------------
+
+If a new test needs to be skipped on MetaX:
+
+1. Add the full test path to ``METAX_IGNORED_TESTS`` in ``test.sh``.
+2. Avoid adding new platform-specific matching logic directly into the main
+   execution flow.
\ No newline at end of file
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index 5be88dfe4a..2ab7340986 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -7,6 +7,7 @@
 : ${TE_PATH:=/opt/transformerengine}
 : ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
 : ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/}
+
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
@@ -20,24 +21,37 @@ FAIL=0
 # because it is not available on PyPI.
 pip install pytest==8.2.1
 
+METAX_IGNORED_TESTS=(
+    "$TE_PATH/tests/pytorch/test_numerics.py"
+    "$TE_PATH/tests/pytorch/test_sanity.py"
+)
+
+should_skip_on_metax() {
+    local test_path=$1
+
+    [ "$PLATFORM" = "metax" ] || return 1
+
+    local ignored_test
+    for ignored_test in "${METAX_IGNORED_TESTS[@]}"; do
+        if [ "$test_path" = "$ignored_test" ]; then
+            echo "[SKIP] Platform MetaX: Ignoring $test_path"
+            return 0
+        fi
+    done
+
+    return 1
+}
+
+
 run_test_step() {
     local xml_file=$1
     local test_path=$2
     local cmd=$3
 
-
-    if [ "$PLATFORM" = "metax" ]; then
-        case "$test_path" in
-            *"test_numerics.py" | *"test_api_features.py" | *"test_sanity.py")
-                echo "-------------------------------------------------------"
-                echo "[SKIP] Platform MetaX: Ignoring $test_path"
-                echo "-------------------------------------------------------"
-                return 0
-                ;;
-        esac
+    if should_skip_on_metax "$test_path"; then
+        return 0
     fi
 
-
     echo "-------------------------------------------------------"
     echo "[RUN] Executing: $test_path"
     eval "$cmd" || FAIL=1
@@ -70,8 +84,6 @@ run_test_step "test_perf.xml" "$TE_PATH/tests/pytorch/debug/test_perf.py" \
 "pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR"
 
 
-
-
 # Step 7: Sanity 2
 run_test_step "test_sanity_2.xml" "$TE_PATH/tests/pytorch/test_sanity.py" \
 "NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 \
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index 99a1370ac4..bc4362e23d 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -22,13 +22,11 @@ run_test_step() {
     local cmd=$3
     local label=$4
 
-
     if [ "$PLATFORM" = "metax" ]; then
         case "$test_path" in
             *"test_numerics.py" | \
             *"test_sanity.py" | \
             *"test_parallel_cross_entropy.py" | \
-            *"test_cuda_graphs.py" | \
             *"test_fused_rope.py" | \
             *"test_gqa.py" | \
             *"test_fused_optimizer.py" | \
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index 04860a9729..46b54ed30d 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -15,29 +15,134 @@ function test_fail() {
 
 RET=0
 FAILED_CASES=""
+DEBUG_TESTS_READY=0
 
 : ${TE_PATH:=/opt/transformerengine}
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
+# The current CUDA 12.8 test container hits a fused-attention runtime loader
+# issue, so keep the distributed numerics suite on the unfused attention path.
+export NVTE_FLASH_ATTN="${NVTE_FLASH_ATTN:-0}"
+export NVTE_FUSED_ATTN="${NVTE_FUSED_ATTN:-0}"
+export NVTE_UNFUSED_ATTN="${NVTE_UNFUSED_ATTN:-1}"
+
+# Make CUDA runtime libraries discoverable for fused attention kernels.
+if [ -z "${CUDA_HOME:-}" ]; then
+    if [ -d /usr/local/cuda ]; then
+        export CUDA_HOME=/usr/local/cuda
+    elif [ -d /usr/local/cuda-12.8 ]; then
+        export CUDA_HOME=/usr/local/cuda-12.8
+    fi
+fi
+export CUDA_PATH="${CUDA_PATH:-${CUDA_HOME:-}}"
+
+CUDA_LIB_DIRS=()
+for path in \
+    "${CUDA_HOME:-}/lib64" \
+    "${CUDA_HOME:-}/targets/x86_64-linux/lib" \
+    "$(python3 - <<'PY'
+import site
+from pathlib import Path
+
+for root in site.getsitepackages():
+    candidate = Path(root) / "torch" / "lib"
+    if candidate.exists():
+        print(candidate)
+        break
+PY
+)" \
+    "$(python3 - <<'PY'
+import site
+from pathlib import Path
+
+for root in site.getsitepackages():
+    candidate = Path(root) / "nvidia" / "cuda_runtime" / "lib"
+    if candidate.exists():
+        print(candidate)
+        break
+PY
+)"; do
+    if [ -n "$path" ] && [ -d "$path" ]; then
+        CUDA_LIB_DIRS+=("$path")
+    fi
+done
+
+if [ "${#CUDA_LIB_DIRS[@]}" -gt 0 ]; then
+    CUDA_LIB_PATH="$(IFS=:; echo "${CUDA_LIB_DIRS[*]}")"
+    export LD_LIBRARY_PATH="${CUDA_LIB_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+fi
+
+python3 - <<'PY'
+import ctypes
+
+for name in ("libcudart.so", "libcudart.so.12"):
+    try:
+        ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL)
+        print(f"[CUDA] Preloaded {name}")
+        break
+    except OSError as exc:
+        print(f"[CUDA] Failed to preload {name}: {exc}")
+PY
+
 
 # It is not installed as a requirement,
 # because it is not available on PyPI.
 pip uninstall -y nvdlfw-inspect
-pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
+if pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git && \
+   python3 -c "import nvdlfw_inspect.api" >/dev/null 2>&1; then
+    DEBUG_TESTS_READY=1
+else
+    echo "Warning: nvdlfw_inspect is unavailable; debug numerics test will be skipped"
+fi
 
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
+run_test_step() {
+    local xml_file=$1
+    local test_path=$2
+    local cmd=$3
+    local label=$4
+
+    if [ "$PLATFORM" = "metax" ]; then
+        case "$test_path" in
+            *"test_numerics.py" | \
+            *"test_numerics_exact.py" | \
+            *"test_torch_fsdp2.py" | \
+            *"test_cast_master_weights_to_fp8.py")
+                echo "-------------------------------------------------------"
+                echo "[SKIP] Platform MetaX: Ignoring $label"
+                echo "-------------------------------------------------------"
+                return 0
+                ;;
+        esac
+    fi
+
+    echo "-------------------------------------------------------"
+    echo "[RUN] Executing: $label"
+    eval "$cmd" || test_fail "$label"
+}
+
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py || test_fail "test_numerics_exact.py"
+run_test_step "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+"test_numerics.py"
+run_test_step "pytest_test_numerics_exact.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \
+"test_numerics_exact.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k "not (test_distributed)" || test_fail "test_torch_fsdp2.py"
+run_test_step "pytest_test_torch_fsdp2.xml" "$TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k 'not (test_distributed)'" \
+"test_torch_fsdp2.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py || test_fail "test_cp_utils.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
+run_test_step "pytest_test_cp_utils.xml" "$TE_PATH/tests/pytorch/attention/test_cp_utils.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py" \
+"test_cp_utils.py"
+run_test_step "pytest_test_cast_master_weights_to_fp8.xml" "$TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \
+"test_cast_master_weights_to_fp8.py"
 
 
 # debug tests
@@ -50,7 +155,13 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_
 
 # pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
 # standard numerics tests with initialized debug
-NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
+if [ "$DEBUG_TESTS_READY" -eq 1 ]; then
+    run_test_step "pytest_test_numerics_2.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+    "NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+    "test_numerics.py (debug)"
+else
+    echo "Skipping debug test_numerics.py because nvdlfw_inspect is unavailable"
+fi
 
 if [ "$RET" -ne 0 ]; then
     echo "Error in the following test cases:$FAILED_CASES"
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
index a5130a52d3..b4ccb8f9ad 100644
--- a/qa/L1_pytorch_mcore_integration/test.sh
+++ b/qa/L1_pytorch_mcore_integration/test.sh
@@ -4,69 +4,149 @@
 
 set -e
 
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+
+retry_command() {
+    local attempts=$1
+    local delay_seconds=$2
+    shift 2
+
+    local attempt
+    for attempt in $(seq 1 "${attempts}"); do
+        if "$@"; then
+            return 0
+        fi
+        if [ "${attempt}" -lt "${attempts}" ]; then
+            echo "Command failed (attempt ${attempt}/${attempts}): $*"
+            echo "Retrying in ${delay_seconds}s..."
+            sleep "${delay_seconds}"
+        fi
+    done
+
+    echo "Command failed after ${attempts} attempts: $*"
+    return 1
+}
+
 # Paths
-: ${TE_PATH:=/opt/transformerengine}
-: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+: "${TE_PATH:=$(cd -- "${SCRIPT_DIR}/../.." && pwd)}"
+: "${MCORE_PATH:=/workspace/Megatron-LM-FL}"
+: "${MCORE_REPO_URL:=https://github.com/flagos-ai/Megatron-LM-FL.git}"
+: "${MCORE_REF:=main}"
+: "${OUTPUT_DIR:=${TE_PATH}/qa/L1_pytorch_mcore_integration/output}"
+: "${DATA_CACHE_PATH:=/tmp/data_cache}"
 
 # Check whether FP8 is supported
-DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
-if [[ ${DEVICE_ARCH} -ge 89 ]]; then
-    WITH_FP8=1
+WITH_FP8=
+if command -v nvidia-smi &>/dev/null; then
+    DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+    if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+        WITH_FP8=1
+    fi
+elif command -v mx-smi &>/dev/null; then
+    # Metax hardware does not support FP8; leave WITH_FP8 unset
+    :
 fi
 
-# Download Megatron-LM if needed
+# Download or sync Megatron-LM-FL to the requested repo/ref.
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
-    git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    git config --global --unset-all credential.helper 2>/dev/null || true
+    git config --system --unset-all credential.helper 2>/dev/null || true
+    retry_command 3 5 git clone --depth 1 -b "${MCORE_REF}" "${MCORE_REPO_URL}" $(basename ${MCORE_PATH})
     popd
 fi
 
-# Create mock vocab
-VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
-printf "" > ${VOCAB_FILE}
-printf "{" >> ${VOCAB_FILE}
-printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
-seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
-printf "}" >> ${VOCAB_FILE}
+if [ -d "${MCORE_PATH}/.git" ]; then
+    git -C "${MCORE_PATH}" remote set-url origin "${MCORE_REPO_URL}"
+    retry_command 3 5 git -C "${MCORE_PATH}" fetch --depth 1 origin "${MCORE_REF}"
+    git -C "${MCORE_PATH}" checkout -B "${MCORE_REF}" "FETCH_HEAD"
+fi
+
+# Megatron-LM-FL tokenizer imports happen at module import time, so direct
+# source execution needs these Python deps available before pretrain_gpt.py
+# starts.
+python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six regex
+import regex
+import six
+print(f"six available: {six.__version__}")
+print(f"regex available: {regex.__version__}")
+PY
+
+CHECKPOINT_DIR=${OUTPUT_DIR}/checkpoints
+TENSORBOARD_DIR=${OUTPUT_DIR}/tensorboard
+mkdir -p "${CHECKPOINT_DIR}" "${TENSORBOARD_DIR}" "${DATA_CACHE_PATH}" /tmp/checkpoints
+
+echo "Using Megatron-LM-FL repo: ${MCORE_REPO_URL}"
+echo "Using Megatron-LM-FL ref: ${MCORE_REF}"
+git -C "${MCORE_PATH}" rev-parse --short HEAD
 
-# Megatron-LM invocation
+# Megatron-LM-FL invocation. Keep the argument shape aligned with the
+# previously validated tp1/pp1 mock-data GPT functional case while letting CI
+# exit after a few steps.
 COMMAND="
 NVTE_TORCH_COMPILE=0
 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
-NVTE_FLASH_ATTN=1
-NVTE_FWD_LAYERNORM_SM_MARGIN=0
-NVTE_BWD_LAYERNORM_SM_MARGIN=0
 CUDA_DEVICE_MAX_CONNECTIONS=1
-NVTE_BIAS_GELU_NVFUSION=0
-NVTE_BIAS_DROPOUT_FUSION=0
+NCCL_ALGO=Ring
+CUBLAS_WORKSPACE_CONFIG=:4096:8
 
-python3
--m torch.distributed.launch
---use_env
+torchrun
 --nnodes=1
 --nproc_per_node=1
 
 ${MCORE_PATH}/pretrain_gpt.py
 --tensor-model-parallel-size 1
 --pipeline-model-parallel-size 1
---use-cpu-initialization
---num-layers 2
---hidden-size 128
+--num-layers 12
+--hidden-size 512
 --num-attention-heads 8
---seq-length 128
---max-position-embeddings 128
---micro-batch-size 1
---global-batch-size 8
---train-iters 10
+--log-params-norm
+--log-num-zeros-in-grad
+--log-validation-ppl-to-tensorboard
+--log-timers-to-tensorboard
+--seq-length 1024
+--max-position-embeddings 1024
+--micro-batch-size 4
+--global-batch-size 32
+--train-iters 50
 --eval-iters 10
---lr 1e-4
+--timing-log-level 0
+--lr-decay-iters 320000
+--save ${CHECKPOINT_DIR}
+--split 949,50,1
+--tokenizer-type NullTokenizer
+--vocab-size 8192
 --mock-data
---vocab-file ${VOCAB_FILE}
---merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--distributed-backend nccl
+--lr 0.00015
+--lr-decay-style cosine
+--min-lr 1.0e-5
+--weight-decay 1e-2
+--clip-grad 1.0
+--lr-warmup-fraction .01
+--log-interval 1
+--save-interval 10000
+--eval-interval 1000
 --transformer-impl transformer_engine
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--deterministic-mode
+--no-gradient-accumulation-fusion
+--attention-softmax-in-fp32
+--use-mcore-models
+--ckpt-format torch_dist
+--dist-ckpt-optim-fully-reshardable
+--dist-ckpt-strictness log_all
+--data-cache-path ${DATA_CACHE_PATH}
+--bf16
+--attention-backend unfused
+--log-memory-to-tensorboard
+--tensorboard-dir ${TENSORBOARD_DIR}
+--exit-interval 4
 ${WITH_FP8:+--fp8-format hybrid}
 "
 COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
 
-# Launch Megatron-LM
+# Launch Megatron-LM-FL
 bash -c "${COMMAND}"
diff --git a/qa/L1_pytorch_mcore_integration/test_bak.sh b/qa/L1_pytorch_mcore_integration/test_bak.sh
new file mode 100644
index 0000000000..ec0b47b695
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/test_bak.sh
@@ -0,0 +1,79 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+# Paths
+: ${TE_PATH:=/opt/transformerengine}
+: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+
+# Check whether FP8 is supported
+DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+    WITH_FP8=1
+fi
+
+# Download Megatron-LM if needed
+if [ ! -d "${MCORE_PATH}" ]; then
+    pushd $(dirname ${MCORE_PATH})
+    git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    popd
+fi
+
+# Megatron tokenizer import chain pulls in bert_tokenization at module import
+# time, which unconditionally depends on `six`.
+python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six
+import six
+print(f"six available: {six.__version__}")
+PY
+
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
+# Megatron-LM invocation
+COMMAND="
+NVTE_TORCH_COMPILE=0
+NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+NVTE_FLASH_ATTN=1
+NVTE_FWD_LAYERNORM_SM_MARGIN=0
+NVTE_BWD_LAYERNORM_SM_MARGIN=0
+CUDA_DEVICE_MAX_CONNECTIONS=1
+NVTE_BIAS_GELU_NVFUSION=0
+NVTE_BIAS_DROPOUT_FUSION=0
+
+python3
+-m torch.distributed.launch
+--use_env
+--nnodes=1
+--nproc_per_node=1
+
+${MCORE_PATH}/pretrain_gpt.py
+--tensor-model-parallel-size 1
+--pipeline-model-parallel-size 1
+--use-cpu-initialization
+--num-layers 2
+--hidden-size 128
+--num-attention-heads 8
+--seq-length 128
+--max-position-embeddings 128
+--micro-batch-size 1
+--global-batch-size 8
+--train-iters 10
+--eval-iters 10
+--lr 1e-4
+--mock-data
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--transformer-impl transformer_engine
+${WITH_FP8:+--fp8-format hybrid}
+"
+COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
+
+# Launch Megatron-LM
+bash -c "${COMMAND}"
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 4309cc4a2e..4045997666 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -14,7 +14,6 @@ def _load_cuda_libs():
     import subprocess
     from pathlib import Path
     import importlib.util
-    import sysconfig
     import platform
     import glob as glob_module
 
@@ -154,7 +153,9 @@ def get_attention_backend(self, attention_params=None):
                      fused_attention_backend, use_unfused_attention, available_backends)
         """
         # Import the original get_attention_backend function
-        from transformer_engine.pytorch.attention.dot_product_attention import utils as dpa_utils
+        from transformer_engine.pytorch.attention.dot_product_attention import (
+            utils as dpa_utils,
+        )
 
         return dpa_utils._original_get_attention_backend(attention_params)
 
@@ -536,7 +537,15 @@ def layernorm_fwd(
         tex = self._get_tex()
         otype = tex.DType(int(otype)) if otype is not None else None
         return tex.layernorm_fwd(
-            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+            input,
+            weight,
+            bias,
+            eps,
+            ln_out,
+            quantizer,
+            otype,
+            sm_margin,
+            zero_centered_gamma,
         )
 
     def layernorm_bwd(
@@ -746,7 +755,12 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer,
+            amax_histories,
+            scales,
+            amax_compute_algo,
+            fp8_dtype,
+            margin,
         )
 
     def fp8_block_scaling_compute_partial_amax(
@@ -1028,7 +1042,14 @@ def fused_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            input,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_rope_backward(
@@ -1293,7 +1314,13 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
+            out,
+            out_per_step,
+            lse,
+            lse_per_step,
+            cu_seqlens,
+            only_second_half,
+            lse_packed,
         )
 
     def thd_grad_correction(

From ae664ea4326f6c1f2bcdf6121b700dd748b02ef5 Mon Sep 17 00:00:00 2001
From: BrianPei <kaworu228@gmail.com>
Date: Fri, 24 Apr 2026 18:04:58 +0800
Subject: [PATCH 52/59] [CICD] Refactor workflows, Add integration_tests,
 Switch to FlagCICD metax runner (#60)

Refactors CI/CD workflows to support both CUDA (NVIDIA A100) and Metax
(C500) platforms, removes obsolete workflows, and fixes several
platform-specific test failures. Add functional testing, and log
reporting, with significant workflow simplification, and Metax platform
use BAAI runner configs.

---

- [x] New feature (non-breaking change which adds functionality)
- [x] Infra/Build change (changes to CI/CD workflows or build scripts)
- [x] Code refactoring
- [x] Bug fix
- [ ] Documentation change
- [ ] Breaking change

---

- **Workflow cleanup**: Removed 7 obsolete workflows; extracted lint
into a standalone reusable `lint_common.yml` (runs in parallel); add
`integration_tests_common.yml`
- **Platform refactoring**: Added per-platform setup scripts
(`setup_cuda.sh` / `setup_metax.sh`); switched Metax config to BAAI
online environment; removed unsupported test types (JAX distributed)
from Metax matrix
- **Bug fixes**:
- Metax: skip incompatible distributed test files (`test_numerics`,
`test_torch_fsdp2`, etc.) to prevent `torchrun` SIGSEGV
- Metax: replace `nvidia-smi`-only FP8 detection with platform-aware
check
- CUDA: fix `libcudart` load failure when runtime is pip-installed (add
proper fallback chain in `_load_cudart()` and `try_load_lib`)

---

- [x] I have read and followed the contributing guidelines
- [x] The functionality is complete
- [x] I have commented my code, particularly in CI workflow setup steps
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added/updated tests that prove my feature works on CUDA and
Metax platform
- [x] New and existing unit tests pass locally on CUDA and Metax
platform

---------

Co-authored-by: qqjxzxq <1376782660@qq.com>
Co-authored-by: HermiaHuan <3081497279@qq.com>
---
 .github/configs/cuda.yml                      |  25 +-
 .github/configs/metax.yml                     |  47 ++--
 .github/scripts/setup_cuda.sh                 |  25 ++
 .github/scripts/setup_metax.sh                |  50 ++++
 .github/workflows/all_tests_common.yml        | 123 ++++++----
 .github/workflows/all_tests_cuda.yml          |   2 +
 .github/workflows/all_tests_metax.yml         |   9 +-
 .github/workflows/build.yml                   | 178 ++++----------
 .github/workflows/functional_tests_common.yml | 190 ---------------
 .../workflows/integration_tests_common.yml    | 134 +++++++++++
 .../qa-l0-te-cpp-unittest-pytorch-lint.yml    |  18 +-
 .../workflows/qa-l1-te-cpp-pytorch-tests.yml  |  51 ++--
 .../qa-l3-te-pytorch-fa-versions-test.yml     |  13 +-
 .github/workflows/te-plugin-tests.yml         |   4 +-
 .github/workflows/unit_tests_common.yml       | 220 +++---------------
 qa/L0_pytorch_debug_unittest/README.rst       |  26 +++
 qa/L0_pytorch_debug_unittest/test.sh          |  62 +++--
 qa/L0_pytorch_unittest/test.sh                | 151 ++++++++----
 qa/L1_pytorch_distributed_unittest/test.sh    | 131 ++++++++++-
 qa/L1_pytorch_mcore_integration/test.sh       | 150 +++++++++---
 qa/L1_pytorch_mcore_integration/test_bak.sh   |  79 +++++++
 .../plugin/core/backends/vendor/cuda/cuda.py  |  39 +++-
 22 files changed, 960 insertions(+), 767 deletions(-)
 create mode 100755 .github/scripts/setup_cuda.sh
 create mode 100755 .github/scripts/setup_metax.sh
 delete mode 100644 .github/workflows/functional_tests_common.yml
 create mode 100644 .github/workflows/integration_tests_common.yml
 create mode 100644 qa/L0_pytorch_debug_unittest/README.rst
 create mode 100644 qa/L1_pytorch_mcore_integration/test_bak.sh

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index 6975fab589..1c77fe6c25 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -1,26 +1,28 @@
 # CUDA Hardware Configuration for TransformerEngine-FL
-# Refactored for BAAI DGX A100 Nodes
+# Refactored for A100 Nodes
 # This file defines environment variables, volumes, and test filters for TE tests.
 
 hardware_name: cuda
 display_name: 'NVIDIA CUDA (A100)'
 
+# CI image for online env
 ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
 
 # Runner labels for self-hosted A100 node
 # runner_labels:
-# - self-hosted
-# - Linux
-# - X64
-# - nvidia
-# - gpu-8
+#   - self-hosted
+#   - Linux
+#   - X64
+#   - nvidia
+#   - gpu-8
+
+# Runner labels for online env
 runner_labels:
   - nv-8g-cicd-te
 
 # Container volumes
 container_volumes:
   - /home/flagscale_cicd/flask/static:/workspace/report
-  # - /home/flagscale_cicd/data:/opt/data
 
 # Container options
 container_options: >-
@@ -32,9 +34,8 @@ container_options: >-
   --ulimit stack=67108864 
   --user root
 
-# Device types
-device_types:
-  - a100
+# Platform-specific environment setup script
+setup_script: .github/scripts/setup_cuda.sh
 
 # Build environment variables (platform-specific)
 build_env:
@@ -47,6 +48,10 @@ build_env:
   CUDA_HOME: /usr/local/cuda-12.8
   NVCC: /usr/local/cuda-12.8/bin/nvcc
 
+# Device types to run tests on
+device_types:
+  - a100
+
 # Test matrix configuration
 test_matrix:
   l0_pytorch:
diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
index e3b10c892d..00b4e1df34 100644
--- a/.github/configs/metax.yml
+++ b/.github/configs/metax.yml
@@ -1,28 +1,33 @@
 # Metax Hardware Configuration for TE-FL
 # This file defines CI/CD settings for Metax-based testing
-# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
+# This file defines environment variables, volumes, and test filters for TE tests.
 
 hardware_name: metax
 display_name: 'Metax Tests'
 
-ci_image: localhost:5000/megatron-lm-with-te:v1
-# ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
+# CI image for Metax dev env
+# ci_image: localhost:5000/megatron-lm-with-te:v1
 
-runner_labels:
-  - self-hosted
-  - Linux
-  - X64
-  - metax
-  - dev
+# CI image for online env
+ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839
+
+# Runner labels for self-hosted Metax node
 # runner_labels:
-#   - mx-4g-cicd-te
+#   - self-hosted
+#   - Linux
+#   - X64
+#   - metax
+#   - dev
+
+# Runner labels for online env
+runner_labels:
+  - mx-4g-cicd-te
 
+# Container volumes
 container_volumes:
   - /nfs/metax_fs:/nfs/metax_fs
-  - /dev/dri:/dev/dri
-  - /dev/mxcd:/dev/mxcd
-  - /dev/infiniband:/dev/infiniband
 
+# Container options
 container_options: >-
   --uts=host
   --ipc=host
@@ -30,17 +35,16 @@ container_options: >-
   --group-add video
   --shm-size=100gb
   --ulimit memlock=-1
-  --security-opt seccomp=unconfined
-  --security-opt apparmor=unconfined
-  --device=/dev/dri
-  --device=/dev/mxcd
-  --device=/dev/infiniband
   --user root
   --ulimit nofile=65535:65535
   -e PLATFORM=metax
   -e TORCH_DISTRIBUTED_BACKEND=mccl
   -e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH
 
+# Platform-specific environment setup script
+setup_script: .github/scripts/setup_metax.sh
+
+# Build environment variables (platform-specific)
 build_env:
   TE_FL_SKIP_CUDA: '1'
   NVTE_WITH_MACA: '1'
@@ -62,10 +66,3 @@ test_matrix:
       # example: tests/unit_tests/test_example.py
       # - tests/unit_tests/test_inference.py
       # - tests/unit_tests/test_rl_utils.py
-
-  # functional:
-  #   train:
-  #     - device: c500
-  #       task: train
-  #       model: deepseek
-  #       case: tp2_pp2_ep2
diff --git a/.github/scripts/setup_cuda.sh b/.github/scripts/setup_cuda.sh
new file mode 100755
index 0000000000..f9e289c6d0
--- /dev/null
+++ b/.github/scripts/setup_cuda.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# CUDA Platform Environment Setup Script
+# Called by unit_tests_common.yml for CUDA platforms (A100, H100, etc.)
+set -euo pipefail
+
+echo "===== Step 0: Activate Python environment ====="
+source /opt/miniconda3/etc/profile.d/conda.sh
+conda activate flagscale-train
+echo "PATH=$PATH" >> $GITHUB_ENV
+echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+echo "===== Step 1: Remove Existing TransformerEngine ====="
+pip uninstall transformer_engine transformer_engine_torch -y || true
+
+echo "===== Step 2: Build & Install TransformerEngine ====="
+cd $GITHUB_WORKSPACE
+
+pip install nvdlfw-inspect --quiet
+pip install expecttest --quiet
+pip install . -v --no-deps --no-build-isolation
+
+echo "===== Step 3: Verify Installation ====="
+python3 tests/pytorch/test_sanity_import.py
+
+echo "===== Environment Setup Complete ====="
diff --git a/.github/scripts/setup_metax.sh b/.github/scripts/setup_metax.sh
new file mode 100755
index 0000000000..a2d0b0a4cf
--- /dev/null
+++ b/.github/scripts/setup_metax.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Metax Platform Environment Setup Script
+# Called by unit_tests_common.yml for Metax platforms (C500, etc.)
+set -euo pipefail
+
+echo "===== Step 0: Activate Python environment ====="
+source /opt/conda/etc/profile.d/conda.sh
+conda activate base
+echo "PATH=$PATH" >> $GITHUB_ENV
+echo "Python: $(which python3) ($(python3 --version 2>&1))"
+
+echo "===== Step 1: Base Environment Setup ====="
+# Configure MACA toolchain paths
+export PATH=/opt/maca/bin:$PATH
+export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
+service ssh restart
+
+echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
+# TransformerEngine expects nvcc, but MACA provides cucc
+ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
+which nvcc || true
+
+echo "===== Step 3: Install Required System Tools ====="
+# Use apt to install git, curl
+sed -i 's|http://mirrors.aliyun.com/ubuntu|http://archive.ubuntu.com/ubuntu|g' /etc/apt/sources.list
+apt-get update -qq || true
+apt-get install -y -qq git curl
+# Install cmake and ninja via pip (more reliable than apt in this env)
+python3 -m pip install cmake ninja torch --no-cache-dir
+
+echo "===== Step 4: Remove Existing TransformerEngine ====="
+# Prevent conflicts with preinstalled or incompatible versions
+python3 -m pip uninstall transformer_engine -y || true
+python3 -m pip install nvdlfw-inspect --no-deps || true
+
+echo "===== Step 5: Install TE-FL Plugin Layer ====="
+# Install TransformerEngine-FL Python layer (plugin logic)
+cd $GITHUB_WORKSPACE
+TE_FL_SKIP_CUDA=1 python3 setup.py install
+
+echo "===== Step 6: Final Verification ====="
+# Verify both TE Python API and backend are functional
+python3 - <<'EOF'
+import transformer_engine
+import transformer_engine_torch as te
+print("transformer_engine:", transformer_engine)
+print("transformer_engine_torch:", te)
+EOF
+
+echo "===== Environment Setup Complete ====="
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index 2165de9b49..606a0d3e86 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -7,13 +7,20 @@ on:
         required: true
         type: string
         description: Platform name (e.g., cuda, default)
-      setup_commands:
+      run_unit_tests:
         required: false
-        type: string
-        default: ''
+        type: boolean
+        default: true
+        description: Whether to run unit tests in this workflow
+      run_integration_tests:
+        required: false
+        type: boolean
+        default: true
+        description: Whether to run integration tests in this workflow
 
 jobs:
   checkout_and_config:
+    name: checkout_and_config
     defaults:
       run:
         shell: bash
@@ -24,19 +31,12 @@ jobs:
       container_volumes: ${{ steps.config.outputs.container_volumes }}
       container_options: ${{ steps.config.outputs.container_options }}
       device_types: ${{ steps.config.outputs.device_types }}
-      train_test_matrix: ${{ steps.config.outputs.train_test_matrix }}
-      ignored_tests: ${{ steps.config.outputs.ignored_tests }}
+      setup_script: ${{ steps.config.outputs.setup_script }}
       build_env: ${{ steps.config.outputs.build_env }}
     steps:
       - name: Checkout source code
         uses: actions/checkout@v4
 
-      - name: Check if tests should run
-        id: should_run
-        run: |
-         
-          echo "should_run=true" >> $GITHUB_OUTPUT
-
       - name: Load platform configuration
         id: config
         run: |
@@ -71,26 +71,24 @@ jobs:
           DEVICE_TYPES=$(yq '.device_types | tojson(0)' "$CONFIG_FILE")
           echo "device_types=$DEVICE_TYPES" >> $GITHUB_OUTPUT
 
-          # Read test matrix for training
-          TRAIN_MATRIX=$(yq '.test_matrix.functional.train | tojson(0)' "$CONFIG_FILE")
-          echo "train_test_matrix=$TRAIN_MATRIX" >> $GITHUB_OUTPUT
-
-          # Read ignored tests list from test_matrix.unit (default to empty array if not defined)
-          IGNORED_TESTS=$(yq '.test_matrix.unit.ignored_tests // [] | tojson(0)' "$CONFIG_FILE")
-          echo "ignored_tests=$IGNORED_TESTS" >> $GITHUB_OUTPUT
+          # Read setup script path
+          SETUP_SCRIPT=$(yq '.setup_script // ""' "$CONFIG_FILE")
+          echo "setup_script=$SETUP_SCRIPT" >> $GITHUB_OUTPUT
 
           # Read build environment variables (default to empty object if not defined)
           BUILD_ENV=$(yq '.build_env // {} | tojson(0)' "$CONFIG_FILE")
           echo "build_env=$BUILD_ENV" >> $GITHUB_OUTPUT
 
   unit_tests:
-    needs: checkout_and_config
+    name: unit_tests
+    if: inputs.run_unit_tests
+    needs: 
+      - checkout_and_config
     strategy:
       fail-fast: false
       matrix:
         device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
     uses: ./.github/workflows/unit_tests_common.yml
-    name: unit_tests
     with:
       platform: ${{ inputs.platform }}
       device: ${{ matrix.device }}
@@ -98,24 +96,61 @@ jobs:
       runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
       container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
-      setup_commands: ${{ inputs.setup_commands }}
-      ignored_tests: ${{ needs.checkout_and_config.outputs.ignored_tests }}
+      setup_script: ${{ needs.checkout_and_config.outputs.setup_script }}
       build_env: ${{ needs.checkout_and_config.outputs.build_env }}
 
-  # arguments.py not compatible with megatron-core-fl
-  # functional_tests:
-  #   needs:
-  #     - checkout_and_config
-  #   if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null
-  #   uses: ./.github/workflows/functional_tests_common.yml
-  #   with:
-  #     platform: ${{ inputs.platform }}
-  #     test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }}
-  #     image: ${{ needs.checkout_and_config.outputs.ci_image }}
-  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
-  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
-  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+  unit_tests_complete:
+    name: unit_tests_complete
+    needs: 
+      - unit_tests
+    runs-on: ubuntu-latest
+    if: always() && inputs.run_unit_tests
+    steps:
+      - name: Check unit tests result
+        run: |
+          if [ "${{ needs.unit_tests.result }}" != "success" ] && \
+             [ "${{ needs.unit_tests.result }}" != "skipped" ]; then
+            echo "❌ Unit tests failed: ${{ needs.unit_tests.result }}"
+            exit 1
+          fi
+          echo "✅ Unit tests passed"
 
+  integration_tests:
+    name: integration_tests
+    if: inputs.run_integration_tests
+    needs:
+      - checkout_and_config
+      - unit_tests_complete
+    strategy:
+      fail-fast: false
+      matrix:
+        device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
+    uses: ./.github/workflows/integration_tests_common.yml
+    with:
+      platform: ${{ inputs.platform }}
+      device: ${{ matrix.device }}
+      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      setup_script: ${{ needs.checkout_and_config.outputs.setup_script }}
+      build_env: ${{ needs.checkout_and_config.outputs.build_env }}
+
+  integration_tests_complete:
+    name: integration_tests_complete
+    if: always() && inputs.run_integration_tests
+    needs: 
+      - integration_tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check integration tests result
+        run: |
+          if [ "${{ needs.integration_tests.result }}" != "success" ] && \
+             [ "${{ needs.integration_tests.result }}" != "skipped" ]; then
+            echo "❌ Integration tests failed: ${{ needs.integration_tests.result }}"
+            exit 1
+          fi
+          echo "✅ Integration tests passed"
 
   all_tests_complete:
     defaults:
@@ -123,8 +158,8 @@ jobs:
         shell: bash
     needs:
       - checkout_and_config
-      - unit_tests
-      # - functional_tests
+      - unit_tests_complete
+      - integration_tests_complete
     runs-on: ubuntu-latest
     if: always()
     steps:
@@ -133,15 +168,17 @@ jobs:
           # Check all test jobs (skip if not run)
           failed=false
 
-          if [ "${{ needs.unit_tests.result }}" != "success" ]; then
-            echo "❌ Unit tests failed"
+          if [ "${{ needs.unit_tests_complete.result }}" != "success" ] && \
+            [ "${{ needs.unit_tests_complete.result }}" != "skipped" ]; then
+            echo "❌ Unit tests failed or cancelled: ${{ needs.unit_tests_complete.result }}"
             failed=true
           fi
 
-          # if [ "${{ needs.functional_tests.result }}" != "success" ]; then
-          #   echo "❌ Training functional tests failed"
-          #   failed=true
-          # fi
+          if [ "${{ needs.integration_tests_complete.result }}" != "success" ] && \
+            [ "${{ needs.integration_tests_complete.result }}" != "skipped" ]; then
+            echo "❌ Integration tests failed or cancelled: ${{ needs.integration_tests_complete.result }}"
+            failed=true
+          fi
 
           if [ "$failed" = "true" ]; then
             exit 1
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index 0aa652f64b..cc7ade9f50 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -17,6 +17,8 @@ jobs:
     uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: cuda
+      run_unit_tests: true
+      run_integration_tests: true
 
   all_tests:
     needs: run_tests
diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml
index d3e496c4b2..0af545e291 100644
--- a/.github/workflows/all_tests_metax.yml
+++ b/.github/workflows/all_tests_metax.yml
@@ -13,15 +13,12 @@ concurrency:
 
 jobs:
   run_tests:
+    # Package manager and environment settings are read from .github/configs/metax.yml
     uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: metax
-      # Metax Environment Setup
-      setup_commands: |
-        export PATH=/opt/conda/bin:$PATH        
-        export LD_LIBRARY_PATH=/usr/local/maca/lib:/opt/maca/lib:$LD_LIBRARY_PATH        
-        which python3
-        python3 -m pip --version
+      run_unit_tests: true
+      run_integration_tests: true
 
   all_tests:
     needs: run_tests
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3e85ac2114..2ef6d1893d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,160 +1,66 @@
-# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
 
 # A workflow to trigger TE build on GitHub
+
 name: 'Build'
 on:
   pull_request:
   workflow_dispatch:
-concurrency:
-  # Group by workflow name + PR number (for PRs) or ref (for branch/tag pushes)
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
 jobs:
-  core:
-    name: 'Core'
-    runs-on: ubuntu-latest
-    container:
-      image: nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04
-      options: --user root
-    steps:
-      - name: 'Dependencies'
-        run: |
-          apt-get update
-          apt-get install -y git python3.9 pip cudnn9-cuda-12
-          pip install cmake==3.21.0 pybind11[global] ninja
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-      - name: ccache
-        uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
-      - name: 'Build'
-        run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v
-        env:
-          NVTE_FRAMEWORK: none
-          MAX_JOBS: 1
-          SCCACHE_GHA_ENABLED: "true"
-      - name: 'Sanity check'
-        run: python3 -c "import transformer_engine"
-        working-directory: /
   pytorch:
     name: 'PyTorch'
-    runs-on: ubuntu-latest
+    runs-on: [ nv-8g-cicd-te ]
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+      ports:
+        - 80:80
+      options: >-
+        --gpus all 
+        --shm-size=500g 
+        --privileged 
+        --ipc=host 
+        --ulimit memlock=-1 
+        --ulimit stack=67108864 
+        --ulimit nofile=65535:65535 
+        --user root
+        --pull never
     steps:
-      - name: Move /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
-
-      - name: Maximize build space
-        uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
-        with:
-          root-reserve-mb: 5120
-          temp-reserve-mb: 32
-          swap-size-mb: 10240
-          remove-dotnet: 'true'
-          remove-android: 'true'
-          remove-haskell: 'true'
-          remove-codeql: 'true'
-          build-mount-path: '/var/lib/docker/'
-
-      - name: Restore /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
+      - name: Configure Git Safe Directory on Cuda
+        run: /usr/bin/git config --global safe.directory '*'
 
       - name: 'Checkout'
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
+          fetch-depth: 0
           submodules: recursive
+          set-safe-directory: true
 
-      - name: Start named container
-        run: |
-          docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity
-
-      - name: 'Dependencies'
+      - name: 'Setup Environment'
         run: |
-          docker exec builder bash -c '\
-            apt-get update && \
-            apt-get install -y git python3.9 pip cudnn9-cuda-12 && \
-            pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \
-            apt-get clean \
-          '
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+          echo "PATH=$PATH" >> $GITHUB_ENV
 
-      - name: 'Build'
-        run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps'
-        env:
-          NVTE_FRAMEWORK: pytorch
-          TE_WITH_NCCL: 1
-      - name: 'Sanity check'
-        run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py'
-  jax:
-    name: 'JAX'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/nvidia/jax:jax
-      options: --user root
-    steps:
-      - name: 'Dependencies'
-        run: pip install cmake==3.21.0 pybind11[global]
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-      - name: ccache
-        uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
       - name: 'Build'
         run: |
-          NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
+          pip uninstall transformer_engine transformer_engine_torch -y || true
+          echo "GITHUB_WORKSPACE=$GITHUB_WORKSPACE"
+          cd $GITHUB_WORKSPACE
+          pip install nvdlfw-inspect
+          pip install expecttest
+          pip install . -v --no-deps --no-build-isolation
         env:
-          NVTE_FRAMEWORK: jax
-          MAX_JOBS: 1
-          SCCACHE_GHA_ENABLED: "true"
-      - name: 'Sanity check'
-        run: python3 tests/jax/test_sanity_import.py
-  all:
-    name: 'All'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Move /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
-
-      - name: Maximize build space
-        uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
-        with:
-          root-reserve-mb: 5120
-          temp-reserve-mb: 32
-          swap-size-mb: 10240
-          remove-dotnet: 'true'
-          remove-android: 'true'
-          remove-haskell: 'true'
-          remove-codeql: 'true'
-          build-mount-path: '/var/lib/docker/'
-
-      - name: Restore /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
-
-      - name: 'Checkout'
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - name: Start named container
-        run: |
-          docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity
+          NVTE_FRAMEWORK: pytorch
+          TE_WITH_NCCL: '1'
+          NVTE_WITH_CUDA: '1'
+          CUDA_HOME: /usr/local/cuda-12.8
+          NVCC: /usr/local/cuda-12.8/bin/nvcc
 
-      - name: 'Dependencies'
-        run: |
-          docker exec builder bash -c '\
-            pip install cmake==3.21.0 pybind11[global] einops onnxscript && \
-            pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
-          '
-      - name: 'Build'
-        run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps'
-        env:
-          NVTE_FRAMEWORK: all
-          MAX_JOBS: 1
       - name: 'Sanity check'
-        run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py'
+        run: 
+          python3 tests/pytorch/test_sanity_import.py
diff --git a/.github/workflows/functional_tests_common.yml b/.github/workflows/functional_tests_common.yml
deleted file mode 100644
index aa6b734778..0000000000
--- a/.github/workflows/functional_tests_common.yml
+++ /dev/null
@@ -1,190 +0,0 @@
-# Disabled for compatibility issues
-name: Common Functional Tests - Training
-
-on:
-  workflow_call:
-    inputs:
-      platform:
-        required: true
-        type: string
-        description: Platform name (e.g., cuda, default)
-      test_matrix:
-        required: true
-        type: string
-        description: JSON array of test configurations
-      image:
-        required: true
-        type: string
-      runs_on:
-        required: true
-        type: string
-      container_volumes:
-        required: true
-        type: string
-      container_options:
-        required: true
-        type: string
-
-jobs:
-  functional_test_train:
-    defaults:
-      run:
-        shell: bash
-    env:
-      PROJECT_ROOT: ${{ github.workspace }}
-    runs-on: ${{ fromJson(inputs.runs_on) }}
-    strategy:
-      fail-fast: false
-      matrix:
-        test_config: ${{ fromJson(inputs.test_matrix) }}
-    container:
-      image: ${{ inputs.image }}
-      ports:
-        - 80
-      volumes: ${{ fromJson(inputs.container_volumes) }}
-      options: ${{ inputs.container_options }}
-
-    steps:
-      - name: Checkout source code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-      
-      # - name: Set safe directory
-      #   run: |
-      #     git config --global --add safe.directory $PROJECT_ROOT
-      ## The above step is commented out because there is no git cli in the container, and it causes the step to fail. The safe directory is set in the next step with a conditional check.
-      - name: Set safe directory
-        run: |
-          command -v git && git config --global --add safe.directory $PROJECT_ROOT || true
-
-      - name: Activate Python environment
-        run: |
-          source /opt/conda/etc/profile.d/conda.sh
-          conda activate base   
-          echo "PATH=$PATH" >> $GITHUB_ENV
-    
-      - name: Setup Python environment
-        env:
-          NVTE_WITH_MACA: '1'
-          NVTE_WITH_CUDA: '0'
-          NVCC: /opt/maca/bin/mcc
-          CUDA_HOME: /opt/maca
-          
-          PATH: /opt/maca/bin:${{ env.PATH }}
-          LD_LIBRARY_PATH: /opt/maca/lib:${{ env.LD_LIBRARY_PATH }}
-        run: |
-          set -euo pipefail
-          cd $PROJECT_ROOT
-          pip install -e . --no-deps --no-build-isolation
-        timeout-minutes: 60
-
-      - name: L0 Pytorch Wheel
-        id: L0_pytoech_wheel
-        # timeout-minutes: 50
-        env:
-          TE_PATH: .
-          RUN_LOG: /logs/pytorch/wheel
-        run: |
-          echo "TE_PATH: ${TE_PATH}"
-          sed -i "s/^cd transformer_engine\/pytorch\s*$/pushd transformer_engine\/pytorch/" qa/L0_pytorch_wheel/test.sh
-          sed -i '44 s/^cd \s*\$TE_PATH\s*$/popd/' qa/L0_pytorch_wheel/test.sh
-
-          cat qa/L0_pytorch_wheel/test.sh
-          # source /opt/miniconda3/etc/profile.d/conda.sh
-          # conda activate flagscale-train
-          pip uninstall -y transformer_engine
-
-          set -euo pipefail
-          cd $PROJECT_ROOT
-
-          PLATFORM='${{ inputs.platform }}'
-          DEVICE='${{ matrix.test_config.device }}'
-          TASK='${{ matrix.test_config.task }}'
-          MODEL='${{ matrix.test_config.model }}'
-          CASE='${{ matrix.test_config.case }}'
-
-          echo "Running functional tests for training"
-          echo "Platform: $PLATFORM"
-          echo "Device: $DEVICE"
-          echo "Task: $TASK"
-          echo "Model: $MODEL"
-          echo "Case: ${CASE:-all}"
-
-          # Set environment variables
-          export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
-          
-          set +e
-          bash qa/L0_pytorch_wheel/test.sh | tee ${RUN_LOG}/pytorch_wheel-${{ github.run_id }}.log
-          exit_code=$?
-          set -e
-
-          if [ $exit_code -eq 0 ]; then
-            echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
-          else
-            echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
-          fi
-
-          echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-          exit $exit_code
-
-      - name: Upload Installation Logs
-        if: always() && steps.L0_pytoech_wheel.outcome == 'failure'
-        uses: actions/upload-artifact@v4
-        with:
-          name: L0-pytorch-logs-${{ github.run_id }}
-          path: /logs/pytorch/wheel
-          retention-days: 7
-          if-no-files-found: warn
-
-      # - name: Run functional tests
-      #   id: functional_test
-      #   run: |
-      #     set -euo pipefail
-      #     cd $PROJECT_ROOT
-
-      #     PLATFORM='${{ inputs.platform }}'
-      #     DEVICE='${{ matrix.test_config.device }}'
-      #     TASK='${{ matrix.test_config.task }}'
-      #     MODEL='${{ matrix.test_config.model }}'
-      #     CASE='${{ matrix.test_config.case }}'
-
-      #     echo "Running functional tests for training"
-      #     echo "Platform: $PLATFORM"
-      #     echo "Device: $DEVICE"
-      #     echo "Task: $TASK"
-      #     echo "Model: $MODEL"
-      #     echo "Case: ${CASE:-all}"
-
-      #     # Set environment variables
-      #     export PYTHONPATH=$PROJECT_ROOT:${PYTHONPATH:-}
-
-      #     # Run functional tests via run_tests.sh with explicit platform/device/task/model/case
-      #     set +e
-      #     bash "$PROJECT_ROOT/tests/test_utils/runners/run_tests.sh" \
-      #       --platform "$PLATFORM" \
-      #       --device "$DEVICE" \
-      #       --type functional \
-      #       --task "$TASK" \
-      #       --model "$MODEL" \
-      #       --list "$CASE"
-      #     exit_code=$?
-      #     set -e
-
-      #     if [ $exit_code -eq 0 ]; then
-      #       echo "✅ Functional tests passed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE"
-      #     else
-      #       echo "❌ Functional tests failed for $PLATFORM/$DEVICE/$TASK/$MODEL/$CASE (exit code: $exit_code)"
-      #     fi
-
-      #     echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-      #     exit $exit_code
-      #   timeout-minutes: 60
-
-      # - name: Debug - keep container alive on failure
-      #   if: failure()
-      #   run: |
-      #     echo "Container sleeping for 60 minutes for debugging..."
-      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
-      #     sleep 3600
-      #   timeout-minutes: 60
\ No newline at end of file
diff --git a/.github/workflows/integration_tests_common.yml b/.github/workflows/integration_tests_common.yml
new file mode 100644
index 0000000000..25f18c866d
--- /dev/null
+++ b/.github/workflows/integration_tests_common.yml
@@ -0,0 +1,134 @@
+name: Common Integration Tests
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+      device:
+        required: true
+        type: string
+      image:
+        required: true
+        type: string
+      runs_on:
+        required: true
+        type: string
+      container_volumes:
+        required: true
+        type: string
+      container_options:
+        required: true
+        type: string
+      # Platform-specific environment setup script path (from platform config)
+      setup_script:
+        required: false
+        type: string
+        default: ''
+      # Platform-specific build environment variables (JSON object from config)
+      build_env:
+        required: false
+        type: string
+        default: '{}'
+
+jobs:
+  integration_test:
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ fromJson(inputs.runs_on) }}
+    strategy:
+      fail-fast: false
+      matrix:
+        test_group:
+          - name: pytorch_mcore_integration
+            path: "qa/L1_pytorch_mcore_integration/test.sh"
+            test_type: "integration"
+    name: integration-${{ inputs.device }}-${{ matrix.test_group.name }}
+    container:
+      image: ${{ inputs.image }}
+      volumes: ${{ fromJson(inputs.container_volumes) }}
+      options: --pull never ${{ inputs.container_options }}
+
+    steps:
+      # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
+      - name: Configure Git Safe Directory on Cuda
+        if: inputs.platform == 'cuda'
+        run: /usr/bin/git config --global safe.directory '*'
+
+      - name: Checkout Source Code on Cuda (attempt 1)
+        id: checkout1
+        if: inputs.platform == 'cuda'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 2)
+        id: checkout2
+        if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        uses: actions/checkout@v4
+        continue-on-error: true
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      - name: Checkout Source Code on Cuda (attempt 3)
+        id: checkout3
+        if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: recursive
+          set-safe-directory: true
+
+      # Metax requires to clean vscode-remote-container
+      - name: Configure Clean Git Env on Metax
+        if: inputs.platform == 'metax'
+        run: |
+          git config --global --unset-all credential.helper 2>/dev/null || true
+          git config --system --unset-all credential.helper 2>/dev/null || true
+
+      # Metax no need submodules
+      - name: Checkout Source Code on Metax
+        if: inputs.platform == 'metax'
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      
+      - name: Environment Setup
+        if: inputs.setup_script != ''
+        run: |
+          bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
+
+      - name: Execute Tests
+        env:
+          TE_PATH: ${{ github.workspace }}
+          TE_FL_PREFER: vendor
+          MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git
+          MCORE_REF: main
+        run: |
+          set -euo pipefail
+
+          # Activate conda environment
+          if ${{inputs.platform == 'metax'}}; then
+            source /opt/conda/etc/profile.d/conda.sh
+            conda activate base
+          else
+            source /opt/miniconda3/etc/profile.d/conda.sh
+            conda activate flagscale-train
+          fi
+          echo "PATH=$PATH" >> $GITHUB_ENV
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
+          # python3 --version
+          # pip list | grep -E "regex|six|torch" || true
+
+          bash ${{ matrix.test_group.path }}
+        timeout-minutes: 30
+        
\ No newline at end of file
diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
index b026f9aa10..f214990581 100644
--- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
+++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml
@@ -2,21 +2,11 @@ name: QA L0 - Core Unit & Lint Tests
 
 on:
   push:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
-      - 'qa/L0_pytorch_lint/**'
-      - 'transformer_engine/**'
-      - 'tests/pytorch/**'
+    branches:
+      - __disabled_do_not_remove__
   pull_request:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml'
-      - 'qa/L0_pytorch_lint/**'
-      - 'transformer_engine/**'
-      - 'tests/pytorch/**'
-
-  workflow_dispatch:
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
index 51f071aa3b..32a13813ff 100644
--- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
+++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml
@@ -2,32 +2,11 @@ name: QA L1 - Comprehensive Integration Tests
 
 on:
   push:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
-      - 'qa/L1_cpp_distributed/**'
-      - 'tests/cpp_distributed/**'
-      - 'qa/L1_pytorch_thunder_integration/**'
-      - 'qa/L1_pytorch_distributed_unittest/**'
-      - 'tests/pytorch/distributed/**'
-      - 'tests/pytorch/attention/**'
-      - 'qa/L1_pytorch_onnx_unittest/**'
-      - 'tests/pytorch/test_onnx_export.py'
-
+    branches:
+      - __disabled_do_not_remove__
   pull_request:
-    branches: main
-    paths:
-      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
-      - 'qa/L1_cpp_distributed/**'
-      - 'tests/cpp_distributed/**'
-      - 'qa/L1_pytorch_thunder_integration/**'
-      - 'qa/L1_pytorch_distributed_unittest/**'
-      - 'tests/pytorch/distributed/**'
-      - 'tests/pytorch/attention/**'
-      - 'qa/L1_pytorch_onnx_unittest/**'
-      - 'tests/pytorch/test_onnx_export.py'
-      
-  workflow_dispatch:
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
@@ -57,8 +36,8 @@ jobs:
       - name: Checkout Code
         uses: actions/checkout@v6.0.1
         with:
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
-          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name || github.repository }}
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || github.ref_name }}
           ssh-strict: true
           ssh-user: git
           persist-credentials: true
@@ -166,3 +145,21 @@ jobs:
           echo "=== Running L1 PyTorch ONNX Unit Tests ==="
           bash ./qa/L1_pytorch_onnx_unittest/test.sh
         # timeout-minutes: 30
+
+            
+      - name: Run L1 PyTorch Megatron-FL MCore Integration Test
+        env:
+          TE_PATH: .
+          TE_FL_PREFER: vendor
+          MCORE_REPO_URL: https://github.com/flagos-ai/Megatron-LM-FL.git
+          MCORE_REF: main
+        run: |
+          # Activate conda environment
+          source /opt/miniconda3/etc/profile.d/conda.sh
+          conda activate flagscale-train
+
+          export TE_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/transformer_engine
+
+          echo "=== Running L1 PyTorch Megatron-FL MCore Integration Test ==="
+          bash ./qa/L1_pytorch_mcore_integration/test.sh
+        timeout-minutes: 30
diff --git a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
index 9a881dd2d9..bb3e0a73fe 100644
--- a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
+++ b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml
@@ -3,16 +3,11 @@ name: QA L3 - Attention Tests
 
 on:
   push:
-    branches: __disable__
-    paths:
-      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
-      - 'tests/pytorch/attention/test_attention.py'
-
+    branches: 
+      - __disabled_do_not_remove__
   pull_request:
-    branches: __disable__
-    paths:
-      - '.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml'
-      - 'tests/pytorch/attention/test_attention.py'
+    branches:
+      - __disabled_do_not_remove__
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/te-plugin-tests.yml b/.github/workflows/te-plugin-tests.yml
index f487673444..9b640fcce8 100644
--- a/.github/workflows/te-plugin-tests.yml
+++ b/.github/workflows/te-plugin-tests.yml
@@ -18,7 +18,7 @@ concurrency:
 
 jobs:
   run-plugin-tests:
-    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
+    runs-on: [ nv-8g-cicd-te ]
     defaults:
       run:
         shell: bash
@@ -35,7 +35,7 @@ jobs:
         --ulimit stack=67108864 
         --ulimit nofile=65535:65535 
         --user root
-        --pull always
+        --pull never
     steps:
       - name: Checkout Code
         uses: actions/checkout@v6.0.1
diff --git a/.github/workflows/unit_tests_common.yml b/.github/workflows/unit_tests_common.yml
index 615f7c9001..10a070d9df 100644
--- a/.github/workflows/unit_tests_common.yml
+++ b/.github/workflows/unit_tests_common.yml
@@ -1,6 +1,5 @@
 name: Common Unit Tests
 
-
 on:
   workflow_call:
     inputs:
@@ -22,12 +21,8 @@ on:
       container_options:
         required: true
         type: string
-      ignored_tests:
-        required: false
-        type: string
-        default: ''
-      # New input for hardware-specific initialization (e.g., conda activate)
-      setup_commands:
+      # Platform-specific environment setup script path (from platform config)
+      setup_script:
         required: false
         type: string
         default: ''
@@ -36,41 +31,9 @@ on:
         required: false
         type: string
         default: '{}'
-      # Whether to upload coverage report
-      upload_coverage:
-        description: "Whether to upload coverage report"
-        required: false
-        type: boolean
-        default: true
 
 jobs:
-  # 1. Change Detection
-  detect_changes:
-    runs-on: ubuntu-latest
-    outputs:
-      core: ${{ steps.filter.outputs.core }}
-      qa_l0: ${{ steps.filter.outputs.qa_l0 }}
-    steps:
-      - name: Checkout source code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Detect changed paths
-        id: filter
-        run: |
-          set -euo pipefail
-          BASE_REF="${{ github.event_name == 'pull_request' && format('origin/{0}', github.base_ref) || 'HEAD~1' }}"
-          [ "${{ github.event_name }}" == "pull_request" ] && git fetch origin ${{ github.base_ref }} --depth=1
-          
-          CHANGED_FILES=$(git diff --name-only $BASE_REF...HEAD 2>/dev/null || git diff --name-only $BASE_REF HEAD)
-
-          echo "core=$(echo "$CHANGED_FILES" | grep -qE "^tests/unit_tests/|^megatron/core/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
-          echo "qa_l0=$(echo "$CHANGED_FILES" | grep -qE "^qa/L0_|^transformer_engine/|^tests/pytorch/|^.github/" && echo "true" || echo "false")" >> $GITHUB_OUTPUT
-  
-  # 2. Unified Test Execution
   unit_test:
-    needs: detect_changes
     defaults:
       run:
         shell: bash
@@ -79,16 +42,15 @@ jobs:
       fail-fast: false
       matrix:
         test_group:
-          - name: pytorch_lint
-            path: "qa/L0_pytorch_lint/test.sh"
-            test_type: "lint"
           - name: pytorch_debug
             path: "qa/L0_pytorch_debug_unittest/test.sh"
             test_type: "debug"
           - name: pytorch_unittest
             path: "qa/L0_pytorch_unittest/test.sh"
             test_type: "unittest"
-
+          - name: pytorch_distributed_unittest
+            path: "qa/L1_pytorch_distributed_unittest/test.sh"
+            test_type: "unittest"
     name: unit-${{ inputs.device }}-${{ matrix.test_group.name }}
     container:
       image: ${{ inputs.image }}
@@ -96,33 +58,14 @@ jobs:
       options: --pull never ${{ inputs.container_options }}
 
     steps:
-      - name: Check if tests should run
-        id: should_run
-        run: |
-          echo "should_run=true" >> $GITHUB_OUTPUT
-          GROUP='${{ matrix.test_group.name }}'
-          # Force run if 'full ci' label exists
-          if [ "${{ contains(github.event.pull_request.labels.*.name, 'full ci') }}" == "true" ]; then 
-            echo "should_run=true" >> $GITHUB_OUTPUT; exit 0
-          fi
-
-          if [[ "$GROUP" == "pytorch_"* ]]; then
-            CHANGED='${{ needs.detect_changes.outputs.qa_l0 }}'
-          else
-            CHANGED='${{ needs.detect_changes.outputs.core }}'
-          fi
-          
-          # For debugging, you can force this to true
-          echo "should_run=true" >> $GITHUB_OUTPUT
-
       # Cuda requires git safe.directory configuration and 3 checkout attempts to handle submodule-heavy repos
       - name: Configure Git Safe Directory on Cuda
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        if: inputs.platform == 'cuda'
         run: /usr/bin/git config --global safe.directory '*'
 
       - name: Checkout Source Code on Cuda (attempt 1)
         id: checkout1
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+        if: inputs.platform == 'cuda'
         uses: actions/checkout@v4
         continue-on-error: true
         with:
@@ -132,7 +75,7 @@ jobs:
 
       - name: Checkout Source Code on Cuda (attempt 2)
         id: checkout2
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
+        if: inputs.platform == 'cuda' && steps.checkout1.outcome == 'failure'
         uses: actions/checkout@v4
         continue-on-error: true
         with:
@@ -142,116 +85,33 @@ jobs:
 
       - name: Checkout Source Code on Cuda (attempt 3)
         id: checkout3
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
+        if: inputs.platform == 'cuda' && steps.checkout2.outcome == 'failure'
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
           submodules: recursive
           set-safe-directory: true
 
+      # Metax requires to clean vscode-remote-container
+      - name: Configure Clean Git Env on Metax
+        if: inputs.platform == 'metax'
+        run: |
+          git config --global --unset-all credential.helper 2>/dev/null || true
+          git config --system --unset-all credential.helper 2>/dev/null || true
+
       # Metax no need submodules
       - name: Checkout Source Code on Metax
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
+        if: inputs.platform == 'metax'
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
       
-      - name: Environment Setup on Cuda
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'cuda'
+      - name: Environment Setup
+        if: inputs.setup_script != ''
         run: |
-          set -euo pipefail
-
-          echo "===== Step 0: Activate Python environment ====="
-          source /opt/miniconda3/etc/profile.d/conda.sh
-          conda activate flagscale-train
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "Python: $(which python3) ($(python3 --version 2>&1))"
-
-          echo "===== Step 1: Remove Existing TransformerEngine ====="
-          pip uninstall transformer_engine transformer_engine_torch -y || true
-
-          echo "===== Step 2: Build & Install TransformerEngine ====="
-          cd $GITHUB_WORKSPACE
-
-          pip install nvdlfw-inspect --quiet
-          pip install expecttest --quiet
-          pip install . -v --no-deps --no-build-isolation
-
-          echo "===== Step 3: Verify Installation ====="
-          python3 tests/pytorch/test_sanity_import.py
-
-          echo "===== Environment Setup Complete ===== "
-
-      - name: Environment Setup on Metax
-        if: steps.should_run.outputs.should_run == 'true' && inputs.platform == 'metax'
-        run: |
-          set -euo pipefail
-
-          echo "===== Step 0: Activate Python environment ====="
-          source /opt/conda/etc/profile.d/conda.sh
-          conda activate base
-          echo "PATH=$PATH" >> $GITHUB_ENV
-          echo "Python: $(which python3) ($(python3 --version 2>&1))"
-
-          echo "===== Step 1: Base Environment Setup ====="
-          # Configure MACA toolchain paths
-          export PATH=/opt/maca/bin:$PATH
-          export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
-          service ssh restart
-
-          echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
-          # TransformerEngine expects nvcc, but MACA provides cucc
-          ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
-          which nvcc || true
-
-          echo "===== Step 3: Install Required System Tools ====="
-          # Install essential build tools (avoid modifying Python dependencies)
-          apt-get update -qq && apt-get install -y -qq git cmake ninja-build curl
-
-          echo "===== Step 4: Remove Existing TransformerEngine ====="
-          # Prevent conflicts with preinstalled or incompatible versions
-          python3 -m pip uninstall transformer_engine -y || true
-          python3 -m pip install nvdlfw-inspect --quiet
-          python3 -m pip install expecttest --quiet
-
-          # echo "===== Step 5: Install Metax Binary Backend ====="
-          # # Install prebuilt Metax backend (required for MACA operators)
-          # WHL_PATH="/home/muxiuser/transformer_engine_metax-2.9.0-cp312-cp312-linux_x86_64.whl"
-          # if [ ! -f "$WHL_PATH" ]; then
-          #   echo "ERROR: Wheel file not found at $WHL_PATH"
-          #   echo "Please verify volume mount: -v /home/muxiuser:/home/muxiuser"
-          #   exit 1
-          # fi
-
-          # # Use --no-deps to avoid overwriting Metax-optimized PyTorch
-          # python3 -m pip install "$WHL_PATH" --no-deps --force-reinstall
-
-          # echo "===== Step 6: Verify Metax Backend ====="
-          # # Ensure transformer_engine_torch is correctly loaded
-          # python3 - <<'EOF'
-          # import transformer_engine_torch as te
-          # print("Backend loaded successfully:", te)
-          # EOF
-
-          echo "===== Step 7: Install TE-FL Plugin Layer ====="
-          # Install TransformerEngine-FL Python layer (plugin logic)
-          # cd /workspace/TransformerEngine-FL
-          cd $GITHUB_WORKSPACE
-          TE_FL_SKIP_CUDA=1 python3 setup.py install
-
-          echo "===== Step 8: Final Verification ====="
-          # Verify both TE Python API and backend are functional
-          python3 - <<'EOF'
-          import transformer_engine
-          import transformer_engine_torch as te
-          print("transformer_engine:", transformer_engine)
-          print("transformer_engine_torch:", te)
-          EOF
-
-          echo "===== Environment Setup Complete ===== "
+          bash $GITHUB_WORKSPACE/${{ inputs.setup_script }}
       
       - name: Execute Tests
-        if: steps.should_run.outputs.should_run == 'true'
         working-directory: ${{ github.workspace }}
         run: |
           set -euo pipefail
@@ -265,6 +125,16 @@ jobs:
           for k, v in env.items():
               print(f'{k}={v}')
           ")
+          
+          # Activate conda environment
+          if ${{inputs.platform == 'metax'}}; then
+            source /opt/conda/etc/profile.d/conda.sh
+            conda activate base
+          else
+            source /opt/miniconda3/etc/profile.d/conda.sh
+            conda activate flagscale-train
+          fi
+          echo "PATH=$PATH" >> $GITHUB_ENV
 
           export TE_PATH=$GITHUB_WORKSPACE
           export TE_LIB_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])")
@@ -284,19 +154,14 @@ jobs:
 
           # Coverage setup: install once + configure collection via PYTEST_ADDOPTS
           COVERAGE_ENABLED=false
-          if [ "${{ inputs.upload_coverage }}" = "true" ] && [ "${{ matrix.test_group.test_type }}" = "unittest" ]; then
-            if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
-              export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
-              COVERAGE_ENABLED=true
-            else
-              echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
-            fi
+          if pip3 install coverage pytest-cov --quiet 2>/dev/null; then
+            export PYTEST_ADDOPTS="--cov=transformer_engine --cov-append --cov-report="
+            COVERAGE_ENABLED=true
+          else
+            echo "WARNING: Failed to install coverage/pytest-cov, coverage collection disabled"
           fi
 
-          if [[ "${{ matrix.test_group.name }}" == *"lint"* ]]; then
-            export CPP_ONLY=0
-            export PYTHON_ONLY=0
-          elif [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
+          if [[ "${{ matrix.test_group.name }}" != *"debug"* ]]; then
             # Fail fast on backend/API mismatch before running the full test group.
             # Skip for debug group (does not use FP8/optimizer symbols).
             python3 -c "import sys, importlib; import transformer_engine.common as _te_common; tex = importlib.import_module('transformer_engine_torch'); required=['multi_tensor_scale','multi_tensor_compute_scale_and_scale_inv']; missing=[n for n in required if not hasattr(tex, n)]; print('[TE check] module:', tex); print('[TE check] file:', getattr(tex, '__file__', 'N/A')); print('[TE check] missing:', ', '.join(missing) if missing else 'none'); sys.exit(1 if missing else 0)"
@@ -313,12 +178,10 @@ jobs:
               --include="transformer_engine/*" 2>/dev/null \
               || echo "WARNING: No coverage data found"
           fi
-
           exit $exit_code
         timeout-minutes: 60
 
       - name: Upload Coverage Report
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: actions/upload-artifact@v4
         continue-on-error: true
         with:
@@ -327,7 +190,6 @@ jobs:
             coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json
 
       - name: Upload Coverage Report to FlagCICD
-        if: inputs.upload_coverage && matrix.test_group.test_type == 'unittest'
         uses: flagos-ai/FlagOps/actions/post-pytest-report@v2
         continue-on-error: true
         env:
@@ -336,12 +198,4 @@ jobs:
           backend_url: 'http://flagcicd-inner.flagos.net:8000/metrics/'
           user_id: '000000000000000000'
           report_path: 'coverage-${{ inputs.platform }}-${{ inputs.device }}-${{ matrix.test_group.name }}.json'
-          fail_on_error: 'false'
-
-      # - name: Debug - keep container alive on failure
-      #   if: failure()
-      #   run: |
-      #     echo "Container sleeping for 200 minutes for debugging..."
-      #     echo "On host, run: docker ps  then  docker exec -it <container_id> bash"
-      #     sleep 60000
-      #   timeout-minutes: 200
\ No newline at end of file
+          fail_on_error: 'false'
\ No newline at end of file
diff --git a/qa/L0_pytorch_debug_unittest/README.rst b/qa/L0_pytorch_debug_unittest/README.rst
new file mode 100644
index 0000000000..2ba6e9fb0c
--- /dev/null
+++ b/qa/L0_pytorch_debug_unittest/README.rst
@@ -0,0 +1,26 @@
+L0 PyTorch Debug Unittest
+=========================
+
+This directory contains the L0 PyTorch debug unittest runner.
+
+MetaX ignore rules
+------------------
+
+MetaX-specific ignored tests are maintained in one place in ``test.sh`` through
+the ``METAX_IGNORED_TESTS`` list.
+
+The main execution flow only calls a helper to decide whether a test should be
+skipped, instead of embedding platform-specific matching rules directly in the
+main logic.
+
+This keeps the script easier to maintain and makes it simpler to add new
+ignored cases later if needed.
+
+How to extend
+-------------
+
+If a new test needs to be skipped on MetaX:
+
+1. Add the full test path to ``METAX_IGNORED_TESTS`` in ``test.sh``.
+2. Avoid adding new platform-specific matching logic directly into the main
+   execution flow.
\ No newline at end of file
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
index 30d956cc29..2ab7340986 100644
--- a/qa/L0_pytorch_debug_unittest/test.sh
+++ b/qa/L0_pytorch_debug_unittest/test.sh
@@ -1,24 +1,13 @@
-# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
 
-function error_exit() {
-    echo "Error: $1"
-    exit 1
-}
-
-function test_fail() {
-    RET=1
-    FAILED_CASES="$FAILED_CASES $1"
-    echo "Error: sub-test failed: $1"
-}
 
-RET=0
-FAILED_CASES=""
 
 : ${TE_PATH:=/opt/transformerengine}
 : ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
 : ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/}
+
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
@@ -30,28 +19,42 @@ FAIL=0
 
 # It is not installed as a requirement,
 # because it is not available on PyPI.
-pip install pytest==8.2.1 || error_exit "Failed to install pytest"
+pip install pytest==8.2.1
+
+METAX_IGNORED_TESTS=(
+    "$TE_PATH/tests/pytorch/test_numerics.py"
+    "$TE_PATH/tests/pytorch/test_sanity.py"
+)
+
+should_skip_on_metax() {
+    local test_path=$1
+
+    [ "$PLATFORM" = "metax" ] || return 1
+
+    local ignored_test
+    for ignored_test in "${METAX_IGNORED_TESTS[@]}"; do
+        if [ "$test_path" = "$ignored_test" ]; then
+            echo "[SKIP] Platform MetaX: Ignoring $test_path"
+            return 0
+        fi
+    done
+
+    return 1
+}
+
 
 run_test_step() {
     local xml_file=$1
     local test_path=$2
     local cmd=$3
 
-    if [ "$PLATFORM" = "metax" ]; then
-        case "$test_path" in
-            *"test_numerics.py" | *"test_api_features.py" | *"test_sanity.py")
-                echo "-------------------------------------------------------"
-                echo "[SKIP] Platform MetaX: Ignoring $test_path"
-                echo "-------------------------------------------------------"
-                return 0
-                ;;
-        esac
+    if should_skip_on_metax "$test_path"; then
+        return 0
     fi
 
-
     echo "-------------------------------------------------------"
     echo "[RUN] Executing: $test_path"
-    eval "$cmd" || test_fail "$test_path"
+    eval "$cmd" || FAIL=1
 }
 
 
@@ -81,8 +84,6 @@ run_test_step "test_perf.xml" "$TE_PATH/tests/pytorch/debug/test_perf.py" \
 "pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR"
 
 
-
-# standard sanity and numerics tests with initialized debug
 # Step 7: Sanity 2
 run_test_step "test_sanity_2.xml" "$TE_PATH/tests/pytorch/test_sanity.py" \
 "NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 \
@@ -93,9 +94,4 @@ run_test_step "test_numerics_2.xml" "$TE_PATH/tests/pytorch/test_numerics.py" \
 "NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 \
 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py -k \"not (test_linear_accuracy or test_layernorm_linear_accuracy or test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_transformer_layer_hidden_states_format or test_grouped_gemm)\" --no-header"
 
-if [ "$RET" -ne 0 ]; then
-    echo "Error in the following test cases:$FAILED_CASES"
-    exit 1
-fi
-echo "All tests passed"
-exit 0
+exit $FAIL
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index caf10341de..bc4362e23d 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -1,11 +1,5 @@
-# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# See LICENSE for license information.
+#!/bin/bash
 
-function error_exit() {
-    echo "Error: $1"
-    exit 1
-}
 
 : ${TE_PATH:=/opt/transformerengine}
 : ${XML_LOG_DIR:=/logs}
@@ -14,48 +8,123 @@ mkdir -p "$XML_LOG_DIR"
 pip install pytest==8.2.1
 FAIL=0
 
+IS_CUDA_BACKEND=$(python3 -c "import torch; print('cuda' if torch.cuda.is_available() else 'cpu')" 2>/dev/null)
+
 test_fail() {
     FAIL=1
     echo "Error: sub-test failed: $1"
 }
 
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py || test_fail "test_jit.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4 || test_fail "test_nvfp4"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_mxfp8.xml $TE_PATH/tests/pytorch/mxfp8 || test_fail "test_mxfp8"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_quantized_tensor.xml $TE_PATH/tests/pytorch/test_quantized_tensor.py || test_fail "test_quantized_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_grouped_tensor.xml $TE_PATH/tests/pytorch/test_grouped_tensor.py || test_fail "test_grouped_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
-NVTE_CUTEDSL_FUSED_GROUPED_MLP=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py"
-NVTE_FLASH_ATTN=0 NVTE_CPU_OFFLOAD_V1=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading_v1.xml $TE_PATH/tests/pytorch/test_cpu_offloading_v1.py || test_fail "test_cpu_offloading_v1.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py"
-NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention_deterministic.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/attention/test_kv_cache.py || test_fail "test_kv_cache.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py"
-export NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint
-if [ ! -d "$NVTE_TEST_CHECKPOINT_ARTIFACT_PATH" ]; then
-    python3 $TE_PATH/tests/pytorch/test_checkpoint.py --save-checkpoint all || error_exit "Failed to generate checkpoint files"
-fi
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py || test_fail "test_checkpoint.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_router.xml $TE_PATH/tests/pytorch/test_fused_router.py || test_fail "test_fused_router.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_partial_cast.xml $TE_PATH/tests/pytorch/test_partial_cast.py || test_fail "test_partial_cast.py"
+
+run_test_step() {
+    local xml_file=$1
+    local test_path=$2
+    local cmd=$3
+    local label=$4
+
+    if [ "$PLATFORM" = "metax" ]; then
+        case "$test_path" in
+            *"test_numerics.py" | \
+            *"test_sanity.py" | \
+            *"test_parallel_cross_entropy.py" | \
+            *"test_fused_rope.py" | \
+            *"test_gqa.py" | \
+            *"test_fused_optimizer.py" | \
+            *"test_multi_tensor.py" | \
+            *"test_cpu_offloading.py" | \
+            *"test_attention.py" | \
+            *"test_kv_cache.py" | \
+            *"test_checkpoint.py" | \
+            *"test_fused_router.py")
+                echo "-------------------------------------------------------"
+                echo "[SKIP] Platform MetaX: Ignoring $label"
+                echo "-------------------------------------------------------"
+                return 0
+                ;;
+        esac
+    fi
+
+    if [[ "$IS_CUDA_BACKEND" == *"cuda"* ]]; then
+        if [[ "$test_path" == *"test_checkpoint.py" || "$test_path" == *"test_cpu_offloading.py" || "$test_path" == *"test_attention.py" ]]; then
+            echo "-------------------------------------------------------"
+            echo "[SKIP] CUDA Backend detected: Ignoring $label"
+            echo "-------------------------------------------------------"
+            return 0
+        fi
+    fi
+
+
+    echo "-------------------------------------------------------"
+    echo "[RUN] Executing: $label"
+
+    eval "$cmd" || test_fail "$label"
+}
+
+
+# Step: Sanity
+run_test_step "pytest_test_sanity.xml" "$TE_PATH/tests/pytorch/test_sanity.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py -k \"not (test_sanity_layernorm_mlp or test_sanity_gpt or test_sanity_bert or test_sanity_T5 or test_sanity_amp_and_nvfuser or test_sanity_drop_path or test_sanity_fused_qkv_params or test_sanity_gradient_accumulation_fusion or test_inference_mode or test_sanity_normalization_amp or test_sanity_layernorm_linear or test_sanity_linear_with_zero_tokens or test_sanity_grouped_linear)\" --no-header" "test_sanity.py"
+
+# Step: Recipe
+run_test_step "pytest_test_recipe.xml" "$TE_PATH/tests/pytorch/test_recipe.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py" "test_recipe.py"
+
+# Step: Deferred Init
+run_test_step "pytest_test_deferred_init.xml" "$TE_PATH/tests/pytorch/test_deferred_init.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py" "test_deferred_init.py"
+
+# Step: Numerics
+run_test_step "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/test_numerics.py" \
+"PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py -k \"not (test_layernorm_mlp_accuracy or test_grouped_linear_accuracy or test_gpt_cuda_graph or test_transformer_layer_hidden_states_format or test_grouped_gemm or test_noncontiguous or test_gpt_checkpointing or test_gpt_accuracy or test_mha_accuracy or test_linear_accuracy or test_linear_accuracy_delay_wgrad_compute or test_rmsnorm_accuracy or test_layernorm_accuracy or test_layernorm_linear_accuracy)\" --no-header" "test_numerics.py"
+
+# Step: CUDA Graphs
+run_test_step "pytest_test_cuda_graphs.xml" "$TE_PATH/tests/pytorch/test_cuda_graphs.py" \
+"PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py" "test_cuda_graphs.py"
+
+# Step: JIT
+run_test_step "pytest_test_jit.xml" "$TE_PATH/tests/pytorch/test_jit.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py -k \"not (test_torch_dynamo)\"" "test_jit.py"
+
+# Step: Fused Rope
+run_test_step "pytest_test_fused_rope.xml" "$TE_PATH/tests/pytorch/test_fused_rope.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py" "test_fused_rope.py"
+
+# Step: NVFP4 (Directory)
+run_test_step "pytest_test_nvfp4.xml" "$TE_PATH/tests/pytorch/nvfp4" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4" "test_nvfp4"
+
+# Step: Float8 Tensors
+run_test_step "pytest_test_float8tensor.xml" "$TE_PATH/tests/pytorch/test_float8tensor.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8tensor.xml $TE_PATH/tests/pytorch/test_float8tensor.py" "test_float8tensor.py"
+
+# Step: GQA
+run_test_step "pytest_test_gqa.xml" "$TE_PATH/tests/pytorch/test_gqa.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py" "test_gqa.py"
+
+# Step: Fused Optimizer
+run_test_step "pytest_test_fused_optimizer.xml" "$TE_PATH/tests/pytorch/test_fused_optimizer.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py" "test_fused_optimizer.py"
+
+# Step: Parallel Cross Entropy
+run_test_step "pytest_test_parallel_cross_entropy.xml" "$TE_PATH/tests/pytorch/test_parallel_cross_entropy.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py" "test_parallel_cross_entropy.py"
+
+# Step: CPU Offloading
+run_test_step "pytest_test_cpu_offloading.xml" "$TE_PATH/tests/pytorch/test_cpu_offloading.py" \
+"NVTE_FLASH_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py" "test_cpu_offloading.py"
+
+# Step: Attention
+run_test_step "pytest_test_attention.xml" "$TE_PATH/tests/pytorch/attention/test_attention.py" \
+"python3 -m pytest -s -v --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py" "test_attention.py"
+
+# Step: Checkpoint
+run_test_step "pytest_test_checkpoint.xml" "$TE_PATH/tests/pytorch/test_checkpoint.py" \
+"NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py" "test_checkpoint.py"
+
 
 if [ "$FAIL" -ne 0 ]; then
     echo "Some tests failed."
     exit 1
 fi
-echo "All tests passed."
+echo "All assigned tests passed (some might have been skipped)."
 exit 0
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index 5bb7b1da23..46b54ed30d 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
 
@@ -15,23 +15,134 @@ function test_fail() {
 
 RET=0
 FAILED_CASES=""
+DEBUG_TESTS_READY=0
 
 : ${TE_PATH:=/opt/transformerengine}
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
+# The current CUDA 12.8 test container hits a fused-attention runtime loader
+# issue, so keep the distributed numerics suite on the unfused attention path.
+export NVTE_FLASH_ATTN="${NVTE_FLASH_ATTN:-0}"
+export NVTE_FUSED_ATTN="${NVTE_FUSED_ATTN:-0}"
+export NVTE_UNFUSED_ATTN="${NVTE_UNFUSED_ATTN:-1}"
+
+# Make CUDA runtime libraries discoverable for fused attention kernels.
+if [ -z "${CUDA_HOME:-}" ]; then
+    if [ -d /usr/local/cuda ]; then
+        export CUDA_HOME=/usr/local/cuda
+    elif [ -d /usr/local/cuda-12.8 ]; then
+        export CUDA_HOME=/usr/local/cuda-12.8
+    fi
+fi
+export CUDA_PATH="${CUDA_PATH:-${CUDA_HOME:-}}"
+
+CUDA_LIB_DIRS=()
+for path in \
+    "${CUDA_HOME:-}/lib64" \
+    "${CUDA_HOME:-}/targets/x86_64-linux/lib" \
+    "$(python3 - <<'PY'
+import site
+from pathlib import Path
+
+for root in site.getsitepackages():
+    candidate = Path(root) / "torch" / "lib"
+    if candidate.exists():
+        print(candidate)
+        break
+PY
+)" \
+    "$(python3 - <<'PY'
+import site
+from pathlib import Path
+
+for root in site.getsitepackages():
+    candidate = Path(root) / "nvidia" / "cuda_runtime" / "lib"
+    if candidate.exists():
+        print(candidate)
+        break
+PY
+)"; do
+    if [ -n "$path" ] && [ -d "$path" ]; then
+        CUDA_LIB_DIRS+=("$path")
+    fi
+done
+
+if [ "${#CUDA_LIB_DIRS[@]}" -gt 0 ]; then
+    CUDA_LIB_PATH="$(IFS=:; echo "${CUDA_LIB_DIRS[*]}")"
+    export LD_LIBRARY_PATH="${CUDA_LIB_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+fi
+
+python3 - <<'PY'
+import ctypes
+
+for name in ("libcudart.so", "libcudart.so.12"):
+    try:
+        ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL)
+        print(f"[CUDA] Preloaded {name}")
+        break
+    except OSError as exc:
+        print(f"[CUDA] Failed to preload {name}: {exc}")
+PY
+
+
+# It is not installed as a requirement,
+# because it is not available on PyPI.
+pip uninstall -y nvdlfw-inspect
+if pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git && \
+   python3 -c "import nvdlfw_inspect.api" >/dev/null 2>&1; then
+    DEBUG_TESTS_READY=1
+else
+    echo "Warning: nvdlfw_inspect is unavailable; debug numerics test will be skipped"
+fi
+
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
+run_test_step() {
+    local xml_file=$1
+    local test_path=$2
+    local cmd=$3
+    local label=$4
+
+    if [ "$PLATFORM" = "metax" ]; then
+        case "$test_path" in
+            *"test_numerics.py" | \
+            *"test_numerics_exact.py" | \
+            *"test_torch_fsdp2.py" | \
+            *"test_cast_master_weights_to_fp8.py")
+                echo "-------------------------------------------------------"
+                echo "[SKIP] Platform MetaX: Ignoring $label"
+                echo "-------------------------------------------------------"
+                return 0
+                ;;
+        esac
+    fi
+
+    echo "-------------------------------------------------------"
+    echo "[RUN] Executing: $label"
+    eval "$cmd" || test_fail "$label"
+}
+
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py || test_fail "test_numerics_exact.py"
+run_test_step "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+"test_numerics.py"
+run_test_step "pytest_test_numerics_exact.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py" \
+"test_numerics_exact.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k "not (test_distributed)" || test_fail "test_torch_fsdp2.py"
+run_test_step "pytest_test_torch_fsdp2.xml" "$TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py -k 'not (test_distributed)'" \
+"test_torch_fsdp2.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
 # python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py || test_fail "test_cp_utils.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
+run_test_step "pytest_test_cp_utils.xml" "$TE_PATH/tests/pytorch/attention/test_cp_utils.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py" \
+"test_cp_utils.py"
+run_test_step "pytest_test_cast_master_weights_to_fp8.xml" "$TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \
+"python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py" \
+"test_cast_master_weights_to_fp8.py"
 
 
 # debug tests
@@ -44,7 +155,13 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_
 
 # pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
 # standard numerics tests with initialized debug
-NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
+if [ "$DEBUG_TESTS_READY" -eq 1 ]; then
+    run_test_step "pytest_test_numerics_2.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+    "NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+    "test_numerics.py (debug)"
+else
+    echo "Skipping debug test_numerics.py because nvdlfw_inspect is unavailable"
+fi
 
 if [ "$RET" -ne 0 ]; then
     echo "Error in the following test cases:$FAILED_CASES"
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
index 06beba8864..7405cdbb47 100644
--- a/qa/L1_pytorch_mcore_integration/test.sh
+++ b/qa/L1_pytorch_mcore_integration/test.sh
@@ -4,69 +4,149 @@
 
 set -e
 
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+
+retry_command() {
+    local attempts=$1
+    local delay_seconds=$2
+    shift 2
+
+    local attempt
+    for attempt in $(seq 1 "${attempts}"); do
+        if "$@"; then
+            return 0
+        fi
+        if [ "${attempt}" -lt "${attempts}" ]; then
+            echo "Command failed (attempt ${attempt}/${attempts}): $*"
+            echo "Retrying in ${delay_seconds}s..."
+            sleep "${delay_seconds}"
+        fi
+    done
+
+    echo "Command failed after ${attempts} attempts: $*"
+    return 1
+}
+
 # Paths
-: ${TE_PATH:=/opt/transformerengine}
-: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+: "${TE_PATH:=$(cd -- "${SCRIPT_DIR}/../.." && pwd)}"
+: "${MCORE_PATH:=/workspace/Megatron-LM-FL}"
+: "${MCORE_REPO_URL:=https://github.com/flagos-ai/Megatron-LM-FL.git}"
+: "${MCORE_REF:=main}"
+: "${OUTPUT_DIR:=${TE_PATH}/qa/L1_pytorch_mcore_integration/output}"
+: "${DATA_CACHE_PATH:=/tmp/data_cache}"
 
 # Check whether FP8 is supported
-DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
-if [[ ${DEVICE_ARCH} -ge 89 ]]; then
-    WITH_FP8=1
+WITH_FP8=
+if command -v nvidia-smi &>/dev/null; then
+    DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+    if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+        WITH_FP8=1
+    fi
+elif command -v mx-smi &>/dev/null; then
+    # Metax hardware does not support FP8; leave WITH_FP8 unset
+    :
 fi
 
-# Download Megatron-LM if needed
+# Download or sync Megatron-LM-FL to the requested repo/ref.
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
-    git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    git config --global --unset-all credential.helper 2>/dev/null || true
+    git config --system --unset-all credential.helper 2>/dev/null || true
+    retry_command 3 5 git clone --depth 1 -b "${MCORE_REF}" "${MCORE_REPO_URL}" $(basename ${MCORE_PATH})
     popd
 fi
 
-# Create mock vocab
-VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
-printf "" > ${VOCAB_FILE}
-printf "{" >> ${VOCAB_FILE}
-printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
-seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
-printf "}" >> ${VOCAB_FILE}
+if [ -d "${MCORE_PATH}/.git" ]; then
+    git -C "${MCORE_PATH}" remote set-url origin "${MCORE_REPO_URL}"
+    retry_command 3 5 git -C "${MCORE_PATH}" fetch --depth 1 origin "${MCORE_REF}"
+    git -C "${MCORE_PATH}" checkout -B "${MCORE_REF}" "FETCH_HEAD"
+fi
+
+# Megatron-LM-FL tokenizer imports happen at module import time, so direct
+# source execution needs these Python deps available before pretrain_gpt.py
+# starts.
+python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six regex
+import regex
+import six
+print(f"six available: {six.__version__}")
+print(f"regex available: {regex.__version__}")
+PY
+
+CHECKPOINT_DIR=${OUTPUT_DIR}/checkpoints
+TENSORBOARD_DIR=${OUTPUT_DIR}/tensorboard
+mkdir -p "${CHECKPOINT_DIR}" "${TENSORBOARD_DIR}" "${DATA_CACHE_PATH}" /tmp/checkpoints
+
+echo "Using Megatron-LM-FL repo: ${MCORE_REPO_URL}"
+echo "Using Megatron-LM-FL ref: ${MCORE_REF}"
+git -C "${MCORE_PATH}" rev-parse --short HEAD
 
-# Megatron-LM invocation
+# Megatron-LM-FL invocation. Keep the argument shape aligned with the
+# previously validated tp1/pp1 mock-data GPT functional case while letting CI
+# exit after a few steps.
 COMMAND="
 NVTE_TORCH_COMPILE=0
 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
-NVTE_FLASH_ATTN=1
-NVTE_FWD_LAYERNORM_SM_MARGIN=0
-NVTE_BWD_LAYERNORM_SM_MARGIN=0
 CUDA_DEVICE_MAX_CONNECTIONS=1
-NVTE_BIAS_GELU_NVFUSION=0
-NVTE_BIAS_DROPOUT_FUSION=0
+NCCL_ALGO=Ring
+CUBLAS_WORKSPACE_CONFIG=:4096:8
 
-python3
--m torch.distributed.launch
---use_env
+torchrun
 --nnodes=1
 --nproc_per_node=1
 
 ${MCORE_PATH}/pretrain_gpt.py
 --tensor-model-parallel-size 1
 --pipeline-model-parallel-size 1
---use-cpu-initialization
---num-layers 2
---hidden-size 128
+--num-layers 12
+--hidden-size 512
 --num-attention-heads 8
---seq-length 128
---max-position-embeddings 128
---micro-batch-size 1
---global-batch-size 8
---train-iters 10
+--log-params-norm
+--log-num-zeros-in-grad
+--log-validation-ppl-to-tensorboard
+--log-timers-to-tensorboard
+--seq-length 1024
+--max-position-embeddings 1024
+--micro-batch-size 4
+--global-batch-size 32
+--train-iters 50
 --eval-iters 10
---lr 1e-4
+--timing-log-level 0
+--lr-decay-iters 320000
+--save ${CHECKPOINT_DIR}
+--split 949,50,1
+--tokenizer-type NullTokenizer
+--vocab-size 8192
 --mock-data
---vocab-file ${VOCAB_FILE}
---merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--distributed-backend nccl
+--lr 0.00015
+--lr-decay-style cosine
+--min-lr 1.0e-5
+--weight-decay 1e-2
+--clip-grad 1.0
+--lr-warmup-fraction .01
+--log-interval 1
+--save-interval 10000
+--eval-interval 1000
 --transformer-impl transformer_engine
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--deterministic-mode
+--no-gradient-accumulation-fusion
+--attention-softmax-in-fp32
+--use-mcore-models
+--ckpt-format torch_dist
+--dist-ckpt-optim-fully-reshardable
+--dist-ckpt-strictness log_all
+--data-cache-path ${DATA_CACHE_PATH}
+--bf16
+--attention-backend unfused
+--log-memory-to-tensorboard
+--tensorboard-dir ${TENSORBOARD_DIR}
+--exit-interval 4
 ${WITH_FP8:+--fp8-format hybrid}
 "
 COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
 
-# Launch Megatron-LM
+# Launch Megatron-LM-FL
 bash -c "${COMMAND}"
diff --git a/qa/L1_pytorch_mcore_integration/test_bak.sh b/qa/L1_pytorch_mcore_integration/test_bak.sh
new file mode 100644
index 0000000000..ec0b47b695
--- /dev/null
+++ b/qa/L1_pytorch_mcore_integration/test_bak.sh
@@ -0,0 +1,79 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+# Paths
+: ${TE_PATH:=/opt/transformerengine}
+: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+
+# Check whether FP8 is supported
+DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
+if [[ ${DEVICE_ARCH} -ge 89 ]]; then
+    WITH_FP8=1
+fi
+
+# Download Megatron-LM if needed
+if [ ! -d "${MCORE_PATH}" ]; then
+    pushd $(dirname ${MCORE_PATH})
+    git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    popd
+fi
+
+# Megatron tokenizer import chain pulls in bert_tokenization at module import
+# time, which unconditionally depends on `six`.
+python3 - <<'PY' || python3 -m pip install --disable-pip-version-check six
+import six
+print(f"six available: {six.__version__}")
+PY
+
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
+# Megatron-LM invocation
+COMMAND="
+NVTE_TORCH_COMPILE=0
+NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+NVTE_FLASH_ATTN=1
+NVTE_FWD_LAYERNORM_SM_MARGIN=0
+NVTE_BWD_LAYERNORM_SM_MARGIN=0
+CUDA_DEVICE_MAX_CONNECTIONS=1
+NVTE_BIAS_GELU_NVFUSION=0
+NVTE_BIAS_DROPOUT_FUSION=0
+
+python3
+-m torch.distributed.launch
+--use_env
+--nnodes=1
+--nproc_per_node=1
+
+${MCORE_PATH}/pretrain_gpt.py
+--tensor-model-parallel-size 1
+--pipeline-model-parallel-size 1
+--use-cpu-initialization
+--num-layers 2
+--hidden-size 128
+--num-attention-heads 8
+--seq-length 128
+--max-position-embeddings 128
+--micro-batch-size 1
+--global-batch-size 8
+--train-iters 10
+--eval-iters 10
+--lr 1e-4
+--mock-data
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
+--transformer-impl transformer_engine
+${WITH_FP8:+--fp8-format hybrid}
+"
+COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
+
+# Launch Megatron-LM
+bash -c "${COMMAND}"
diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index 0386a4adb1..f8313cf78d 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -14,7 +14,6 @@ def _load_cuda_libs():
     import subprocess
     from pathlib import Path
     import importlib.util
-    import sysconfig
     import platform
     import glob as glob_module
 
@@ -154,7 +153,9 @@ def get_attention_backend(self, attention_params=None):
                      fused_attention_backend, use_unfused_attention, available_backends)
         """
         # Import the original get_attention_backend function
-        from transformer_engine.pytorch.attention.dot_product_attention import utils as dpa_utils
+        from transformer_engine.pytorch.attention.dot_product_attention import (
+            utils as dpa_utils,
+        )
 
         return dpa_utils._original_get_attention_backend(attention_params)
 
@@ -580,7 +581,15 @@ def layernorm_fwd(
         tex = self._get_tex()
         otype = tex.DType(int(otype)) if otype is not None else None
         return tex.layernorm_fwd(
-            input, weight, bias, eps, ln_out, quantizer, otype, sm_margin, zero_centered_gamma
+            input,
+            weight,
+            bias,
+            eps,
+            ln_out,
+            quantizer,
+            otype,
+            sm_margin,
+            zero_centered_gamma,
         )
 
     def layernorm_bwd(
@@ -856,7 +865,12 @@ def fused_amax_and_scale_update_after_reduction(
         tex = self._get_tex()
         fp8_dtype = tex.DType(int(fp8_dtype)) if fp8_dtype is not None else None
         return tex.fused_amax_and_scale_update_after_reduction(
-            amax_reduction_buffer, amax_histories, scales, amax_compute_algo, fp8_dtype, margin
+            amax_reduction_buffer,
+            amax_histories,
+            scales,
+            amax_compute_algo,
+            fp8_dtype,
+            margin,
         )
 
     def fp8_block_scaling_compute_partial_amax(
@@ -1294,7 +1308,14 @@ def fused_rope_forward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_forward(
-            input, freqs, start_positions, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            input,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_rope_backward(
@@ -1559,7 +1580,13 @@ def thd_out_correction(
     ) -> None:
         tex = self._get_tex()
         return tex.thd_out_correction(
-            out, out_per_step, lse, lse_per_step, cu_seqlens, only_second_half, lse_packed
+            out,
+            out_per_step,
+            lse,
+            lse_per_step,
+            cu_seqlens,
+            only_second_half,
+            lse_packed,
         )
 
     def thd_grad_correction(

From 36af46aa8fff52a71e4fdfa7c981fc204e073302 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Sat, 9 May 2026 15:23:55 +0800
Subject: [PATCH 53/59] chore: remove SYNC_POINT.md (intermediate sync record,
 not needed on main)

---
 SYNC_POINT.md | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100644 SYNC_POINT.md

diff --git a/SYNC_POINT.md b/SYNC_POINT.md
deleted file mode 100644
index 7e7cfce0d5..0000000000
--- a/SYNC_POINT.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Upstream Sync Point
-- Upstream: Nvidia/TransformerEngine
-- Branch: release_v2.14
-- Commit SHA: f031cf87bd054c7558b887df7bed93975456667f
-- Sync Date: 2026-04-10
-- Synced By: lixianduo

From e2812ae6e689b18cd76d8e1ae27361fd8a400473 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Sat, 9 May 2026 17:57:09 +0800
Subject: [PATCH 54/59] fix commit init

---
 transformer_engine/common/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py
index fe933e0191..5264938059 100644
--- a/transformer_engine/common/__init__.py
+++ b/transformer_engine/common/__init__.py
@@ -191,6 +191,11 @@ def load_framework_extension(framework: str) -> None:
     # For jax: load the native module as before
     module_name = f"transformer_engine_{framework}"
 
+    # Name of the pip extra dependency for framework extensions from PyPI.
+    extra_dep_name = module_name
+    if framework == "torch":
+        extra_dep_name = "pytorch"
+
     # Skip if already loaded
     if module_name in sys.modules:
         return

From 7b331444a971f05db20ffd9b4e9dcd091eed263d Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Sat, 9 May 2026 20:44:34 +0800
Subject: [PATCH 55/59] Fix pylint errors: remove unused imports and correct
 import order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove unused imports in utils.py, multi_head_attention.py, float8_blockwise_tensor.py
- Reorder imports to follow stdlib → third-party → first-party → local convention
- Fixes CI lint failures while maintaining 10.00/10 pylint score

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 transformer_engine/pytorch/attention/multi_head_attention.py   | 1 -
 transformer_engine/pytorch/ops/basic/grouped_linear.py         | 2 +-
 transformer_engine/pytorch/ops/basic/swiglu.py                 | 2 +-
 transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py    | 2 +-
 transformer_engine/pytorch/tensor/float8_blockwise_tensor.py   | 1 -
 transformer_engine/pytorch/tensor/grouped_tensor.py            | 2 +-
 .../pytorch/tensor/storage/grouped_tensor_storage.py           | 3 ++-
 transformer_engine/pytorch/utils.py                            | 1 -
 8 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/transformer_engine/pytorch/attention/multi_head_attention.py b/transformer_engine/pytorch/attention/multi_head_attention.py
index ec1a7520d4..5864f7eff0 100644
--- a/transformer_engine/pytorch/attention/multi_head_attention.py
+++ b/transformer_engine/pytorch/attention/multi_head_attention.py
@@ -9,7 +9,6 @@
 import torch
 
 from transformer_engine import te_device_type
-from transformer_engine.debug.pytorch.debug_state import TEDebugState
 from transformer_engine.pytorch.quantization import FP8GlobalStateManager
 from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor
 from transformer_engine.pytorch.module.base import TransformerEngineBaseModule
diff --git a/transformer_engine/pytorch/ops/basic/grouped_linear.py b/transformer_engine/pytorch/ops/basic/grouped_linear.py
index c9acbb7a53..0b67dca03b 100644
--- a/transformer_engine/pytorch/ops/basic/grouped_linear.py
+++ b/transformer_engine/pytorch/ops/basic/grouped_linear.py
@@ -12,9 +12,9 @@
 from typing import Any, Optional
 
 import torch
+import transformer_engine_torch as tex
 
 from transformer_engine import te_device_type
-import transformer_engine_torch as tex
 from ...cpp_extensions import general_grouped_gemm
 from ...distributed import CudaRNGStatesTracker
 from ...module._common import WeightGradStore
diff --git a/transformer_engine/pytorch/ops/basic/swiglu.py b/transformer_engine/pytorch/ops/basic/swiglu.py
index caecc03b30..c06c2d4c85 100644
--- a/transformer_engine/pytorch/ops/basic/swiglu.py
+++ b/transformer_engine/pytorch/ops/basic/swiglu.py
@@ -9,9 +9,9 @@
 from typing import Any, Optional
 
 import torch
+import transformer_engine_torch as tex
 
 from transformer_engine import te_device_type
-import transformer_engine_torch as tex
 from ...cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...tensor import Float8CurrentScalingQuantizer, Quantizer
 from ...utils import clear_tensor_data
diff --git a/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py b/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
index c096d229af..8f5a53bf2c 100644
--- a/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
+++ b/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
@@ -12,9 +12,9 @@
 from typing import Any, Optional
 
 import torch
+import transformer_engine_torch as tex
 
 from transformer_engine import te_device_type
-import transformer_engine_torch as tex
 from ...quantization import Recipe
 from ...tensor import Quantizer
 from ...utils import get_cached_ones_tensor, get_device_compute_capability, mark_grouped_tensor
diff --git a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
index dfcae153e6..f584a6c2db 100644
--- a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
+++ b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py
@@ -13,7 +13,6 @@
 
 import transformer_engine_torch as tex
 from transformer_engine_torch import DType as TE_DType
-from transformer_engine_torch import Float8BlockScaleTensorFormat
 
 from transformer_engine import te_device_type
 from transformer_engine.common.recipe import Float8BlockScaling, Recipe
diff --git a/transformer_engine/pytorch/tensor/grouped_tensor.py b/transformer_engine/pytorch/tensor/grouped_tensor.py
index ffd179b6fe..a02e8b9754 100644
--- a/transformer_engine/pytorch/tensor/grouped_tensor.py
+++ b/transformer_engine/pytorch/tensor/grouped_tensor.py
@@ -10,9 +10,9 @@
 import torch
 from torch.utils._pytree import tree_map
 
+from transformer_engine import te_device_type
 from ..quantized_tensor import QuantizedTensorStorage, Quantizer
 from .storage.grouped_tensor_storage import GroupedTensorStorage
-from transformer_engine import te_device_type
 
 
 def _stride_from_shape(shape: Tuple[int, ...]) -> Tuple[int, ...]:
diff --git a/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py b/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py
index 93ed175989..893f0066bc 100644
--- a/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py
+++ b/transformer_engine/pytorch/tensor/storage/grouped_tensor_storage.py
@@ -8,6 +8,8 @@
 import math
 
 import torch
+
+from transformer_engine import te_device_type
 from ...quantized_tensor import QuantizedTensorStorage, Quantizer
 
 from ..mxfp8_tensor import MXFP8Tensor
@@ -18,7 +20,6 @@
 from .mxfp8_tensor_storage import MXFP8TensorStorage
 from .float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage
 from .nvfp4_tensor_storage import NVFP4TensorStorage
-from transformer_engine import te_device_type
 
 
 class GroupedTensorStorage:
diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py
index 4e3109cf89..eecf14d0e1 100644
--- a/transformer_engine/pytorch/utils.py
+++ b/transformer_engine/pytorch/utils.py
@@ -15,7 +15,6 @@
 from transformer_engine import te_device_type
 
 from .torch_version import torch_version
-from .quantized_tensor import Quantizer
 from ..debug.pytorch.debug_quantization import DebugQuantizedTensor
 
 

From 2c334aeab5d5f1ecc691a54d19fa157a8903353a Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Mon, 11 May 2026 10:23:46 +0800
Subject: [PATCH 56/59] Fix fused_rope_backward: add missing start_positions
 parameter to plugin backends

---
 .../plugin/core/backends/vendor/cuda/cuda.py           | 10 +++++++++-
 .../plugin/core/backends/vendor/hygon/hygon.py         | 10 +++++++++-
 .../plugin/core/backends/vendor/iluvatar/iluvatar.py   | 10 +++++++++-
 .../plugin/core/backends/vendor/metax/metax.py         | 10 +++++++++-
 .../plugin/core/backends/vendor/musa/musa.py           | 10 +++++++++-
 transformer_engine/plugin/core/ops.py                  |  1 +
 6 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
index f8313cf78d..69c64ec0e2 100644
--- a/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
+++ b/transformer_engine/plugin/core/backends/vendor/cuda/cuda.py
@@ -1322,6 +1322,7 @@ def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
         freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
         qkv_format: NVTE_QKV_Format,
         interleaved: bool,
         cu_seqlens: Optional[torch.Tensor],
@@ -1331,7 +1332,14 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_qkv_rope_forward(
diff --git a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
index 5d9e9779ee..c9ea27197c 100644
--- a/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
+++ b/transformer_engine/plugin/core/backends/vendor/hygon/hygon.py
@@ -1155,6 +1155,7 @@ def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
         freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
         qkv_format: NVTE_QKV_Format,
         interleaved: bool,
         cu_seqlens: Optional[torch.Tensor],
@@ -1164,7 +1165,14 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_qkv_rope_forward(
diff --git a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
index 79b891f5b9..bc0dc390c3 100644
--- a/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
+++ b/transformer_engine/plugin/core/backends/vendor/iluvatar/iluvatar.py
@@ -1179,6 +1179,7 @@ def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
         freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
         qkv_format: NVTE_QKV_Format,
         interleaved: bool,
         cu_seqlens: Optional[torch.Tensor],
@@ -1188,7 +1189,14 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_qkv_rope_forward(
diff --git a/transformer_engine/plugin/core/backends/vendor/metax/metax.py b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
index 725899c72b..c61d981826 100644
--- a/transformer_engine/plugin/core/backends/vendor/metax/metax.py
+++ b/transformer_engine/plugin/core/backends/vendor/metax/metax.py
@@ -1136,6 +1136,7 @@ def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
         freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
         qkv_format: NVTE_QKV_Format,
         interleaved: bool,
         cu_seqlens: Optional[torch.Tensor],
@@ -1145,7 +1146,14 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_qkv_rope_forward(
diff --git a/transformer_engine/plugin/core/backends/vendor/musa/musa.py b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
index 77a374ad97..0de9350928 100644
--- a/transformer_engine/plugin/core/backends/vendor/musa/musa.py
+++ b/transformer_engine/plugin/core/backends/vendor/musa/musa.py
@@ -1151,6 +1151,7 @@ def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
         freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
         qkv_format: NVTE_QKV_Format,
         interleaved: bool,
         cu_seqlens: Optional[torch.Tensor],
@@ -1160,7 +1161,14 @@ def fused_rope_backward(
         tex = self._get_tex()
         qkv_format = tex.NVTE_QKV_Format(int(qkv_format)) if qkv_format is not None else None
         return tex.fused_rope_backward(
-            output_grads, freqs, qkv_format, interleaved, cu_seqlens, cp_size, cp_rank
+            output_grads,
+            freqs,
+            start_positions,
+            qkv_format,
+            interleaved,
+            cu_seqlens,
+            cp_size,
+            cp_rank,
         )
 
     def fused_qkv_rope_forward(
diff --git a/transformer_engine/plugin/core/ops.py b/transformer_engine/plugin/core/ops.py
index c3a9f8f176..48292c7b22 100644
--- a/transformer_engine/plugin/core/ops.py
+++ b/transformer_engine/plugin/core/ops.py
@@ -1328,6 +1328,7 @@ def fused_rope_backward(
         self,
         output_grads: torch.Tensor,
         freqs: torch.Tensor,
+        start_positions: Optional[torch.Tensor],
         qkv_format: NVTE_QKV_Format,
         interleaved: bool,
         cu_seqlens: Optional[torch.Tensor],

From 879eddc6dae7d3b7d8216cc0e9f85eb417ff9840 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Tue, 12 May 2026 10:40:27 +0800
Subject: [PATCH 57/59] fix test_numerics unit test

---
 qa/L1_pytorch_distributed_unittest/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index 46b54ed30d..0a11a129de 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -157,7 +157,7 @@ run_test_step "pytest_test_cast_master_weights_to_fp8.xml" "$TE_PATH/tests/pytor
 # standard numerics tests with initialized debug
 if [ "$DEBUG_TESTS_READY" -eq 1 ]; then
     run_test_step "pytest_test_numerics_2.xml" "$TE_PATH/tests/pytorch/distributed/test_numerics.py" \
-    "NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
+    "NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py" \
     "test_numerics.py (debug)"
 else
     echo "Skipping debug test_numerics.py because nvdlfw_inspect is unavailable"

From e12589a2ca00a3b0e71d6dd5aeafe49492ba3430 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Tue, 12 May 2026 10:57:26 +0800
Subject: [PATCH 58/59] fix Latex not found errors, use mathjax

---
 docs/Doxyfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Doxyfile b/docs/Doxyfile
index f17ffc297b..7f42e5b0ab 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -1606,7 +1606,7 @@ FORMULA_MACROFILE      =
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-USE_MATHJAX            = NO
+USE_MATHJAX            = YES
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:

From e5c838010476e55713382277e05f81df4dde2963 Mon Sep 17 00:00:00 2001
From: lixianduo <lixianduo@flagos.ai>
Date: Tue, 12 May 2026 12:25:51 +0800
Subject: [PATCH 59/59] Fix Sphinx build warnings: suppress autoapi import
 resolution and unknown type warnings

---
 docs/conf.py | 58 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index d2bba9825a..3785072716 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -112,23 +112,51 @@
 
 autoapi_generate_api_docs = False
 autoapi_dirs = [root_path / "transformer_engine"]
-autoapi_ignore = ["*test*"]
+autoapi_ignore = ["*test*", "*/benchmarks/*"]
+
+suppress_warnings = [
+    "autoapi.python_import_resolution",
+    "autoapi",
+]
 
 
 # There are 2 warnings about the same namespace (transformer_engine) in two different c++ api
-# docs pages. This seems to be the only way to suppress these warnings.
+# docs pages, and "Unknown type: placeholder" warnings from autoapi/breathe.
+# Install logging filters at module load time so they catch warnings emitted
+# before setup() is called.
+import logging as _logging
+import warnings as _warnings
+
+_warnings.filterwarnings("ignore", message=".*Unknown type.*placeholder.*")
+
+
+class _KnownWarningFilter(_logging.Filter):
+    def filter(self, record):
+        message = record.getMessage()
+        if "Duplicate C++ declaration" in message and "transformer_engine" in message:
+            return False
+        if "Unknown type" in message and "placeholder" in message:
+            return False
+        return True
+
+
+for _logger_name in ["sphinx", "sphinx.application", "autoapi", ""]:
+    _logging.getLogger(_logger_name).addFilter(_KnownWarningFilter())
+
+
 def setup(app):
     """Custom Sphinx setup to filter warnings."""
-    import logging
-
-    # Filter out duplicate C++ declaration warnings
-    class DuplicateDeclarationFilter(logging.Filter):
-        def filter(self, record):
-            message = record.getMessage()
-            if "Duplicate C++ declaration" in message and "transformer_engine" in message:
-                return False
-            return True
-
-    # Apply filter to Sphinx logger
-    logger = logging.getLogger("sphinx")
-    logger.addFilter(DuplicateDeclarationFilter())
+    import sphinx.util.logging
+
+    # Monkey-patch Sphinx's warning handler to filter known warnings
+    original_warning = sphinx.util.logging.SphinxLoggerAdapter.warning
+
+    def filtered_warning(self, msg, *args, **kwargs):
+        msg_str = str(msg)
+        if "Unknown type" in msg_str and "placeholder" in msg_str:
+            return
+        if "Duplicate C++ declaration" in msg_str and "transformer_engine" in msg_str:
+            return
+        return original_warning(self, msg, *args, **kwargs)
+
+    sphinx.util.logging.SphinxLoggerAdapter.warning = filtered_warning