From b4c4b15a5551586d2baaf115afbfb4488185b860 Mon Sep 17 00:00:00 2001
From: Stardep <146844969+Stardep-lmc@users.noreply.github.com>
Date: Tue, 28 Apr 2026 18:31:36 +0800
Subject: [PATCH 1/2] Add full file specialization guide for Ascend

Document the steps for full file specialization in Ascend, including modifications to compiler, code generator, errors, language modules, and runtime components.
---
 .../FlagTree_Backend_Specialization_Python.md | 639 ++++++++++++++++++
 1 file changed, 639 insertions(+)

diff --git a/documents/decoupling/FlagTree_Backend_Specialization_Python.md b/documents/decoupling/FlagTree_Backend_Specialization_Python.md
index 5c253bd48e..7be43194fc 100644
--- a/documents/decoupling/FlagTree_Backend_Specialization_Python.md
+++ b/documents/decoupling/FlagTree_Backend_Specialization_Python.md
@@ -279,3 +279,642 @@ def isnan(arg0, _builder=None):
             (core.dtype("bf16"),): ("__hmf_isnan", core.dtype("int1")),
         }, is_pure=True, _builder=_builder)
 ```
+
+## 5.整文件特化（以Ascend为例）
+## 5.1 compiler模块
+## 5.1.1 compiler.py
+```text
+third_party/Ascend/backend/spec/triton/compiler/compiler.py
+```
+第一步：
+主干 `python/triton/compiler/__init__.py` 中有：
+```python
+from triton.runtime.driver import spec_path
+ 
+spec_path(__path__)
+```
+
+第二步：
+在 `python/triton/runtime/driver.py` 中增加`spec_path(path_list)`：
+```python
+def spec_path(path_list: list):
+    import os
+    if not path_list:
+        return
+    current_path = path_list[0].replace(os.sep, "/")
+    marker = "/triton/"
+    idx = current_path.find(marker)
+    if idx == -1:
+        return
+    triton_root = current_path[:idx + len("/triton")]
+    rel_path = current_path[idx + len(marker):]
+    backend_path = os.path.join(triton_root, "backends", "ascend", "spec", "triton", rel_path)
+    if os.path.isdir(backend_path):
+        path_list.insert(0, backend_path)
+```
+
+第三步：
+将 `python/triton/compiler/compiler.py` 中的：
+```python
+from .._C.libtriton import get_cache_invalidating_env_vars, ir
+```
+
+改成：
+```python
+from .._C.libtriton import get_cache_invalidating_env_vars, ir, buffer_ir
+from .._C.libtriton.ascend import ir as ascend_ir
+```
+
+第四步：
+在 `compile()` 主流程中补充 dialect 加载：
+```python
+context = ir.context()
+ir.load_dialects(context)
+buffer_ir.load_dialects(context)
+ascend_ir.load_dialects(context)
+backend.load_dialects(context)
+```
+
+第五步：
+在 `compile()` 主流程中生成异常信息
+```python
+    for ext, compile_ir in list(stages.items())[first_stage:]:
+        try:
+            next_module = compile_ir(module, metadata)
+        except Exception as e:
+            if (ext == "ttadapter"):
+                stage_name = "ConvertTritonIRToLinalgIR"
+            elif (ext == "npubin"):
+                stage_name = "ConvertLinalgRToBinary"
+            else:
+                stage_name = "MLIRCompile"
+            error_detail = e.stderr.decode('utf-8') if hasattr(e, 'stderr') and e.stderr else str(e)
+            error_detail += f"\n\n[INFO]: The compiled kernel cache is in {fn_cache_manager.cache_dir}\n\n"
+            raise MLIRCompilationError(stage_name, error_detail) from e
+        ir_filename = f"{file_name}.{ext}"
+        if (fn_override_manager is not None and (full_name := fn_override_manager.get_file(ir_filename)) is not None):
+```
+
+注意事项：
+主干中有：
+```python
+from .code_generator import ast_to_ttir
+from .errors import MLIRCompilationError
+```
+继续特化
+- `code_generator.py`
+- `errors.py`
+
+## 5.1.2 code_generator.py
+```text
+third_party/ascend/backend/spec/triton/compiler/code_generator.py
+```
+第一步：
+增加 Ascend / buffer 相关 import：
+```python
+import triton.language.extra.cann.extension as extension
+from triton.extension.buffer.language import core as bl
+from triton.extension.buffer.language.builder import setup_unified_builder_with_buffer_builder
+from .._C.libtriton import ir, buffer_ir
+from .._C.libtriton.ascend import ir as ascend_ir
+from triton.language.extra.cann.extension.dispatch import ASCEND_WITH_DISPATCH
+from triton.language.extra.cann.extension.builder import setup_unified_builder
+```
+
+第二步：
+增加 `WITH_DISPATCH` 注册逻辑：
+```python
+WITH_DISPATCH = {}
+WITH_DISPATCH.update(ASCEND_WITH_DISPATCH)
+```
+
+第三步：
+将：
+```python
+def mangle_ty(ty):
+    ...
+```
+
+改成支持后端覆盖：
+```python
+mangle_ty = WITH_DISPATCH.get("mangle_ty", mangle_ty)
+```
+
+第四步：
+在 `CodeGenerator.__init__(...)` 中，除了主干的 `ir.builder(...)` 之外，补充：
+```python
+self.ascend_builder = ascend_ir.ascendnpu_ir_builder(context, getattr(options, "arch", ""))
+self.ascend_builder.set_loc(file_name, begin_line, 0)
+setup_unified_builder(self.builder, self.ascend_builder)
+
+self.buffer_builder = buffer_ir.buffer_builder(context)
+self.buffer_builder.set_loc(file_name, begin_line, 0)
+setup_unified_builder_with_buffer_builder(self.builder, self.buffer_builder)
+```
+
+第五步：
+在CodeGenerator中增加：visit_With()
+```python
+    def visit_With(self, node):
+        """Handle 'with' statements using dispatch pattern."""
+        assert len(node.items) == 1
+        context = node.items[0].context_expr
+
+        # Check if context is a Call and dispatch to registered handler
+        if isinstance(context, ast.Call):
+            withitemClass = self.visit(context.func)
+            handler = WITH_DISPATCH.get(withitemClass)
+            if handler:
+                return handler(self, node)
+
+        # Fall back to visiting body for unhandled cases
+        return self.visit_compound_statement(node.body)
+```
+
+第六步：
+在 `visit_For()` 中支持 Ascend 扩展 iterator，例如：
+```python
+if IteratorClass in [language.range, extension.parallel, tle.dsa.pipeline, tle.dsa.parallel]:
+    ...
+```
+
+第七步：
+在参数封装时支持 buffer 类型：
+```python
+if isinstance(param_type, bl.buffer_type):
+    arg_values.append(bl.buffer(self.fn.args(idx), param_type))
+else:
+    arg_values.append(tensor(self.fn.args(idx), param_type))
+```
+
+注意事项：
+这个文件一旦特化，通常意味着 AST -> TTIR 前端语义已经变化，因此它往往与：
+- `compiler.py`
+- `language/core.py`
+- `language/semantic.py`
+
+这些文件存在联动关系。
+
+## 5.1.3 errors.py
+```text
+third_party/ascend/backend/spec/triton/compiler/errors.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/compiler/errors.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/compiler/errors.py
+```
+
+第二步：
+保留主干中原有错误类型：
+- `CompilationError`
+- `CompileTimeAssertionFailure`
+- `UnsupportedLanguageConstruct`
+
+第三步：
+补充 Ascend 编译链专用错误类型：
+```python
+class MLIRCompilationError(TritonError):
+    ...
+```
+
+第四步：
+在该错误类型中增加对 stage 名称的包装，例如：
+```python
+self.stage_name = stage_name
+```
+
+第五步：
+增加对错误信息的过滤，例如：
+```python
+def filter_message(self, message):
+    return message.split("Stack dump without symbol names")[0]
+```
+
+第六步：
+增加格式化输出，用于把 backend 编译错误按统一格式打印：
+```python
+def format_line_delim(self, keyword):
+    return f"///------------------{keyword}------------------\\n"
+```
+
+注意事项：
+只要 `compiler.py` 中出现：
+```python
+from .errors import MLIRCompilationError
+```
+
+那么 `errors.py` 就不再能直接复用主干文件，而必须一起特化。
+
+## 5.2 language模块
+## 5.2.1 core.py
+```text
+third_party/ascend/backend/spec/triton/language/core.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/language/core.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/language/core.py
+```
+第二步：
+检查 `code_generator.py` 是否已经对 `core.py` 提出了新能力要求。
+
+对于 Ascend，`code_generator.py` 已经引入了：
+- buffer 类型支持；
+- Ascend 扩展 builder；
+- 扩展 `with` 语法；
+- 新 iterator / 新原语；
+
+因此 `core.py` 需要继续提供与这些前端行为匹配的基础原语与类型支持。
+
+第三步：
+扩展 `_utils` 依赖。
+
+主干中有：
+```python
+from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape
+```
+
+Ascend 版改成：
+```python
+from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive_bitwidth
+```
+
+这说明：
+
+> `core.py` 已经开始依赖新的基础工具函数，因此 `_utils.py` 也属于潜在联动特化文件。
+
+第四步：
+在 `core.py` 中补充 Ascend 需要暴露给上层 `triton.language` 的基础原语。
+
+从 Ascend 当前实现和 `language/__init__.py` 的导出关系看，后续通常会由 `core.py` 承载：
+- `make_tensor_descriptor`
+- `load_tensor_descriptor`
+- `store_tensor_descriptor`
+- `gather`
+- 
+第五步：
+确认这些新增原语是否需要：
+- `@builtin`
+- `@_tensor_member_fn`
+- `_builder` 参数
+
+因为一旦 `code_generator.py` 通过 `visit_Call()` / `visit_With()` 调到这些接口，它们就必须符合 Triton builtin 的调用契约。
+
+注意事项：
+如果 `core.py` 里新增了主干没有的基础原语，那么通常还要继续联动：
+- `language/__init__.py`（负责导出）
+- `semantic.py`（负责这些原语底层语义实现）
+
+## 5.2.2 semantic.py
+```text
+third_party/ascend/backend/spec/triton/language/semantic.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/language/semantic.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/language/semantic.py
+```
+
+第二步：
+补充 Ascend 环境相关依赖。
+
+Ascend 版增加了：
+```python
+from . import is_compile_on_910_95
+```
+
+这说明 `semantic.py` 已经开始依赖 Ascend 环境状态，而不再只是主干纯通用语义。
+
+第三步：
+检查 `code_generator.py` / `core.py` 新增的语言原语，是否需要在 `semantic.py` 中提供底层语义支撑。
+
+典型包括：
+- descriptor 类操作；
+- gather 类操作；
+- buffer / address space / tensor kind 相关语义；
+- 针对 Ascend 芯片型号的条件逻辑。
+
+第四步：
+如果后端需要根据芯片型号切换语义分支，则在 `semantic.py` 中加入类似：
+```python
+from . import is_compile_on_910_95
+```
+并在具体语义实现里基于该变量分流。
+
+## 5.2.3 standard.py
+```text
+third_party/ascend/backend/spec/triton/language/standard.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/language/standard.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/language/standard.py
+```
+
+第二步：
+检查是否需要调整标准库函数的默认行为。
+
+例如 Ascend 版 `ravel`：
+```python
+return core.reshape(x, [x.numel], can_reorder=False)
+```
+
+而主干中是：
+```python
+return core.reshape(x, [x.numel], can_reorder=True)
+```
+
+第四步：
+如果后端需要新增标准库层公开接口，例如 `topk`，则应在 `standard.py` 中实现，并准备由：
+- `language/__init__.py`
+统一导出到 `triton.language` 顶层。
+
+## 5.2.4 math.py
+```text
+third_party/ascend/backend/spec/triton/language/math.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/language/math.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/language/math.py
+```
+
+第二步：
+检查 dtype 支持范围是否需要扩大。
+
+例如主干 `exp` 的 dtype 检查是：
+```python
+@_check_dtype(dtypes=["fp32", "fp64"])
+```
+
+Ascend 版放宽成：
+```python
+@_check_dtype(dtypes=["bf16", "fp16", "fp32", "fp8e4nv", "fp8e5", "fp64"])
+```
+
+第三步：
+如果后端对 bool/int1 有特殊表示，则要修改 `_check_dtype(...)`。
+
+Ascend 版增加了：
+```python
+if hasattr(arg, 'was_bool_to_int8') and arg.was_bool_to_int8:
+    arg_type = 'int1'
+```
+
+第四步：
+检查主干数学原语是否需要新增后端实现，或者替换到底层 libdevice / builder。
+
+典型包括：
+- `exp`
+- `exp2`
+- `log`
+- `log2`
+- `cos`
+
+这些函数名不变，但后端可能会改变：
+- 接受的 dtype；
+- 具体 builder 行为；
+- libdevice 映射。
+
+## 5.2.5 language/__init__.py
+```text
+third_party/ascend/backend/spec/triton/language/__init__.py
+```
+
+第一步：
+根据后端需要，增加语言层全局变量注入函数，例如：
+```python
+def language_extend_globals(globals_dict):
+    ...
+```
+
+第二步：
+将后端新增原语导出到 `triton.language` 顶层，例如：
+```python
+def language_extend_exports(globals_dict, all_list):
+    from triton.language.core import make_tensor_descriptor, load_tensor_descriptor, store_tensor_descriptor, gather
+    from triton.language.standard import topk
+    ...
+```
+
+# 5.2.6 _utils.py
+```text
+third_party/ascend/backend/spec/triton/language/_utils.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/language/_utils.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/language/_utils.py
+```
+
+第二步：
+检查 `core.py` 是否已经引入了新的 `_utils` 依赖。
+
+对于 Ascend，主干中是：
+```python
+from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape
+```
+
+而 Ascend 版 `core.py` 已经变成：
+```python
+from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive_bitwidth
+```
+
+第三步：
+按后端需求调整 `validate_block_shape(...)` 之类的基础检查规则。
+
+例如主干中有：
+```python
+if not is_power_of_two(d):
+    raise ValueError(...)
+```
+
+第四步：
+在 `_utils.py` 中补充后端需要的 dtype / bitwidth / canonicalize 工具，例如：
+- `type_canonicalisation_dict`
+- `canonicalize_dtype(dtype)`
+- `BITWIDTH_DICT`
+- `get_primitive_bitwidth(dtype)`
+
+## 5.3 runtime模块
+- 如果后端已经对：
+  - kernel 启动路径；
+  - autotune 配置；
+  - 本地缓存路径；
+  - library entry；
+  - 解释执行；
+
+  这些行为提出了新要求，就要继续联动到 runtime。
+
+## 5.3.1 jit.py
+```text
+third_party/ascend/backend/spec/triton/runtime/jit.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/runtime/jit.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/runtime/jit.py
+```
+
+第二步：
+检查 `compiler.py` 的整文件特化是否已经改变了 JIT 编译入口依赖。
+
+如果 `jit.py` 中会直接调用：
+- `compile`
+- `ASTSource`
+- `make_backend`
+
+第三步：
+检查 `run()` 流程中的：
+- signature 推导；
+- constants 组织；
+- options 生成；
+- cache key；
+- kernel 启动参数；
+是否还与主干兼容。
+
+第四步：
+如果后端对 `repr`、launch options、autotune metadata、grid 组织有差异，则在 `jit.py` 中补充对应逻辑。
+
+## 5.3.2 autotuner.py
+```text
+third_party/ascend/backend/spec/triton/runtime/autotuner.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/runtime/autotuner.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/runtime/autotuner.py
+```
+
+第二步：
+检查后端对 autotune config 的默认值是否有差异。
+
+第三步：
+检查后端是否需要不同的 benchmark / pre_hook / post_hook / prune 行为。
+
+第四步：
+如果 `jit.py` 已经改变了 kernel 启动和 config 消费方式，则 `autotuner.py` 也应联动调整。
+
+## 5.3.3 interpreter.py
+```text
+third_party/ascend/backend/spec/triton/runtime/interpreter.py
+```
+
+第一步：
+将主干文件：
+```text
+python/triton/runtime/interpreter.py
+```
+
+复制到：
+```text
+third_party/ascend/backend/spec/triton/runtime/interpreter.py
+```
+
+第二步：
+检查后端是否对解释执行的数据类型、pointer/materialize 行为有不同要求。
+
+第三步：
+如果 language 层已经引入：
+- 新 dtype；
+- 新 pointer 语义；
+- 新 builtin；
+
+则 interpreter 往往要一起补这些行为。
+
+## 5.3.4 code_cache.py
+```text
+third_party/ascend/backend/spec/triton/runtime/code_cache.py
+```
+
+第一步：
+如果后端不想复用主干 `runtime/cache.py` 的缓存目录组织方式，就新增自己的缓存模块。
+
+Ascend 当前提供的是：
+```text
+third_party/ascend/backend/spec/triton/runtime/code_cache.py
+```
+
+第二步：
+定义后端自己的缓存根目录，例如：
+- `FLAGGEMS_CACHE_DIR`
+- `.flaggems`
+
+第三步：
+根据后端需求拆分：
+- `code_cache_dir()`
+- `config_cache_dir()`
+- `clear_cache()`
+
+## 5.3.5 libentry.py
+```text
+third_party/ascend/backend/spec/triton/runtime/libentry.py
+```
+
+第一步：
+如果后端需要额外的 library entry / autotune 数据库存储 / torch 设备适配逻辑，可以在 runtime 下新增：
+```text
+libentry.py
+```
+
+第二步：
+接入后端设备接口，例如 Ascend 版中：
+```python
+import torch_npu
+torch_device_fn = torch.npu
+```
+
+第三步：
+如果后端需要独立维护 tuned config 数据库、kernel cache 入口、library kernel 启动接口，可在该文件中实现。
+
+注意事项：
+`libentry.py` 不一定在每个后端都需要；它更像是 runtime 子体系里的后端新增模块，而不是所有后端都必须照抄的固定文件。

From 2788eeb1f9f98b9cb71d6223aaad89c5875b84f3 Mon Sep 17 00:00:00 2001
From: flagtree-bot <flagtree_ai@163.com>
Date: Tue, 28 Apr 2026 10:34:29 +0000
Subject: [PATCH 2/2] Apply code-format changes

---
 .../decoupling/FlagTree_Backend_Specialization_Python.md      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/documents/decoupling/FlagTree_Backend_Specialization_Python.md b/documents/decoupling/FlagTree_Backend_Specialization_Python.md
index 7be43194fc..68ae353dda 100644
--- a/documents/decoupling/FlagTree_Backend_Specialization_Python.md
+++ b/documents/decoupling/FlagTree_Backend_Specialization_Python.md
@@ -290,7 +290,7 @@ third_party/Ascend/backend/spec/triton/compiler/compiler.py
 主干 `python/triton/compiler/__init__.py` 中有：
 ```python
 from triton.runtime.driver import spec_path
- 
+
 spec_path(__path__)
 ```
 
@@ -564,7 +564,7 @@ from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive
 - `load_tensor_descriptor`
 - `store_tensor_descriptor`
 - `gather`
-- 
+-
 第五步：
 确认这些新增原语是否需要：
 - `@builtin`