From b4c4b15a5551586d2baaf115afbfb4488185b860 Mon Sep 17 00:00:00 2001 From: Stardep <146844969+Stardep-lmc@users.noreply.github.com> Date: Tue, 28 Apr 2026 18:31:36 +0800 Subject: [PATCH 1/2] Add full file specialization guide for Ascend Document the steps for full file specialization in Ascend, including modifications to compiler, code generator, errors, language modules, and runtime components. --- .../FlagTree_Backend_Specialization_Python.md | 639 ++++++++++++++++++ 1 file changed, 639 insertions(+) diff --git a/documents/decoupling/FlagTree_Backend_Specialization_Python.md b/documents/decoupling/FlagTree_Backend_Specialization_Python.md index 5c253bd48e..7be43194fc 100644 --- a/documents/decoupling/FlagTree_Backend_Specialization_Python.md +++ b/documents/decoupling/FlagTree_Backend_Specialization_Python.md @@ -279,3 +279,642 @@ def isnan(arg0, _builder=None): (core.dtype("bf16"),): ("__hmf_isnan", core.dtype("int1")), }, is_pure=True, _builder=_builder) ``` + +## 5.整文件特化(以Ascend为例) +## 5.1 compiler模块 +## 5.1.1 compiler.py +```text +third_party/Ascend/backend/spec/triton/compiler/compiler.py +``` +第一步: +主干 `python/triton/compiler/__init__.py` 中有: +```python +from triton.runtime.driver import spec_path + +spec_path(__path__) +``` + +第二步: +在 `python/triton/runtime/driver.py` 中增加`spec_path(path_list)`: +```python +def spec_path(path_list: list): + import os + if not path_list: + return + current_path = path_list[0].replace(os.sep, "/") + marker = "/triton/" + idx = current_path.find(marker) + if idx == -1: + return + triton_root = current_path[:idx + len("/triton")] + rel_path = current_path[idx + len(marker):] + backend_path = os.path.join(triton_root, "backends", "ascend", "spec", "triton", rel_path) + if os.path.isdir(backend_path): + path_list.insert(0, backend_path) +``` + +第三步: +将 `python/triton/compiler/compiler.py` 中的: +```python +from .._C.libtriton import get_cache_invalidating_env_vars, ir +``` + +改成: +```python +from .._C.libtriton import get_cache_invalidating_env_vars, ir, buffer_ir +from .._C.libtriton.ascend import ir as ascend_ir +``` + +第四步: +在 `compile()` 主流程中补充 dialect 加载: +```python +context = ir.context() +ir.load_dialects(context) +buffer_ir.load_dialects(context) +ascend_ir.load_dialects(context) +backend.load_dialects(context) +``` + +第五步: +在 `compile()` 主流程中生成异常信息 +```python + for ext, compile_ir in list(stages.items())[first_stage:]: + try: + next_module = compile_ir(module, metadata) + except Exception as e: + if (ext == "ttadapter"): + stage_name = "ConvertTritonIRToLinalgIR" + elif (ext == "npubin"): + stage_name = "ConvertLinalgRToBinary" + else: + stage_name = "MLIRCompile" + error_detail = e.stderr.decode('utf-8') if hasattr(e, 'stderr') and e.stderr else str(e) + error_detail += f"\n\n[INFO]: The compiled kernel cache is in {fn_cache_manager.cache_dir}\n\n" + raise MLIRCompilationError(stage_name, error_detail) from e + ir_filename = f"{file_name}.{ext}" + if (fn_override_manager is not None and (full_name := fn_override_manager.get_file(ir_filename)) is not None): +``` + +注意事项: +主干中有: +```python +from .code_generator import ast_to_ttir +from .errors import MLIRCompilationError +``` +继续特化 +- `code_generator.py` +- `errors.py` + +## 5.1.2 code_generator.py +```text +third_party/ascend/backend/spec/triton/compiler/code_generator.py +``` +第一步: +增加 Ascend / buffer 相关 import: +```python +import triton.language.extra.cann.extension as extension +from triton.extension.buffer.language import core as bl +from triton.extension.buffer.language.builder import setup_unified_builder_with_buffer_builder +from .._C.libtriton import ir, buffer_ir +from .._C.libtriton.ascend import ir as ascend_ir +from triton.language.extra.cann.extension.dispatch import ASCEND_WITH_DISPATCH +from triton.language.extra.cann.extension.builder import setup_unified_builder +``` + +第二步: +增加 `WITH_DISPATCH` 注册逻辑: +```python +WITH_DISPATCH = {} +WITH_DISPATCH.update(ASCEND_WITH_DISPATCH) +``` + +第三步: +将: +```python +def mangle_ty(ty): + ... +``` + +改成支持后端覆盖: +```python +mangle_ty = WITH_DISPATCH.get("mangle_ty", mangle_ty) +``` + +第四步: +在 `CodeGenerator.__init__(...)` 中,除了主干的 `ir.builder(...)` 之外,补充: +```python +self.ascend_builder = ascend_ir.ascendnpu_ir_builder(context, getattr(options, "arch", "")) +self.ascend_builder.set_loc(file_name, begin_line, 0) +setup_unified_builder(self.builder, self.ascend_builder) + +self.buffer_builder = buffer_ir.buffer_builder(context) +self.buffer_builder.set_loc(file_name, begin_line, 0) +setup_unified_builder_with_buffer_builder(self.builder, self.buffer_builder) +``` + +第五步: +在CodeGenerator中增加:visit_With() +```python + def visit_With(self, node): + """Handle 'with' statements using dispatch pattern.""" + assert len(node.items) == 1 + context = node.items[0].context_expr + + # Check if context is a Call and dispatch to registered handler + if isinstance(context, ast.Call): + withitemClass = self.visit(context.func) + handler = WITH_DISPATCH.get(withitemClass) + if handler: + return handler(self, node) + + # Fall back to visiting body for unhandled cases + return self.visit_compound_statement(node.body) +``` + +第六步: +在 `visit_For()` 中支持 Ascend 扩展 iterator,例如: +```python +if IteratorClass in [language.range, extension.parallel, tle.dsa.pipeline, tle.dsa.parallel]: + ... +``` + +第七步: +在参数封装时支持 buffer 类型: +```python +if isinstance(param_type, bl.buffer_type): + arg_values.append(bl.buffer(self.fn.args(idx), param_type)) +else: + arg_values.append(tensor(self.fn.args(idx), param_type)) +``` + +注意事项: +这个文件一旦特化,通常意味着 AST -> TTIR 前端语义已经变化,因此它往往与: +- `compiler.py` +- `language/core.py` +- `language/semantic.py` + +这些文件存在联动关系。 + +## 5.1.3 errors.py +```text +third_party/ascend/backend/spec/triton/compiler/errors.py +``` + +第一步: +将主干文件: +```text +python/triton/compiler/errors.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/compiler/errors.py +``` + +第二步: +保留主干中原有错误类型: +- `CompilationError` +- `CompileTimeAssertionFailure` +- `UnsupportedLanguageConstruct` + +第三步: +补充 Ascend 编译链专用错误类型: +```python +class MLIRCompilationError(TritonError): + ... +``` + +第四步: +在该错误类型中增加对 stage 名称的包装,例如: +```python +self.stage_name = stage_name +``` + +第五步: +增加对错误信息的过滤,例如: +```python +def filter_message(self, message): + return message.split("Stack dump without symbol names")[0] +``` + +第六步: +增加格式化输出,用于把 backend 编译错误按统一格式打印: +```python +def format_line_delim(self, keyword): + return f"///------------------{keyword}------------------\\n" +``` + +注意事项: +只要 `compiler.py` 中出现: +```python +from .errors import MLIRCompilationError +``` + +那么 `errors.py` 就不再能直接复用主干文件,而必须一起特化。 + +## 5.2 language模块 +## 5.2.1 core.py +```text +third_party/ascend/backend/spec/triton/language/core.py +``` + +第一步: +将主干文件: +```text +python/triton/language/core.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/language/core.py +``` +第二步: +检查 `code_generator.py` 是否已经对 `core.py` 提出了新能力要求。 + +对于 Ascend,`code_generator.py` 已经引入了: +- buffer 类型支持; +- Ascend 扩展 builder; +- 扩展 `with` 语法; +- 新 iterator / 新原语; + +因此 `core.py` 需要继续提供与这些前端行为匹配的基础原语与类型支持。 + +第三步: +扩展 `_utils` 依赖。 + +主干中有: +```python +from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape +``` + +Ascend 版改成: +```python +from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive_bitwidth +``` + +这说明: + +> `core.py` 已经开始依赖新的基础工具函数,因此 `_utils.py` 也属于潜在联动特化文件。 + +第四步: +在 `core.py` 中补充 Ascend 需要暴露给上层 `triton.language` 的基础原语。 + +从 Ascend 当前实现和 `language/__init__.py` 的导出关系看,后续通常会由 `core.py` 承载: +- `make_tensor_descriptor` +- `load_tensor_descriptor` +- `store_tensor_descriptor` +- `gather` +- +第五步: +确认这些新增原语是否需要: +- `@builtin` +- `@_tensor_member_fn` +- `_builder` 参数 + +因为一旦 `code_generator.py` 通过 `visit_Call()` / `visit_With()` 调到这些接口,它们就必须符合 Triton builtin 的调用契约。 + +注意事项: +如果 `core.py` 里新增了主干没有的基础原语,那么通常还要继续联动: +- `language/__init__.py`(负责导出) +- `semantic.py`(负责这些原语底层语义实现) + +## 5.2.2 semantic.py +```text +third_party/ascend/backend/spec/triton/language/semantic.py +``` + +第一步: +将主干文件: +```text +python/triton/language/semantic.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/language/semantic.py +``` + +第二步: +补充 Ascend 环境相关依赖。 + +Ascend 版增加了: +```python +from . import is_compile_on_910_95 +``` + +这说明 `semantic.py` 已经开始依赖 Ascend 环境状态,而不再只是主干纯通用语义。 + +第三步: +检查 `code_generator.py` / `core.py` 新增的语言原语,是否需要在 `semantic.py` 中提供底层语义支撑。 + +典型包括: +- descriptor 类操作; +- gather 类操作; +- buffer / address space / tensor kind 相关语义; +- 针对 Ascend 芯片型号的条件逻辑。 + +第四步: +如果后端需要根据芯片型号切换语义分支,则在 `semantic.py` 中加入类似: +```python +from . import is_compile_on_910_95 +``` +并在具体语义实现里基于该变量分流。 + +## 5.2.3 standard.py +```text +third_party/ascend/backend/spec/triton/language/standard.py +``` + +第一步: +将主干文件: +```text +python/triton/language/standard.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/language/standard.py +``` + +第二步: +检查是否需要调整标准库函数的默认行为。 + +例如 Ascend 版 `ravel`: +```python +return core.reshape(x, [x.numel], can_reorder=False) +``` + +而主干中是: +```python +return core.reshape(x, [x.numel], can_reorder=True) +``` + +第四步: +如果后端需要新增标准库层公开接口,例如 `topk`,则应在 `standard.py` 中实现,并准备由: +- `language/__init__.py` +统一导出到 `triton.language` 顶层。 + +## 5.2.4 math.py +```text +third_party/ascend/backend/spec/triton/language/math.py +``` + +第一步: +将主干文件: +```text +python/triton/language/math.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/language/math.py +``` + +第二步: +检查 dtype 支持范围是否需要扩大。 + +例如主干 `exp` 的 dtype 检查是: +```python +@_check_dtype(dtypes=["fp32", "fp64"]) +``` + +Ascend 版放宽成: +```python +@_check_dtype(dtypes=["bf16", "fp16", "fp32", "fp8e4nv", "fp8e5", "fp64"]) +``` + +第三步: +如果后端对 bool/int1 有特殊表示,则要修改 `_check_dtype(...)`。 + +Ascend 版增加了: +```python +if hasattr(arg, 'was_bool_to_int8') and arg.was_bool_to_int8: + arg_type = 'int1' +``` + +第四步: +检查主干数学原语是否需要新增后端实现,或者替换到底层 libdevice / builder。 + +典型包括: +- `exp` +- `exp2` +- `log` +- `log2` +- `cos` + +这些函数名不变,但后端可能会改变: +- 接受的 dtype; +- 具体 builder 行为; +- libdevice 映射。 + +## 5.2.5 language/__init__.py +```text +third_party/ascend/backend/spec/triton/language/__init__.py +``` + +第一步: +根据后端需要,增加语言层全局变量注入函数,例如: +```python +def language_extend_globals(globals_dict): + ... +``` + +第二步: +将后端新增原语导出到 `triton.language` 顶层,例如: +```python +def language_extend_exports(globals_dict, all_list): + from triton.language.core import make_tensor_descriptor, load_tensor_descriptor, store_tensor_descriptor, gather + from triton.language.standard import topk + ... +``` + +# 5.2.6 _utils.py +```text +third_party/ascend/backend/spec/triton/language/_utils.py +``` + +第一步: +将主干文件: +```text +python/triton/language/_utils.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/language/_utils.py +``` + +第二步: +检查 `core.py` 是否已经引入了新的 `_utils` 依赖。 + +对于 Ascend,主干中是: +```python +from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape +``` + +而 Ascend 版 `core.py` 已经变成: +```python +from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive_bitwidth +``` + +第三步: +按后端需求调整 `validate_block_shape(...)` 之类的基础检查规则。 + +例如主干中有: +```python +if not is_power_of_two(d): + raise ValueError(...) +``` + +第四步: +在 `_utils.py` 中补充后端需要的 dtype / bitwidth / canonicalize 工具,例如: +- `type_canonicalisation_dict` +- `canonicalize_dtype(dtype)` +- `BITWIDTH_DICT` +- `get_primitive_bitwidth(dtype)` + +## 5.3 runtime模块 +- 如果后端已经对: + - kernel 启动路径; + - autotune 配置; + - 本地缓存路径; + - library entry; + - 解释执行; + + 这些行为提出了新要求,就要继续联动到 runtime。 + +## 5.3.1 jit.py +```text +third_party/ascend/backend/spec/triton/runtime/jit.py +``` + +第一步: +将主干文件: +```text +python/triton/runtime/jit.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/runtime/jit.py +``` + +第二步: +检查 `compiler.py` 的整文件特化是否已经改变了 JIT 编译入口依赖。 + +如果 `jit.py` 中会直接调用: +- `compile` +- `ASTSource` +- `make_backend` + +第三步: +检查 `run()` 流程中的: +- signature 推导; +- constants 组织; +- options 生成; +- cache key; +- kernel 启动参数; +是否还与主干兼容。 + +第四步: +如果后端对 `repr`、launch options、autotune metadata、grid 组织有差异,则在 `jit.py` 中补充对应逻辑。 + +## 5.3.2 autotuner.py +```text +third_party/ascend/backend/spec/triton/runtime/autotuner.py +``` + +第一步: +将主干文件: +```text +python/triton/runtime/autotuner.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/runtime/autotuner.py +``` + +第二步: +检查后端对 autotune config 的默认值是否有差异。 + +第三步: +检查后端是否需要不同的 benchmark / pre_hook / post_hook / prune 行为。 + +第四步: +如果 `jit.py` 已经改变了 kernel 启动和 config 消费方式,则 `autotuner.py` 也应联动调整。 + +## 5.3.3 interpreter.py +```text +third_party/ascend/backend/spec/triton/runtime/interpreter.py +``` + +第一步: +将主干文件: +```text +python/triton/runtime/interpreter.py +``` + +复制到: +```text +third_party/ascend/backend/spec/triton/runtime/interpreter.py +``` + +第二步: +检查后端是否对解释执行的数据类型、pointer/materialize 行为有不同要求。 + +第三步: +如果 language 层已经引入: +- 新 dtype; +- 新 pointer 语义; +- 新 builtin; + +则 interpreter 往往要一起补这些行为。 + +## 5.3.4 code_cache.py +```text +third_party/ascend/backend/spec/triton/runtime/code_cache.py +``` + +第一步: +如果后端不想复用主干 `runtime/cache.py` 的缓存目录组织方式,就新增自己的缓存模块。 + +Ascend 当前提供的是: +```text +third_party/ascend/backend/spec/triton/runtime/code_cache.py +``` + +第二步: +定义后端自己的缓存根目录,例如: +- `FLAGGEMS_CACHE_DIR` +- `.flaggems` + +第三步: +根据后端需求拆分: +- `code_cache_dir()` +- `config_cache_dir()` +- `clear_cache()` + +## 5.3.5 libentry.py +```text +third_party/ascend/backend/spec/triton/runtime/libentry.py +``` + +第一步: +如果后端需要额外的 library entry / autotune 数据库存储 / torch 设备适配逻辑,可以在 runtime 下新增: +```text +libentry.py +``` + +第二步: +接入后端设备接口,例如 Ascend 版中: +```python +import torch_npu +torch_device_fn = torch.npu +``` + +第三步: +如果后端需要独立维护 tuned config 数据库、kernel cache 入口、library kernel 启动接口,可在该文件中实现。 + +注意事项: +`libentry.py` 不一定在每个后端都需要;它更像是 runtime 子体系里的后端新增模块,而不是所有后端都必须照抄的固定文件。 From 2788eeb1f9f98b9cb71d6223aaad89c5875b84f3 Mon Sep 17 00:00:00 2001 From: flagtree-bot Date: Tue, 28 Apr 2026 10:34:29 +0000 Subject: [PATCH 2/2] Apply code-format changes --- .../decoupling/FlagTree_Backend_Specialization_Python.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documents/decoupling/FlagTree_Backend_Specialization_Python.md b/documents/decoupling/FlagTree_Backend_Specialization_Python.md index 7be43194fc..68ae353dda 100644 --- a/documents/decoupling/FlagTree_Backend_Specialization_Python.md +++ b/documents/decoupling/FlagTree_Backend_Specialization_Python.md @@ -290,7 +290,7 @@ third_party/Ascend/backend/spec/triton/compiler/compiler.py 主干 `python/triton/compiler/__init__.py` 中有: ```python from triton.runtime.driver import spec_path - + spec_path(__path__) ``` @@ -564,7 +564,7 @@ from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive - `load_tensor_descriptor` - `store_tensor_descriptor` - `gather` -- +- 第五步: 确认这些新增原语是否需要: - `@builtin`