From c2757cc29b710e35dc1a55ab4d7a3aaf4f70df0f Mon Sep 17 00:00:00 2001 From: Alexander Efimov Date: Wed, 24 Sep 2025 22:21:21 +0200 Subject: [PATCH 1/2] [AMD] Option to use custom scheduler in LLVM backend This PR generalize flags in translate_to_asm function and adds new variant to HIPOptions.schedule_hint for iterative-ilp scheduler. --- python/src/llvm.cc | 31 +++++++++++++++++++---------- third_party/amd/backend/compiler.py | 12 +++++++++-- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/python/src/llvm.cc b/python/src/llvm.cc index 9092baeb781c..5d9889880cba 100644 --- a/python/src/llvm.cc +++ b/python/src/llvm.cc @@ -67,19 +67,26 @@ createTargetMachine(llvm::Module *module, std::string proc, return machine; } -std::string translateLLVMIRToASM(llvm::Module &module, - const std::string &triple, - const std::string &proc, - const std::string &features, - const std::vector &flags, - bool enable_fp_fusion, bool isObject) { +std::string translateLLVMIRToASM( + llvm::Module &module, const std::string &triple, const std::string &proc, + const std::string &features, + const std::vector>> + &flags, + bool enable_fp_fusion, bool isObject) { using namespace mlir; // options auto options = llvm::cl::getRegisteredOptions(); - for (std::string flag : flags) { - auto *shortPtr = static_cast *>(options[flag]); - assert(shortPtr); - shortPtr->setValue(true); + for (auto flag : flags) { + if (std::string *strVal = std::get_if(&flag.second)) { + auto *shortPtr = + static_cast *>(options[flag.first]); + assert(shortPtr); + shortPtr->setValue(*strVal); + } else if (bool *boolVal = std::get_if(&flag.second)) { + auto *shortPtr = static_cast *>(options[flag.first]); + assert(shortPtr); + shortPtr->setValue(*boolVal); + } } if (triton::tools::getBoolEnv("LLVM_IR_ENABLE_DUMP")) { auto optIt = options.find("print-after-all"); @@ -423,7 +430,9 @@ void init_triton_llvm(py::module &&m) { m.def( "translate_to_asm", [](std::string llvmIR, std::string triple, std::string proc, - std::string features, std::vector flags, + std::string features, + std::vector>> + flags, bool enable_fp_fusion, bool isObject) -> py::object { std::string obj; { diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index d2f25e2d6d6e..811d85aa27cb 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -66,6 +66,10 @@ class HIPOptions: # attention: enables a bunch of optimizations for attention kernels, including: # - iglp 2 and sched.barrier around it # - sink-insts-to-avoid-spills flag to avoid register spills + # iterative-ilp-scheduler: enables custom instruction scheduler in backend + # + # Option allows to set multiple variants divided by commas: + # schedule_hint="attention,iterative-ilp-scheduler" schedule_hint: str = 'none' def __post_init__(self): @@ -417,8 +421,12 @@ def make_amdgcn(src, metadata, options): # into loops to avoid register spills in the MachineSinking pass, while it # can also lead to regression in some cases. But from current observation, # the regression is not significant. It would be better to have some heuristics. - if options.schedule_hint == 'attention': - flags.append('sink-insts-to-avoid-spills') + + for hint in options.schedule_hint.split(","): + if hint == 'attention': + flags.append(('sink-insts-to-avoid-spills', True)) + if hint == 'iterative-ilp-scheduler': + flags.append(('amdgpu-sched-strategy', 'iterative-ilp')) amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', flags, options.enable_fp_fusion, False) if knobs.amd.dump_amdgcn: print("// -----// AMDGCN Dump //----- //") From 559d980171dc18e2ba14f2d832ed3efe530bdfd4 Mon Sep 17 00:00:00 2001 From: Alexander Efimov Date: Mon, 29 Sep 2025 15:26:26 +0200 Subject: [PATCH 2/2] fix hint processing and add simple test for schduler --- python/test/unit/language/test_core.py | 37 ++++++++++++++++++++++++++ third_party/amd/backend/compiler.py | 3 ++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 5daa3f27e98b..a6b7919265fa 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -8118,3 +8118,40 @@ def kernel(): tl.device_assert(tl.sum(x) == x.sum()) kernel[(1, )]() + + +def test_schedule_hint(device): + if not is_hip(): + pytest.skip("schedule_hint option is defined only for HIP") + + @triton.jit + def kernel(X, Y, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr): + off_m = tl.arange(0, BLOCK_M) + off_n = tl.arange(0, BLOCK_N) + off_k = tl.arange(0, BLOCK_K) + Xs = X + off_m[:, None] * BLOCK_K + off_k[None, :] * 1 + Ys = Y + off_k[:, None] * 1 + off_n[None, :] * BLOCK_K + Zs = Z + off_m[:, None] * BLOCK_N + off_n[None, :] * 1 + x = tl.load(Xs) + y = tl.load(Ys) + z = tl.dot(x, y) + tl.store(Zs, z) + + # input + rs = RandomState(17) + M = 128 + N = 128 + K = 128 + + pgm_default = kernel.warmup(torch.float32, torch.float32, torch.float32, M, N, K, grid=(1, )) + pgm_ilp = kernel.warmup(torch.float32, torch.float32, torch.float32, M, N, K, + schedule_hint="iterative-ilp-scheduler", grid=(1, )) + + def get_num_vgprs(text): + for line in text.split("\n"): + if ".vgpr_count" in line: + return int(line.split(" ")[-1]) + + default_vgprs = get_num_vgprs(pgm_default.asm['amdgcn']) + ilp_vgprs = get_num_vgprs(pgm_ilp.asm['amdgcn']) + assert ilp_vgprs != default_vgprs diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index 811d85aa27cb..4087a00845b2 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -235,7 +235,8 @@ def make_ttgir(mod, metadata, options): amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch) passes.common.add_canonicalizer(pm) if options.schedule_hint.lower() != "none": - amd.passes.ttgpuir.insert_instruction_sched_hints(pm, options.schedule_hint) + for hint in options.schedule_hint.split(","): + amd.passes.ttgpuir.insert_instruction_sched_hints(pm, hint) passes.ttgpuir.add_remove_layout_conversions(pm) passes.ttgpuir.add_reduce_data_duplication(pm) if is_in_thread_transpose_enabled(options.arch):