diff --git a/amd/comgr/src/hotswap/CMakeLists.txt b/amd/comgr/src/hotswap/CMakeLists.txt
index d81ff8f22ca5c..acd4351aaaf73 100644
--- a/amd/comgr/src/hotswap/CMakeLists.txt
+++ b/amd/comgr/src/hotswap/CMakeLists.txt
@@ -45,6 +45,7 @@ add_library(hotswap-transpiler OBJECT
   raiser.cpp
   code_object_utils.cpp
   mc_state.cpp
+  canonical_op.cpp
 )
 
 if(NOT TARGET hotswap::transpiler)
diff --git a/amd/comgr/src/hotswap/amdgpu_formats.h b/amd/comgr/src/hotswap/amdgpu_formats.h
new file mode 100644
index 0000000000000..fbc9c0bb42841
--- /dev/null
+++ b/amd/comgr/src/hotswap/amdgpu_formats.h
@@ -0,0 +1,80 @@
+//===- amdgpu_formats.h - Hotswap transpiler ------------------------------===//
+//
+// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See
+// amd/comgr/LICENSE.TXT in this repository for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HOTSWAP_TRANSPILER_AMDGPU_FORMATS_H
+#define HOTSWAP_TRANSPILER_AMDGPU_FORMATS_H
+
+#include <cstdint>
+
+// Source tree: lib/Target/AMDGPU/SIDefines.h — target-private but exposed
+// through the LLVM build tree via our CMake include path. Provides the
+// authoritative `SIInstrFlags` enum and `AMDGPU::OPERAND_INPUT_MODS` operand
+// type used by the disassembler's TSFlags / OperandType fields.
+#include "SIDefines.h"
+
+#include "Utils/AMDGPUBaseInfo.h" // AMDGPU::isVOPD
+
+namespace COMGR::hotswap {
+
+// Alias `COMGR::hotswap::SIInstrFlags` to the LLVM namespace so existing call
+// sites (`SIInstrFlags::SOPP`, `SIInstrFlags::FLAT`, etc.) keep compiling.
+namespace SIInstrFlags = llvm::SIInstrFlags;
+
+// AMDGPU target-specific operand type for VOP3 source modifiers (abs, neg).
+// Defined in llvm::AMDGPU::OperandType from SIDefines.h.
+constexpr unsigned OPERAND_INPUT_MODS = llvm::AMDGPU::OPERAND_INPUT_MODS;
+
+// Human-readable format label for diagnostics. There is no runtime dispatch
+// on this string — it is consumed only by error messages in the decoder.
+// The precedence of the TSFlags tests below mirrors LLVM's own decoder:
+//   * `IsMAI` is a VOP3 subclass, so check before VOP3.
+//   * `DPP` / `SDWA` are orthogonal encoding bits that coexist with
+//     VOP1/VOP2/VOPC; check them first so those aren't misnamed as VOP1/2.
+//   * `VOP3P` coexists with `VOP3` on some subtargets; check VOP3P first.
+//   * VOPD has no dedicated TSFlags bit (LLVM's VOPD3 bit varies across
+//     versions); use `AMDGPU::isVOPD(opc)` instead.
+inline const char *formatName(uint64_t flags, unsigned opc) {
+  if (llvm::AMDGPU::isVOPD(opc)) return "VOPD";
+  if (flags & SIInstrFlags::IsMAI) return "MFMA";
+  if (flags & SIInstrFlags::DPP) return "DPP";
+  if (flags & SIInstrFlags::SDWA) return "SDWA";
+  if (flags & SIInstrFlags::SOPP) return "SOPP";
+  if (flags & SIInstrFlags::SOPC) return "SOPC";
+  if (flags & SIInstrFlags::SOP1) return "SOP1";
+  if (flags & SIInstrFlags::SOP2) return "SOP2";
+  if (flags & SIInstrFlags::SOPK) return "SOPK";
+  if (flags & SIInstrFlags::VOPC) return "VOPC";
+  if (flags & SIInstrFlags::VOP3P) return "VOP3P";
+  if (flags & SIInstrFlags::VOP3) return "VOP3";
+  if (flags & SIInstrFlags::VOP2) return "VOP2";
+  if (flags & SIInstrFlags::VOP1) return "VOP1";
+  if (flags & SIInstrFlags::SMRD) return "SMEM";
+  if (flags & SIInstrFlags::FLAT) return "FLAT";
+  if (flags & SIInstrFlags::MUBUF) return "MUBUF";
+  if (flags & SIInstrFlags::DS) return "DS";
+  // VIMAGE: gfx12+ vector image / tensor encoding family. 
Pure-image + // members carry `SIInstrFlags::VIMAGE` directly; the gfx1250 TENSOR + // pseudos (`tensor_load_to_lds_d{2,4}`, + // `tensor_store_from_lds_d{2,4}`, MIMGInstructions.td:2049-2113) do + // NOT — they extend `InstSI` directly and only set `let VALU = 1` + // and `let TENSOR_CNT = 1`, so the `VIMAGE` field stays 0. Detect + // them via the `TENSOR_CNT` TSFlags bit instead. The only other + // user of that bit is `s_wait_tensorcnt` (SOPP), which already + // matches the SOPP arm above and never reaches this fallthrough. + // Routing both arms to the same `"VIMAGE"` label lets + // `kerneldex`/`raise_cli` bucket the cross-target failures as + // `[format=VIMAGE]` rather than `[format=Unknown]`, which is what + // the handler refusal contract is keyed on. + if (flags & SIInstrFlags::VIMAGE) return "VIMAGE"; + if (flags & SIInstrFlags::TENSOR_CNT) return "VIMAGE"; + return "Unknown"; +} + +} // namespace COMGR::hotswap + +#endif diff --git a/amd/comgr/src/hotswap/canonical_op.cpp b/amd/comgr/src/hotswap/canonical_op.cpp new file mode 100644 index 0000000000000..8dd8e8a213f52 --- /dev/null +++ b/amd/comgr/src/hotswap/canonical_op.cpp @@ -0,0 +1,270 @@ +//===- canonical_op.cpp - Hotswap transpiler ------------------------------===// +// +// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See +// amd/comgr/LICENSE.TXT in this repository for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "canonical_op.h" + +namespace COMGR::hotswap { + +// Mechanical enum-to-name switch. Kept in alphabetical-ish groups to +// match the layout of the `CanonicalOp` enum in `canonical_op.h` — if a new +// CanonicalOp lands in the enum, the compiler's `-Wswitch` (enabled by +// default for scoped enums) will flag the missing case here. 
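+// For reference, each `S(N)` in the switch below expands to a single case,
+// e.g. `S(S_ENDPGM)` becomes `case CanonicalOp::S_ENDPGM: return "S_ENDPGM";`.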
+const char *canonicalOpName(CanonicalOp Op) { +#define S(N) case CanonicalOp::N: return #N; + switch (Op) { + S(Unknown) + // SOPP / control flow + S(S_ENDPGM) S(S_NOP) S(S_BRANCH) S(S_CODE_END) + S(S_CBRANCH_SCC0) S(S_CBRANCH_SCC1) + S(S_CBRANCH_VCCZ) S(S_CBRANCH_VCCNZ) + S(S_CBRANCH_EXECZ) S(S_CBRANCH_EXECNZ) + S(S_WAITCNT) S(S_WAIT_LOADCNT) S(S_WAIT_KMCNT) S(S_WAIT_DSCNT) S(S_WAIT_XCNT) + S(S_WAIT_ASYNCCNT) S(S_WAIT_TENSORCNT) + S(S_WAIT_LOADCNT_DSCNT) S(S_WAIT_ALU) + S(S_CLAUSE) S(S_DELAY_ALU) S(S_SET_GPR_IDX_ON) S(S_SET_GPR_IDX_OFF) S(S_SETVSKIP) + S(S_BARRIER) S(S_BARRIER_WAIT) S(S_BARRIER_SIGNAL) + // SMEM + S(S_LOAD_B32) S(S_LOAD_B64) S(S_LOAD_B96) S(S_LOAD_B128) + S(S_LOAD_B256) S(S_LOAD_B512) + S(S_LOAD_U8) S(S_LOAD_I8) S(S_LOAD_U16) S(S_LOAD_I16) + S(S_STORE_B32) S(S_STORE_B64) S(S_STORE_B128) + // SOPC + S(S_CMP_EQ_U32) S(S_CMP_LG_U32) S(S_CMP_GT_U32) S(S_CMP_GE_U32) + S(S_CMP_LT_U32) S(S_CMP_LE_U32) + S(S_CMP_EQ_U64) S(S_CMP_LG_U64) + S(S_CMP_EQ_I32) S(S_CMP_LG_I32) S(S_CMP_GT_I32) S(S_CMP_GE_I32) + S(S_CMP_LT_I32) S(S_CMP_LE_I32) + S(S_CMP_EQ_F32) S(S_CMP_LG_F32) S(S_CMP_GT_F32) S(S_CMP_GE_F32) + S(S_CMP_LT_F32) S(S_CMP_LE_F32) S(S_CMP_NEQ_F32) + S(S_CMP_NGT_F32) S(S_CMP_NGE_F32) S(S_CMP_NLT_F32) S(S_CMP_NLE_F32) S(S_CMP_NLG_F32) + S(S_CMP_EQ_F16) S(S_CMP_LG_F16) S(S_CMP_GT_F16) S(S_CMP_GE_F16) + S(S_CMP_LT_F16) S(S_CMP_LE_F16) S(S_CMP_NEQ_F16) + S(S_CMP_NGT_F16) S(S_CMP_NGE_F16) S(S_CMP_NLT_F16) S(S_CMP_NLE_F16) S(S_CMP_NLG_F16) + // SOPK + S(S_MOVK_I32) S(S_ADDK_I32) S(S_MULK_I32) + S(S_CMPK_GE_I32) S(S_CMPK_GT_I32) S(S_CMPK_LE_I32) S(S_CMPK_LT_I32) + S(S_CMPK_GE_U32) S(S_CMPK_GT_U32) S(S_CMPK_LE_U32) S(S_CMPK_LT_U32) + S(S_CMPK_EQ_I32) S(S_CMPK_EQ_U32) S(S_CMPK_LG_I32) S(S_CMPK_LG_U32) + S(S_GETREG_B32) S(S_SETREG_B32) S(S_SETREG_IMM32_B32) + // SOP1 + S(S_MOV_B32) S(S_MOV_B64) S(S_NOT_B32) S(S_NOT_B64) + S(S_BREV_B32) S(S_FF1_I32_B32) S(S_FF1_I32_B64) + S(S_FF0_I32_B32) S(S_FF0_I32_B64) + S(S_FLBIT_I32_B32) S(S_FLBIT_I32_B64) S(S_FLBIT_I32) S(S_FLBIT_I32_I64) + S(S_SEXT_I32_I8) S(S_SEXT_I32_I16) + S(S_CVT_F32_U32) S(S_CVT_F32_I32) S(S_CVT_U32_F32) S(S_CVT_I32_F32) + S(S_AND_SAVEEXEC_B32) S(S_OR_SAVEEXEC_B32) S(S_XOR_SAVEEXEC_B32) + S(S_ANDN2_SAVEEXEC_B32) S(S_ORN2_SAVEEXEC_B32) + S(S_GETPC_B64) + S(S_SET_PC_I64) + S(S_SWAP_PC_I64) + S(S_ABS_I32) + S(S_SET_VGPR_MSB) + S(S_BITSET0_B32) S(S_BITSET1_B32) + S(S_BITSET0_B64) S(S_BITSET1_B64) + S(S_BITCMP0_B32) S(S_BITCMP1_B32) + S(S_BITCMP0_B64) S(S_BITCMP1_B64) + S(S_CMOV_B32) S(S_CMOV_B64) + // SOP2 + S(S_ADD_U32) S(S_ADDC_U32) S(S_SUB_U32) S(S_SUBB_U32) + S(S_AND_B32) S(S_AND_B64) S(S_OR_B32) S(S_OR_B64) S(S_XOR_B32) S(S_XOR_B64) + S(S_ANDN2_B32) S(S_ANDN2_B64) S(S_ORN2_B32) S(S_ORN2_B64) + S(S_NAND_B32) S(S_NAND_B64) S(S_NOR_B32) S(S_NOR_B64) + S(S_XNOR_B32) S(S_XNOR_B64) + S(S_ABSDIFF_I32) + S(S_LSHL_B32) S(S_LSHL_B64) S(S_LSHR_B32) S(S_LSHR_B64) S(S_ASHR_I32) S(S_ASHR_I64) + S(S_MUL_I32) S(S_MUL_HI_U32) S(S_MUL_HI_I32) S(S_MUL_U64) S(S_MUL_F32) S(S_ADD_F32) S(S_SUB_F32) + S(S_FMAC_F32) + S(S_MAX_NUM_F32) S(S_MIN_NUM_F32) + S(S_BFE_U32) S(S_BFE_I32) S(S_BFM_B32) S(S_BFM_B64) + S(S_CSELECT_B32) S(S_CSELECT_B64) + S(S_MIN_I32) S(S_MIN_U32) S(S_MAX_I32) S(S_MAX_U32) + S(S_PACK_LL_B32_B16) S(S_PACK_LH_B32_B16) + S(S_LSHL1_ADD_U32) S(S_LSHL2_ADD_U32) S(S_LSHL3_ADD_U32) S(S_LSHL4_ADD_U32) + S(S_ADD_NC_U64) S(S_SUB_NC_U64) + // VOP1 + S(V_MOV_B32) S(V_MOV_B64) S(V_NOP) S(V_NOT_B32) S(V_BFREV_B32) + S(V_SWAP_B32) + S(V_CVT_F32_I32) S(V_CVT_F32_U32) S(V_CVT_I32_F32) S(V_CVT_U32_F32) + S(V_CVT_F16_F32) S(V_CVT_F32_F16) 
S(V_CVT_F32_BF16) + S(V_CVT_F32_UBYTE0) S(V_CVT_F32_UBYTE1) S(V_CVT_F32_UBYTE2) S(V_CVT_F32_UBYTE3) + S(V_CVT_F64_U32) S(V_CVT_F64_I32) S(V_CVT_U32_F64) + S(V_RCP_IFLAG_F32) S(V_RCP_F32) S(V_RSQ_F32) S(V_SQRT_F32) + S(V_EXP_F32) S(V_LOG_F32) + S(V_S_EXP_F32) S(V_S_LOG_F32) S(V_S_RCP_F32) S(V_S_RSQ_F32) S(V_S_SQRT_F32) + S(V_LDEXP_F32) + S(V_FLOOR_F32) S(V_CEIL_F32) S(V_TRUNC_F32) S(V_FRACT_F32) + S(V_READFIRSTLANE_B32) + S(V_FFBH_U32) S(V_FFBL_B32) S(V_FFBH_I32) + S(V_CVT_PK_F32_FP8) S(V_CVT_PK_F32_BF8) + S(V_CVT_F32_FP8) S(V_CVT_F32_BF8) + S(V_CVT_SCALE_PK8_BF16_FP4) + // VOP2 / VOP3 + S(V_ADD_F32) S(V_SUB_F32) S(V_SUBREV_F32) S(V_MUL_F32) + S(V_FMAC_F32) S(V_FMA_F32) S(V_FMAMK_F32) S(V_FMAAK_F32) + S(V_MAX_F32) S(V_MIN_F32) + S(V_ADD_NC_U32) S(V_SUB_NC_U32) S(V_SUBREV_NC_U32) + S(V_ADD_CO_U32) S(V_ADD_CO_CI_U32) + S(V_SUB_CO_U32) S(V_SUBREV_CO_U32) S(V_SUB_CO_CI_U32) S(V_SUBREV_CO_CI_U32) + S(V_AND_B32) S(V_OR_B32) S(V_XOR_B32) S(V_XNOR_B32) + S(V_LSHLREV_B32) S(V_LSHRREV_B32) S(V_ASHRREV_I32) + S(V_CNDMASK_B32) + S(V_MUL_LO_U32) S(V_MUL_HI_U32) S(V_MUL_HI_I32) + S(V_MUL_I32_I24) S(V_MUL_U32_U24) S(V_MUL_HI_U32_U24) S(V_MUL_HI_I32_I24) + S(V_MAD_U32_U24) S(V_MAD_U32) + S(V_ADD3_U32) S(V_LSHL_ADD_U32) S(V_ADD_LSHL_U32) + S(V_LSHL_OR_B32) S(V_AND_OR_B32) S(V_OR3_B32) S(V_XAD_U32) S(V_XOR3_B32) + S(V_ALIGNBIT_B32) + S(V_ADD_NC_U16) + S(V_BFE_U32) S(V_BFE_I32) S(V_BFI_B32) S(V_PERM_B32) + S(V_MBCNT_LO_U32_B32) S(V_MBCNT_HI_U32_B32) + S(V_READLANE_B32) S(V_WRITELANE_B32) + S(V_MED3_F32) S(V_MAX3_F32) S(V_MIN3_F32) S(V_MAX3_NUM_F32) + S(V_MAX3_U32) S(V_MED3_I32) S(V_MINMAX_NUM_F32) + S(V_MAX_NUM_F32) S(V_MIN_NUM_F32) + S(V_MAXIMUM_F32) S(V_MINIMUM_F32) + S(V_DIV_FIXUP_F32) S(V_DIV_FMAS_F32) S(V_DIV_SCALE_F32) + S(V_FMA_MIX_F32) S(V_FMA_MIX_F32_BF16) S(V_FMA_MIXLO_BF16) + S(V_ADD_F16) S(V_MUL_F16) S(V_SUB_F16) S(V_SUBREV_F16) + S(V_MAC_F16) S(V_FMAC_F16) S(V_MADMK_F16) S(V_MADAK_F16) + S(V_MAX_F16) S(V_MIN_F16) S(V_LDEXP_F16) S(V_FLOOR_F16) + S(V_CVT_F16_U16) S(V_CVT_U16_F16) + S(V_ASHRREV_I16) S(V_LSHRREV_B16) S(V_LSHLREV_B16) + S(V_MAX_U16) S(V_MIN_U16) S(V_MAX_I16) S(V_MIN_I16) + S(V_ADD_U16) S(V_SUB_U16) S(V_SUBREV_U16) S(V_MUL_LO_U16) + S(V_DOT2C_I32_I16) S(V_DOT4C_I32_I8) S(V_DOT8C_I32_I4) + S(V_PK_FMAC_F16) + S(V_PACK_B32_F16) + S(V_CVT_PK_BF16_F32) S(V_CVT_PK_BF8_F32) S(V_CVT_PK_FP8_F32) + S(V_CVT_PKRTZ_F16_F32) S(V_CVT_PK_F16_F32) + S(V_CVT_SCALEF32_PK_FP4_F32) + S(V_BFM_B32) + // VOP2/VOP3 FP64 + S(V_ADD_F64) S(V_MUL_F64) S(V_FMA_F64) S(V_FMAC_F64) + S(V_RCP_F64) + S(V_MAX_U32) S(V_MIN_U32) S(V_MAX_I32) S(V_MIN_I32) + S(V_PERMLANE16_B32) S(V_PERMLANEX16_B32) S(V_PERMLANE64_B32) + S(V_PERMLANE16_SWAP_B32) S(V_PERMLANE32_SWAP_B32) + // VOPC + S(V_CMP) S(V_CMPX) + // VOP3P + S(V_PK_ADD_F32) S(V_PK_MUL_F32) S(V_PK_FMA_F32) + S(V_PK_MAX_F32) S(V_PK_MIN_F32) S(V_PK_MOV_B32) + S(V_PK_ADD_U16) S(V_PK_LSHLREV_B16) + S(V_BITOP3_B32) S(V_BITOP3_B16) + S(V_ADD_I32) S(V_SUB_I32) + S(V_LSHLREV_B64) S(V_LSHRREV_B64) S(V_ASHRREV_I64) + S(V_LSHL_ADD_U64) S(V_ADD_NC_U64) S(V_SUB_NC_U64) + S(V_MAX_I64) S(V_MAX_U64) S(V_MIN_I64) S(V_MIN_U64) + S(V_MUL_U64) + S(V_MAD_U64_U32) S(V_MAD_CO_U64_U32) + S(V_MAD_NC_U64_U32) S(V_MAD_NC_I64_I32) + // FLAT / GLOBAL + S(FLAT_LOAD_UBYTE) S(FLAT_LOAD_SBYTE) S(FLAT_LOAD_USHORT) S(FLAT_LOAD_SSHORT) + S(FLAT_LOAD_DWORD) S(FLAT_LOAD_DWORDX2) S(FLAT_LOAD_DWORDX3) S(FLAT_LOAD_DWORDX4) + S(FLAT_STORE_BYTE) S(FLAT_STORE_SHORT) S(FLAT_STORE_SHORT_D16_HI) + S(FLAT_STORE_DWORD) S(FLAT_STORE_DWORDX2) S(FLAT_STORE_DWORDX3) S(FLAT_STORE_DWORDX4) + S(GLOBAL_LOAD_UBYTE) 
S(GLOBAL_LOAD_SBYTE) S(GLOBAL_LOAD_USHORT) S(GLOBAL_LOAD_SSHORT) + S(GLOBAL_LOAD_SHORT_D16_HI) + S(GLOBAL_LOAD_DWORD) S(GLOBAL_LOAD_DWORDX2) S(GLOBAL_LOAD_DWORDX3) S(GLOBAL_LOAD_DWORDX4) + S(GLOBAL_STORE_BYTE) S(GLOBAL_STORE_SHORT) S(GLOBAL_STORE_SHORT_D16_HI) + S(GLOBAL_STORE_DWORD) S(GLOBAL_STORE_DWORDX2) S(GLOBAL_STORE_DWORDX3) S(GLOBAL_STORE_DWORDX4) + S(SCRATCH_LOAD_DWORD) S(SCRATCH_LOAD_DWORDX2) S(SCRATCH_LOAD_DWORDX3) S(SCRATCH_LOAD_DWORDX4) + S(SCRATCH_STORE_DWORD) S(SCRATCH_STORE_DWORDX2) S(SCRATCH_STORE_DWORDX3) S(SCRATCH_STORE_DWORDX4) + // FLAT atomics + S(FLAT_ATOMIC_ADD) S(FLAT_ATOMIC_SUB) + S(FLAT_ATOMIC_AND) S(FLAT_ATOMIC_OR) S(FLAT_ATOMIC_XOR) + S(FLAT_ATOMIC_SMIN) S(FLAT_ATOMIC_SMAX) S(FLAT_ATOMIC_UMIN) S(FLAT_ATOMIC_UMAX) + S(FLAT_ATOMIC_SWAP) S(FLAT_ATOMIC_CMPSWAP) + S(FLAT_ATOMIC_ADD_F32) + // GLOBAL atomics + S(GLOBAL_ATOMIC_ADD) S(GLOBAL_ATOMIC_SUB) + S(GLOBAL_ATOMIC_AND) S(GLOBAL_ATOMIC_OR) S(GLOBAL_ATOMIC_XOR) + S(GLOBAL_ATOMIC_SMIN) S(GLOBAL_ATOMIC_SMAX) S(GLOBAL_ATOMIC_UMIN) S(GLOBAL_ATOMIC_UMAX) + S(GLOBAL_ATOMIC_SWAP) S(GLOBAL_ATOMIC_CMPSWAP) + S(GLOBAL_ATOMIC_ADD_F32) + S(GLOBAL_ATOMIC_PK_ADD_BF16) S(GLOBAL_ATOMIC_PK_ADD_F16) + // SMEM atomics + S(S_ATOMIC_SWAP) + S(S_ATOMIC_DEC) + // DS + S(DS_LOAD_TR16_B128) + S(DS_READ_B64_TR_B16) + S(DS_READ_B64_TR_B8) + S(DS_LOAD_TR8_B64) + S(DS_READ_B32) S(DS_READ_B64) S(DS_READ_B96) S(DS_READ_B128) + S(DS_READ2_B32) S(DS_READ2_B64) + S(DS_READ2ST64_B32) S(DS_READ2ST64_B64) + S(DS_READ_U16) S(DS_READ_I16) S(DS_READ_U8) S(DS_READ_I8) + S(DS_WRITE_B32) S(DS_WRITE_B64) S(DS_WRITE_B96) S(DS_WRITE_B128) + S(DS_WRITE2_B32) S(DS_WRITE2_B64) + S(DS_WRITE2ST64_B32) S(DS_WRITE2ST64_B64) + S(DS_WRITE_B16) S(DS_WRITE_B8) + S(DS_WRITE_B16_D16_HI) S(DS_WRITE_B8_D16_HI) + S(DS_BPERMUTE_B32) + S(DS_SWIZZLE_B32) + // MUBUF + S(BUFFER_LOAD_DWORD) S(BUFFER_LOAD_DWORDX2) S(BUFFER_LOAD_DWORDX3) S(BUFFER_LOAD_DWORDX4) + S(BUFFER_LOAD_UBYTE) S(BUFFER_LOAD_SBYTE) S(BUFFER_LOAD_USHORT) S(BUFFER_LOAD_SSHORT) + S(BUFFER_LOAD_SHORT_D16) S(BUFFER_LOAD_SHORT_D16_HI) + S(BUFFER_LOAD_UBYTE_D16) S(BUFFER_LOAD_UBYTE_D16_HI) + S(BUFFER_LOAD_SBYTE_D16) S(BUFFER_LOAD_SBYTE_D16_HI) + S(BUFFER_LOAD_DWORD_LDS) S(BUFFER_LOAD_DWORDX2_LDS) + S(BUFFER_LOAD_DWORDX4_LDS) S(BUFFER_STORE_DWORDX4_LDS) + S(BUFFER_STORE_DWORD) S(BUFFER_STORE_DWORDX2) S(BUFFER_STORE_DWORDX3) S(BUFFER_STORE_DWORDX4) + S(BUFFER_STORE_BYTE) S(BUFFER_STORE_SHORT) + // MUBUF atomics + S(BUFFER_ATOMIC_ADD) S(BUFFER_ATOMIC_SUB) + S(BUFFER_ATOMIC_AND) S(BUFFER_ATOMIC_OR) S(BUFFER_ATOMIC_XOR) + S(BUFFER_ATOMIC_SWAP) S(BUFFER_ATOMIC_CMPSWAP) + S(BUFFER_ATOMIC_ADD_F32) + S(BUFFER_ATOMIC_PK_ADD_BF16) S(BUFFER_ATOMIC_PK_ADD_F16) + // MFMA + S(V_MFMA_F32_16x16x128_F8F6F4) S(V_MFMA_SCALE_F32_16x16x128_F8F6F4) + S(V_MFMA_F32_32x32x64_F8F6F4) S(V_MFMA_SCALE_F32_32x32x64_F8F6F4) + S(V_MFMA_F32_16x16x16_F16) S(V_MFMA_F32_32x32x8_F16) + S(V_MFMA_F32_16x16x4_F32) S(V_MFMA_F32_32x32x1_F32) S(V_MFMA_F32_32x32x2_F32) + S(V_MFMA_F32_4x4x1_F32) S(V_MFMA_F32_16x16x1_F32) + S(V_MFMA_F32_32x32x4_F16) S(V_MFMA_F32_16x16x4_F16) S(V_MFMA_F32_4x4x4_F16) + S(V_MFMA_I32_16x16x32_I8) S(V_MFMA_I32_32x32x16_I8) + S(V_MFMA_I32_32x32x4_I8) S(V_MFMA_I32_16x16x4_I8) S(V_MFMA_I32_4x4x4_I8) + S(V_MFMA_F32_16x16x8_XF32) S(V_MFMA_F32_32x32x4_XF32) + S(V_MFMA_F32_32x32x2_BF16) S(V_MFMA_F32_16x16x2_BF16) S(V_MFMA_F32_4x4x2_BF16) + S(V_MFMA_F32_16x16x16_BF16_1K) S(V_MFMA_F32_32x32x8_BF16_1K) + S(V_MFMA_F32_16x16x32_BF16) S(V_MFMA_F32_32x32x16_BF16) + S(V_MFMA_F32_16x16x32_F16) + S(V_MFMA_F32_16x16x32_FP8_FP8) 
S(V_MFMA_F32_16x16x32_FP8_BF8)
+    S(V_MFMA_F32_16x16x32_BF8_FP8) S(V_MFMA_F32_16x16x32_BF8_BF8)
+    S(V_MFMA_F32_32x32x16_FP8_FP8) S(V_MFMA_F32_32x32x16_FP8_BF8)
+    S(V_MFMA_F32_32x32x16_BF8_FP8) S(V_MFMA_F32_32x32x16_BF8_BF8)
+    // WMMA
+    S(V_WMMA_F32_16x16x32_F16) S(V_WMMA_F32_16x16x32_BF16)
+    S(V_WMMA_F32_16x16x4_F32)
+    S(V_WMMA_F32_16x16x64_FP8_FP8) S(V_WMMA_F32_16x16x64_FP8_BF8)
+    S(V_WMMA_F32_16x16x64_BF8_FP8) S(V_WMMA_F32_16x16x64_BF8_BF8)
+    S(V_WMMA_I32_16x16x64_IU8)
+    S(V_WMMA_SCALE_F32_16x16x128_F8F6F4)
+    // VOPD
+    S(VOPD_GENERIC)
+    // VIMAGE TENSOR (gfx1250-only)
+    S(TENSOR_LOAD_TO_LDS) S(TENSOR_STORE_FROM_LDS)
+    // FLAT async global → LDS (gfx1250-only)
+    S(GLOBAL_LOAD_ASYNC_TO_LDS_B8) S(GLOBAL_LOAD_ASYNC_TO_LDS_B32)
+    S(GLOBAL_LOAD_ASYNC_TO_LDS_B64) S(GLOBAL_LOAD_ASYNC_TO_LDS_B128)
+    // FLAT VMEM prefetch (gfx1250-only, hint-class)
+    S(GLOBAL_PREFETCH_B8)
+    // AGPR
+    S(V_ACCVGPR_READ_B32) S(V_ACCVGPR_WRITE_B32)
+
+  case CanonicalOp::CanonicalOp_COUNT: return "";
+  }
+#undef S
+  return "";
+}
+
+} // namespace COMGR::hotswap
diff --git a/amd/comgr/src/hotswap/canonical_op.h b/amd/comgr/src/hotswap/canonical_op.h
new file mode 100644
index 0000000000000..9025a8745b463
--- /dev/null
+++ b/amd/comgr/src/hotswap/canonical_op.h
@@ -0,0 +1,1169 @@
+//===- canonical_op.h - Hotswap transpiler --------------------------------===//
+//
+// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See
+// amd/comgr/LICENSE.TXT in this repository for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HOTSWAP_TRANSPILER_CANONICAL_OP_H
+#define HOTSWAP_TRANSPILER_CANONICAL_OP_H
+
+#include <cstdint>
+
+namespace COMGR::hotswap {
+
+// Architecture-neutral instruction identity used for dispatch in the raiser.
+// Each entry maps to one or more MC opcodes via OpcodeMap.
+enum class CanonicalOp : uint16_t {
+  Unknown = 0,
+
+  // -- SOPP / control flow --
+  S_ENDPGM, S_NOP, S_BRANCH, S_CODE_END,
+  S_CBRANCH_SCC0, S_CBRANCH_SCC1,
+  S_CBRANCH_VCCZ, S_CBRANCH_VCCNZ,
+  S_CBRANCH_EXECZ, S_CBRANCH_EXECNZ,
+  S_WAITCNT, S_WAIT_LOADCNT, S_WAIT_KMCNT, S_WAIT_DSCNT, S_WAIT_XCNT,
+  // gfx1250 async-memory wait counters. `S_WAIT_ASYNCCNT` is the
+  // companion barrier for the `GLOBAL_LOAD_ASYNC_TO_LDS_B*` family
+  // below (and `DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64`); `S_WAIT_TENSORCNT`
+  // is the companion for `TENSOR_LOAD_TO_LDS` / `TENSOR_STORE_FROM_LDS`.
+  // Both track dependency counters that do not exist on gfx942 (no
+  // `ASYNCcnt` / `TENSORcnt` hardware) — the raiser lowers them as
+  // no-ops on every target:
+  //
+  //   * On gfx942 (cross-target): the source async DMA is emulated
+  //     as a synchronous `load`+`store` chain that has already
+  //     completed by the time the wait is reached. IR dataflow from
+  //     the emulated `store` through subsequent LDS reads carries
+  //     the happens-before; the backend re-inserts an `s_waitcnt
+  //     lgkmcnt(0)` before the reader. See the
+  //     `GLOBAL_LOAD_ASYNC_TO_LDS_B*` CanonicalOp doc block for the full
+  //     trade-off argument.
+  //   * On gfx1250 (same-target): like every other wait counter in
+  //     `handle_sopp.cpp`, the raiser relies on LLVM's memory model
+  //     to re-emit the native wait from the IR's load/store
+  //     ordering. 
The async intrinsic's + // `IntrInaccessibleMemOrArgMemOnly` annotation prevents + // reorder across the wait site, so the backend re-derives the + // correct `s_wait_asynccnt` / `s_wait_tensorcnt` from that + // scheduling constraint. + // + // Declared here (rather than inlined as a generic SOPP no-op) so + // that the opcode_map canonicalisation is explicit and a future + // reviewer touching the async family can grep + // `S_WAIT_ASYNCCNT` and find both the CanonicalOp, its opcode_map + // entry, and the handler's no-op arm in one pass. + S_WAIT_ASYNCCNT, S_WAIT_TENSORCNT, + S_WAIT_LOADCNT_DSCNT, S_WAIT_ALU, + S_CLAUSE, S_DELAY_ALU, S_SET_GPR_IDX_ON, S_SET_GPR_IDX_OFF, S_SETVSKIP, + // Barriers. GFX12+ splits s_barrier into signal + wait; earlier ISAs emit a + // single s_barrier. Handlers model signal as a no-op and wait as a full + // LLVM `amdgcn.s.barrier` call. + S_BARRIER, S_BARRIER_WAIT, S_BARRIER_SIGNAL, + + // -- SMEM -- + S_LOAD_B32, S_LOAD_B64, S_LOAD_B96, S_LOAD_B128, S_LOAD_B256, S_LOAD_B512, + // gfx12+ scalar narrow loads: fetch 1 or 2 bytes from a uniform address and + // zero/sign-extend into a 32-bit SGPR. Older ISAs have no equivalent; on a + // cross-target lift to gfx942 the backend will lower the narrow `load iN` + // to VMEM (per-lane global_load_{ubyte,sbyte,ushort,sshort}) — semantically + // correct but uniformity-lossy. See handle_smem.cpp for the design notes. + S_LOAD_U8, S_LOAD_I8, S_LOAD_U16, S_LOAD_I16, + S_STORE_B32, S_STORE_B64, S_STORE_B128, + + // -- SOPC -- + S_CMP_EQ_U32, S_CMP_LG_U32, S_CMP_GT_U32, S_CMP_GE_U32, + S_CMP_LT_U32, S_CMP_LE_U32, + // gfx8+ 64-bit unsigned scalar compares (SOPC_CMP_64). Only EQ and + // LG (not equal) are defined in SOPInstructions.td; there are no + // ordered/strict 64-bit SOPC compares on any AMDGPU generation + // because the .td record `SOPC_CMP_64` is reserved for these two. + S_CMP_EQ_U64, S_CMP_LG_U64, + S_CMP_EQ_I32, S_CMP_LG_I32, S_CMP_GT_I32, S_CMP_GE_I32, + S_CMP_LT_I32, S_CMP_LE_I32, + S_CMP_EQ_F32, S_CMP_LG_F32, S_CMP_GT_F32, S_CMP_GE_F32, + S_CMP_LT_F32, S_CMP_LE_F32, S_CMP_NEQ_F32, + S_CMP_NGT_F32, S_CMP_NGE_F32, S_CMP_NLT_F32, S_CMP_NLE_F32, S_CMP_NLG_F32, + S_CMP_EQ_F16, S_CMP_LG_F16, S_CMP_GT_F16, S_CMP_GE_F16, + S_CMP_LT_F16, S_CMP_LE_F16, S_CMP_NEQ_F16, + S_CMP_NGT_F16, S_CMP_NGE_F16, S_CMP_NLT_F16, S_CMP_NLE_F16, S_CMP_NLG_F16, + + // -- SOPK -- + S_MOVK_I32, S_ADDK_I32, S_MULK_I32, + S_CMPK_GE_I32, S_CMPK_GT_I32, S_CMPK_LE_I32, S_CMPK_LT_I32, + S_CMPK_GE_U32, S_CMPK_GT_U32, S_CMPK_LE_U32, S_CMPK_LT_U32, + S_CMPK_EQ_I32, S_CMPK_EQ_U32, S_CMPK_LG_I32, S_CMPK_LG_U32, + S_GETREG_B32, S_SETREG_B32, S_SETREG_IMM32_B32, + + // -- SOP1 -- + S_MOV_B32, S_MOV_B64, S_NOT_B32, S_NOT_B64, + S_BREV_B32, S_FF1_I32_B32, S_FF1_I32_B64, + // s_ff0_i32_b{32,64}: find first 0 bit (lowest position), returning + // -1 when the source is all-ones. SOPInstructions.td:278-279 (no + // LLVM ISel pattern is provided, so the instruction is only emitted + // by hand-written asm / inline-asm — but the corpus contains it). + // Lowers to `cttz(~src, is_zero_poison=false)` with a `cmov` to -1 + // on the all-ones input path, mirroring the V_FFBL_B32 / V_FFBH_U32 + // shape (the AMDGPU instruction returns 0xFFFFFFFF in the no-bit + // case rather than the LLVM intrinsic's bitwidth-wide return). + S_FF0_I32_B32, S_FF0_I32_B64, + S_FLBIT_I32_B32, S_FLBIT_I32_B64, + // s_flbit_i32 / s_flbit_i32_i64: signed find-leading-bit-not-equal-to- + // sign-bit. 
Lowers to llvm.amdgcn.sffbh, the dedicated AMDGPU + // intrinsic that selects directly back to v_ffbh_i32_e32 (or its + // i64-split lowering for the 64-bit variant). See + // SOPInstructions.td:296-298 / VOP1Instructions.td:373. + S_FLBIT_I32, S_FLBIT_I32_I64, + S_SEXT_I32_I8, S_SEXT_I32_I16, + S_CVT_F32_U32, S_CVT_F32_I32, S_CVT_U32_F32, S_CVT_I32_F32, + S_AND_SAVEEXEC_B32, S_OR_SAVEEXEC_B32, S_XOR_SAVEEXEC_B32, + S_ANDN2_SAVEEXEC_B32, S_ORN2_SAVEEXEC_B32, + S_GETPC_B64, + // SOP1 indirect set-PC. gfx1250 asm rename for `S_SETPC_B64` + // (SOPInstructions.td:323 declares `isBranch + isIndirectBranch`, + // line 2208 renames the asm string to `s_set_pc_i64`). The source + // SGPR pair holds an absolute 64-bit PC value. In our IR-on-LLVM + // setting we model three principled lowerings (see setpc_analysis.{hpp, + // cpp} for the static analysis that classifies each site): + // DirectA — statically resolvable intra-kernel branch (the + // source SGPR pair was produced by a local + // `s_get_pc_i64 + s_add_co_u32 + s_add_co_ci_u32` + // chain). Lowers to `br label %BB_target` since + // the target is a known intra-function label. + // IndirectB — subroutine return via an SGPR pair stashed at the + // call site (the canonical s[30:31] return-PC + // idiom). Lowers to a `cmp eq + br` cascade (via + // `emitEnumeratedDispatch` in handle_sop1.cpp) + // enumerating the resolved return targets and + // terminating in an `unreachable` trap BB. The + // corresponding call-site + // `s_get_pc_i64 + s_add*` chains are rewritten by + // the raiser to write the plain i64 marker + // `resolvedReturnAddr` (the source-MC byte offset + // of the intended return BB) into the ret-pair + // (via a post-handler hook in raiser.cpp), so + // each cascade `icmp eq i64 %marker, ` + // folds across the phi join under mem2reg + SCCP + // + InstCombine and SimplifyCFG collapses the + // cmp+br to a direct branch — the same final + // codegen as a fully-folded `indirectbr` would + // produce. See `emitEnumeratedDispatch`'s + // rationale block for why a cascade (LLVM's + // FixIrreducible pass only handles br-flavoured + // predecessors of an irreducible cycle header) + // and why an integer marker rather than + // `ptrtoint(blockaddress)` (AMDGPU ISel has no + // pattern to materialise a `BlockAddress` as an + // i64 register value). + // DispatchSet — multi-target dispatch via inter-block PC-chain + // dataflow: each predecessor block writes a + // different chain target into the same SGPR pair, + // then a join block consumes it through + // `s_set_pc_i64`. The dataflow in setpc_analysis + // enumerates the bounded set of targets reaching + // the use site through distinct CFG paths. Lowers + // to the same enumerated-dispatch cascade as + // IndirectB. Same chain-terminator hook as + // IndirectB writes the per-predecessor i64 marker + // (the callee's source-MC byte offset) on each + // contributing predecessor path so each cascade + // cmp folds to a constant branch after SCCP. + // Sites the analysis cannot resolve (incomplete dataflow, + // unbounded fan-in past kMaxDispatchTargets, or pair killed by an + // unmodelled write before the use site) refuse loudly via + // RaiseFailure::unsupportedShape — never silently emit a stub. + S_SET_PC_I64, + // SOP1 branch-and-link. gfx1250 asm rename for `S_SWAPPC_B64` + // (SOPInstructions.td:336 declares `isCall = 1`, line 2311 renames + // the asm string to `s_swap_pc_i64`). Operands: + // sdst = sX:X+1 receives the return PC (i.e. 
the absolute kernel + // offset of the instruction immediately following the + // swap, swap.offset + swap.size). + // ssrc = sY:Y+1 holds the absolute call target PC. + // PC <- ssrc; sdst <- (return-PC) (atomically) + // + // Three principled raisings, mirroring S_SET_PC_I64: + // DirectA — call target ssrc was produced by a local + // `s_get_pc_i64 + s_add_co_u32 + s_add_co_ci_u32` + // chain that resolves intra-block. Lowering writes + // the return-address marker (the plain i64 + // source-MC byte offset of swap.offset+swap.size) + // into sdst and emits `br label %BB_callee`. + // DispatchSet — call target reached via inter-block PC-chain + // dataflow (the tensilelite "activation function + // dispatcher" shape: each predecessor block + // computes a distinct callee target into the same + // pair via its own getpc+add chain, then a join + // block executes `s_swap_pc_i64`). Lowering writes + // the return-address marker into sdst as in + // DirectA, then emits a `cmp eq + br` cascade + // (via `emitEnumeratedDispatch` in + // handle_sop1.cpp) over the enumerated callee + // targets, terminating in an `unreachable` trap + // BB. The chain-terminator hook in raiser.cpp + // rewrites ssrc to hold the callee's i64 marker + // (source-MC byte offset) on every contributing + // predecessor path so each cascade cmp folds to + // a constant branch after SCCP. See + // `emitEnumeratedDispatch`'s rationale block for + // why a cascade (FixIrreducible compatibility + // under irreducible CFGs — the dominant shape + // this pattern produces) and why an integer + // marker rather than `ptrtoint(blockaddress)` + // (AMDGPU ISel cannot materialise a + // `BlockAddress` as an i64). + // Unresolvable — call target cannot be statically enumerated + // (incomplete dataflow, fan-in past + // kMaxDispatchTargets, or runtime-derived value). + // Refuse loudly via RaiseFailure::unsupportedShape + // — never emit a stub branch. + // + // The analysis never produces IndirectB for a swap_pc site (a + // swap_pc's source pair is the call target, not a return slot; + // IndirectB describes the return-side use of such a pair). + // + // Independent of the call-target classification, the analysis + // registers a synthetic chain-terminator at the swap site itself + // (key = swap.offset, value = {sdst-low-reg, swap.offset+swap.size}) + // so any downstream IndirectB `s_set_pc_i64` reading sdst + // enumerates the swap's return offset as one of its cascade + // targets. + S_SWAP_PC_I64, + S_ABS_I32, + S_SET_VGPR_MSB, + // Read-modify-write bit set/clear on an SGPR. Tied src keeps the + // un-touched bits of the destination register alive across the op. + // B64 variants index into 64 bits (bit index is still an SReg_32). + S_BITSET0_B32, S_BITSET1_B32, + S_BITSET0_B64, S_BITSET1_B64, + // SOPC bit-test family (SOPInstructions.td:1411-1414; gfx6+ on every + // AMDGPU generation, so fully cross-target viable). 
Tests a single + // bit of src0 selected by src1 (src1's lower 5 bits for _B32, lower + // 6 bits for _B64) and writes the result into SCC: + // S_BITCMP0_B32 SCC = (src0 & (1u << (src1 & 0x1F))) == 0 + // S_BITCMP1_B32 SCC = (src0 & (1u << (src1 & 0x1F))) != 0 + // S_BITCMP0_B64 SCC = (src0 & (1ull << (src1 & 0x3F))) == 0 + // S_BITCMP1_B64 SCC = (src0 & (1ull << (src1 & 0x3F))) != 0 + // For _B64, src0 is a 64-bit SGPR pair; src1 remains a 32-bit SReg + // whose high 26/27 bits are ignored by the hardware (we apply the + // mask in IR to preserve that invariant exactly instead of relying + // on undef-width behaviour). The handler lives in handle_sopc.cpp + // next to the SOPC compares it mirrors. + S_BITCMP0_B32, S_BITCMP1_B32, + S_BITCMP0_B64, S_BITCMP1_B64, + // Conditional move on SCC. `if (SCC) sdst = src; else sdst stays + // unchanged.` The dst-on-SCC=0 read-modify is NOT modeled by LLVM + // as a tied sdst_in operand on the MCInst (SOP1_32/SOP1_64 just + // declares `(outs sdst), (ins src0)`), so the handler must + // explicitly read the prior dst value via + // `ctx.regs.readReg{32,64}(op.dst())`. SCC is read but not + // written. + S_CMOV_B32, S_CMOV_B64, + + // -- SOP2 -- + // `S_ADD_U64` used to live here as a second CanonicalOp alongside the + // gfx12-renamed `S_ADD_NC_U64` below, created by the same commit + // that first added the `s_add_u64` opcode-map row. The row in + // `opcode_map.cpp` was later replaced with the gfx12-renamed + // `CanonicalOp::S_ADD_NC_U64` target (matching S_SUB_U64 → S_SUB_NC_U64), + // but the old `CanonicalOp::S_ADD_U64` enum entry + a defensive `||` + // branch in `handle_sop2.cpp` + a stray duplicate opcode-map row + // were left behind. `canonToSem.try_emplace` (opcode_map.cpp:1506 + // keeps-first) silently routed lifts through the stale enum value; + // the handler's `||` disjunct masked the difference. See + // `opcode_map.cpp`'s S_ADD_U64 block comment for the full audit. + // `CanonicalOp::S_ADD_NC_U64` below is now the ONLY CanonicalOp for LLVM's + // `S_ADD_U64` pseudo. + S_ADD_U32, S_ADDC_U32, S_SUB_U32, S_SUBB_U32, + S_AND_B32, S_AND_B64, S_OR_B32, S_OR_B64, S_XOR_B32, S_XOR_B64, + S_ANDN2_B32, S_ANDN2_B64, S_ORN2_B32, S_ORN2_B64, + // SOP2 negated bitops (gfx7+). SOPInstructions.td:789-803 — each + // computes `dst = ~(src0 OP src1)` and sets SCC = (result != 0). These + // are produced heavily by triton/tensilelite when constant-folding + // bitfield masks (e.g. `s_nand_b32 sX, sY, 0xffff` to clear the low + // 16 bits). All can target EXEC, so they must be marked + // routesExecThroughStoreExec. + S_NAND_B32, S_NAND_B64, S_NOR_B32, S_NOR_B64, S_XNOR_B32, S_XNOR_B64, + // SOP2 absolute-difference (gfx7+). SOPInstructions.td:886-888 — + // `dst = |src0 - src1|` on signed i32, SCC = (result != 0). Lower + // through llvm.abs.i32 with is_int_min_poison=false: hardware wraps + // for INT_MIN (the only value whose negation equals itself), so we + // mustn't poison there. Heavily used by tensilelite for stride math. + S_ABSDIFF_I32, + S_LSHL_B32, S_LSHL_B64, S_LSHR_B32, S_LSHR_B64, S_ASHR_I32, S_ASHR_I64, + S_MUL_I32, S_MUL_HI_U32, S_MUL_HI_I32, S_MUL_U64, S_MUL_F32, S_ADD_F32, S_SUB_F32, + // gfx11+ scalar fused multiply-accumulate. SOP2 encodes only two explicit + // sources; OPF_DACCUM ties the old destination value as the third operand: + // sdst.f32 = fma(ssrc0.f32, ssrc1.f32, old sdst.f32) + S_FMAC_F32, + // Scalar IEEE-754-2019 maximumNumber/minimumNumber. 
LLVM's canonical pseudo + // is `S_{MAX,MIN}_F32`; gfx12+ manuals name the real mnemonics + // `s_{max,min}_num_f32` and keep `s_{max,min}_f32` as compatibility aliases. + // Semantics favor a numeric operand over NaN (including signaling NaN after + // raising invalid) and order signed zeros (+0 > -0 for max, -0 < +0 for min), + // matching LLVM's `maximumnum` / `minimumnum` intrinsic contract without + // fast-math flags. + S_MAX_NUM_F32, S_MIN_NUM_F32, + S_BFE_U32, S_BFE_I32, S_BFM_B32, S_BFM_B64, + S_CSELECT_B32, S_CSELECT_B64, + S_MIN_I32, S_MIN_U32, S_MAX_I32, S_MAX_U32, + S_PACK_LL_B32_B16, S_PACK_LH_B32_B16, + S_LSHL1_ADD_U32, S_LSHL2_ADD_U32, S_LSHL3_ADD_U32, S_LSHL4_ADD_U32, + S_ADD_NC_U64, S_SUB_NC_U64, + + // -- VOP1 -- + V_MOV_B32, V_MOV_B64, V_NOP, V_NOT_B32, V_BFREV_B32, + V_SWAP_B32, + V_CVT_F32_I32, V_CVT_F32_U32, V_CVT_I32_F32, V_CVT_U32_F32, + V_CVT_F16_F32, V_CVT_F32_F16, V_CVT_F32_BF16, + V_CVT_F32_UBYTE0, V_CVT_F32_UBYTE1, V_CVT_F32_UBYTE2, V_CVT_F32_UBYTE3, + V_CVT_F64_U32, V_CVT_F64_I32, V_CVT_U32_F64, + V_RCP_IFLAG_F32, V_RCP_F32, V_RSQ_F32, V_SQRT_F32, V_EXP_F32, V_LOG_F32, + // gfx12+ VOP3 pseudo-scalar f32 transcendentals: scalar input and scalar + // output variants of the corresponding VOP1 special-function instructions. + // The default clamp=0/omod=0 forms lower through AMDGPU hardware intrinsics; + // non-default output modifiers are refused until modeled exactly. + V_S_EXP_F32, V_S_LOG_F32, V_S_RCP_F32, V_S_RSQ_F32, V_S_SQRT_F32, + V_LDEXP_F32, + V_FLOOR_F32, V_CEIL_F32, V_TRUNC_F32, V_FRACT_F32, + V_READFIRSTLANE_B32, + // VOP1 packed FP8/BF8 → 2x F32 expansion (VOP1Instructions.td:652- + // 653, profile VOPProfileCVT_PK_F32_F8). Reads 16 bits of the i32 + // src — the low half (bytes 0,1) when op_sel:[0,*] / SDWA WORD_0, + // the high half (bytes 2,3) when op_sel:[1,*] / SDWA WORD_1 — and + // expands the two FP8/BF8 lanes into a v2f32 written to a VGPR + // pair starting at vdst. FP8 is the OCP E4M3FN format; BF8 is the + // OCP E5M2 format. Lowering selects the matching + // `llvm.amdgcn.cvt.pk.f32.{fp8,bf8}(i32 src, i1 word_sel)` + // intrinsic and bitcasts the v2f32 result to i64 before + // writeReg64. The op_sel-based word selector is parsed from the + // disassembly text exactly as in V_ADD_NC_U16 / V_FMA_MIX_F32 (no + // first-class "modifier" channel exists in our OperandView yet); + // unparseable / out-of-range selectors fall through to word_sel=0 + // — never silently corrupted, the parser invariant is the same as + // for the other op_sel handlers. The reverse direction + // (V_CVT_PK_FP8_F32 / V_CVT_PK_BF8_F32) lives in the VOP3 block + // below; this is the read-side companion. + V_CVT_PK_F32_FP8, V_CVT_PK_F32_BF8, + // VOP1 single-lane FP8/BF8 → F32 expansion (VOP1Instructions.td:650- + // 651, profile VOPProfileCVT_F32_F8). Reads ONE 8-bit lane of the + // i32 src — selected by SDWA src0_sel / e64 op_sel byte_sel — and + // produces an f32. The SDWA encoding can pick any of the four bytes + // (0..3); the e64 encoding's default (no op_sel printed) is byte 0 + // and is the only shape the gfx1250 corpus emits today (the LLVM + // isel pattern in VOP1Instructions.td:670-680 maps non-zero + // byte_sel through the SDWA pseudo, which we have not yet wired — + // adding it would only widen this handler, not change its shape). + // Lowering selects `llvm.amdgcn.cvt.f32.{fp8,bf8}(i32 src, i32 + // byte_sel)` and writeReg32 the result. 
SDWA / op_sel-bearing + // encodings refuse loudly via RaiseFailure::unsupportedShape so a + // future corpus drift surfaces immediately rather than silently + // collapsing to byte 0. + V_CVT_F32_FP8, V_CVT_F32_BF8, + // VOP3 scaled packed-8 FP4 → BF16 conversion (gfx1250 only, + // VOP3Instructions.td:1788; LLVM opcode V_CVT_SCALE_PK8_BF16_FP4_e64, + // real form `..._gfx1250`). Reads 1 VGPR of packed 8xFP4 (4 bits + // each, 32 bits total) plus an E8M0-encoded scale value and a + // byte-granularity `scale_sel` immediate, and writes an 8xBF16 + // result across 4 consecutive VGPRs. Lowers to + // `<8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 src, + // i32 scale, + // i32 immarg sel)` + // which is declared inside the gfx1250-only block of + // IntrinsicsAMDGPU.td (AMDGPUCvtScaleIntrinsic w/ isGFX125xOnly) — + // the handler therefore gates same-target lift on + // `ctx.targetIsa.hasTensorOps` (FeatureGFX1250Insts) and refuses + // cross-target lift to gfx942 loudly (no MX-FP4 scaling unit on + // CDNA3; a manual per-nibble dequant expansion would be a separate + // design). + // + // Sibling variants (V_CVT_SCALE_PK8_{F16,F32}_{FP4,FP8,BF8} / + // V_CVT_SCALE_PK8_BF16_{FP8,BF8}) share the same operand shape and + // the same same-target-only constraint. Only the BF16_FP4 form is + // exercised by the current kerneldex corpus (scope_discovery + // `_matmul_ogs_NNT_bf16xbf16xmxfp4_32x256x128x1`); adding a sibling + // is a two-line change (new CanonicalOp + new entry in the handler's + // variant-classifier table) and intentionally deferred until a + // corpus kernel exercises it. + V_CVT_SCALE_PK8_BF16_FP4, + // VOP1 find-first-bit family (gfx7+, VOP1Instructions.td:371-373). + // V_FFBH_U32 -> AMDGPUffbh_u32 = ctlz_zero_undef but returns -1 on + // input 0; lower with llvm.ctlz(x, false) — LLVM + // returns the bitwidth (32) for input 0, so we cmov + // to -1 explicitly to match hardware. + // V_FFBL_B32 -> AMDGPUffbl_b32 = cttz_zero_undef but returns -1 on + // input 0; same pattern with llvm.cttz. + // V_FFBH_I32 -> AMDGPUffbh_i32 = position of highest non-sign bit; + // returns -1 for input 0 or -1 (uniform sign). Lower + // via the dedicated llvm.amdgcn.sffbh intrinsic which + // selects directly back to v_ffbh_i32_e32. + V_FFBH_U32, V_FFBL_B32, V_FFBH_I32, + + // -- VOP2 / VOP3 -- + V_ADD_F32, V_SUB_F32, V_SUBREV_F32, V_MUL_F32, + V_FMAC_F32, V_FMA_F32, V_FMAMK_F32, V_FMAAK_F32, + V_MAX_F32, V_MIN_F32, + V_ADD_NC_U32, V_SUB_NC_U32, V_SUBREV_NC_U32, + V_ADD_CO_U32, V_ADD_CO_CI_U32, + V_SUB_CO_U32, V_SUBREV_CO_U32, V_SUB_CO_CI_U32, V_SUBREV_CO_CI_U32, + V_AND_B32, V_OR_B32, V_XOR_B32, V_XNOR_B32, + V_LSHLREV_B32, V_LSHRREV_B32, V_ASHRREV_I32, + V_CNDMASK_B32, + V_MUL_LO_U32, V_MUL_HI_U32, V_MUL_HI_I32, + V_MUL_I32_I24, V_MUL_U32_U24, V_MUL_HI_U32_U24, V_MUL_HI_I32_I24, + V_MAD_U32_U24, V_MAD_U32, + V_ADD3_U32, V_LSHL_ADD_U32, V_ADD_LSHL_U32, V_LSHL_OR_B32, V_AND_OR_B32, V_OR3_B32, V_XAD_U32, + // VOP3 funnel-shift right: dst = ((src0:src1) >> src2[4:0])[31:0]. + // .td uses the SDAG `fshr` node directly (VOP3Instructions.td:222), + // which maps to `llvm.fshr.i32` in IR. src2 is masked to 5 bits + // by hardware before the shift. + V_ALIGNBIT_B32, + // VOP3 ternary xor — gfx10+ only (VOP3Instructions.td:1348), + // .td has no SDAG `umin3`-style node, the iselect pattern at + // line 1350 directly matches `(xor (xor a, b), c)`. Lift is the + // same shape as V_OR3_B32 above. + V_XOR3_B32, + // VOP3 16-bit no-carry add — gfx10+ (VOP3Instructions.td:1362). 
+ // Op_sel routes 16-bit halves of src0/src1 (lo or hi) and + // selects which half of the 32-bit dst register receives the + // result; the unselected half of dst is preserved per the + // RDNA3+ ISA. The handler must read the prior dst value when + // dst op_sel is set so the preserved half survives the + // read-modify-write. + V_ADD_NC_U16, + V_BFE_U32, V_BFE_I32, V_BFI_B32, V_PERM_B32, + V_MBCNT_LO_U32_B32, V_MBCNT_HI_U32_B32, + V_READLANE_B32, V_WRITELANE_B32, + V_MED3_F32, V_MAX3_F32, V_MIN3_F32, V_MAX3_NUM_F32, + // VOP3 IEEE-2019 ternary clamp `minnum(maxnum(s0, s1), s2)`. + // gfx12 renamed gfx11's V_MINMAX_F32 (.td:1485, opcode 0x25f) + // to V_MINMAX_NUM_F32 (.td:1696, opcode 0x268) when the .NUM + // suffix was introduced to disambiguate from the IEEE-754 + // 2019 V_MINIMUMMAXIMUM_F32 (NaN-propagating, opcode 0x26c). + // The opcode_map collapses both real names onto this CanonicalOp. + V_MINMAX_NUM_F32, + // VOP3 integer 3-way max/min/median. The .td uses + // AMDGPU{u,s}{max,min,med}3 SDAG nodes which the backend pattern- + // matches; we lift them as the natural 2-step ICmp+Select chain + // (no LLVM `*3` IR intrinsic exists). gfx11/gfx12 keep these + // (VOP3Instructions.td:1792-1798). + V_MAX3_U32, + // VOP3 signed-integer median-of-three. Hardware semantic + // (VOP3Instructions.td:1796 via AMDGPUsmed3 SDAG node): + // med3_i32(a, b, c) = smax(smin(a, b), smin(smax(a, b), c)) + // i.e. the middle of three signed i32 values. We lift it as a + // pair of `llvm.smin`/`llvm.smax` intrinsics (matching the + // `handle_vopd.cpp` style that already uses these intrinsics for + // VOPD smin/smax/umin/umax pairs). The backend's + // `AMDGPUISelDAGToDAG`/`AMDGPUISelLowering` pattern-matches the + // `smax(smin(...), smin(smax(...), ...))` shape back to + // V_MED3_I32, so the round-trip is structure-preserving and the + // generated assembly recovers the original instruction without + // codegen quality loss. + V_MED3_I32, + V_MAX_NUM_F32, V_MIN_NUM_F32, + // IEEE-754 2019 maximum/minimum: propagate NaN (distinct from maxnum/minnum). + V_MAXIMUM_F32, V_MINIMUM_F32, + V_DIV_FIXUP_F32, V_DIV_FMAS_F32, V_DIV_SCALE_F32, + // Mixed-precision FMA, VOP3P (VOP3PInstructions.td:109). Both + // variants take three sources and reduce to + // fma(cvt_f32(src0_part), cvt_f32(src1_part), cvt_f32(src2_part)) + // where `*_part` is selected by the per-source op_sel / op_sel_hi + // modifiers: + // op_sel_hi[i]==0 -> source i is the full f32 VGPR + // op_sel_hi[i]==1 -> source i is the 16-bit lo (op_sel[i]==0) or + // hi (op_sel[i]==1) half, interpreted as the + // mnemonic's narrow type + // V_FMA_MIX_F32 : narrow type = f16 (all gfx targets) + // V_FMA_MIX_F32_BF16 : narrow type = bf16 (gfx9.5+/gfx1250; the bf16 + // narrow half → f32 extension is + // cross-target-universal via `fpext bfloat to + // float`, so no refusal is needed on gfx942) + // Both CanonicalOps share the op_sel/op_sel_hi parser and write-back shape + // in handle_valu_vop3p.cpp; only the narrow element type differs. + V_FMA_MIX_F32, V_FMA_MIX_F32_BF16, + // VOP3P BF16 destination mixed FMA (gfx1250 + // VOP3PInstructions.td:464): compute + // fptrunc_bf16(fma(cvt_f32(src0_part), + // cvt_f32(src1_part), + // cvt_f32(src2_part))) + // and write the rounded BF16 result into the low 16 bits of `vdst`. + // The high 16 bits are the tied-output input (`vdst_in`) and must be + // preserved explicitly in IR; this is not a plain 32-bit f32 write. 
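+  // A minimal sketch of the lift shape (value names illustrative; %s0..%s2
+  // stand for the f32 values produced by the per-source op_sel conversions
+  // described above):
+  //   %acc = call float @llvm.fma.f32(float %s0, float %s1, float %s2)
+  //   %lo  = fptrunc float %acc to bfloat
+  // %lo is then merged into bits [15:0] of `vdst_in`, with bits [31:16]
+  // preserved, before the 32-bit write-back.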
+ V_FMA_MIXLO_BF16, + V_ADD_F16, V_MUL_F16, V_SUB_F16, V_SUBREV_F16, V_MAC_F16, V_FMAC_F16, + // VOP2 F16 multiply-add-with-literal pseudos (mirror of + // V_FMAMK_F32 / V_FMAAK_F32 for the f16 lane). Defined in + // VOP2Instructions.td:1206-1210 — both take a 16-bit constant K + // alongside two F16 sources and lower to llvm.fma.f16: + // v_madmk_f16 dst, src0, K, src2 -> dst = src0 * K + src2 + // v_madak_f16 dst, src0, src1, K -> dst = src0 * src1 + K + // Note: hardware uses the legacy "mad" name, but the lowered + // semantics are fused-multiply-add (no rounding of the intermediate + // product), matching the F32 FMAMK/FMAAK convention. + V_MADMK_F16, V_MADAK_F16, + V_MAX_F16, V_MIN_F16, V_LDEXP_F16, V_FLOOR_F16, V_CVT_F16_U16, V_CVT_U16_F16, + V_ASHRREV_I16, V_LSHRREV_B16, V_LSHLREV_B16, + V_MAX_U16, V_MIN_U16, V_MAX_I16, V_MIN_I16, + // 16-bit integer arith (gfx8+, VOP2Instructions.td). Plain i16 + // add/sub/subrev with wrapping overflow (no carry-out — distinct + // from the rarely-used v_add_co_u16). v_mul_lo_u16 returns the low + // 16 bits of the multiply, naturally produced by `mul i16`. + V_ADD_U16, V_SUB_U16, V_SUBREV_U16, V_MUL_LO_U16, + V_DOT2C_I32_I16, V_DOT4C_I32_I8, V_DOT8C_I32_I4, + V_PK_FMAC_F16, + V_PACK_B32_F16, + V_CVT_PK_BF16_F32, V_CVT_PK_BF8_F32, V_CVT_PK_FP8_F32, + V_CVT_PKRTZ_F16_F32, V_CVT_PK_F16_F32, + V_CVT_SCALEF32_PK_FP4_F32, + V_BFM_B32, + + // -- VOP2/VOP3 FP64 -- + V_ADD_F64, V_MUL_F64, V_FMA_F64, V_FMAC_F64, + // VOP1 FP64. v_rcp_f64 is a TRANS-class transcendental (see + // VOP1Instructions.td: `let TRANS = 1, SchedRW = [WriteTrans64]`), + // not a true reciprocal — hardware returns a ~26-bit accurate + // approximation that the LLVM `int_amdgcn_rcp` intrinsic models + // exactly. We deliberately lift to that intrinsic rather than to a + // generic `fdiv 1.0, x` because (a) gfx942 isels the intrinsic + // straight back to v_rcp_f64 (no Newton-Raphson refinement is + // emitted), and (b) `fdiv` would lower to a software divide + // sequence on gfx942 unless `arcp`/fast-math flags are set, which + // would be a silent semantics change versus the source op. + V_RCP_F64, + + V_MAX_U32, V_MIN_U32, V_MAX_I32, V_MIN_I32, + V_PERMLANE16_B32, V_PERMLANEX16_B32, V_PERMLANE64_B32, + V_PERMLANE16_SWAP_B32, V_PERMLANE32_SWAP_B32, + + // -- VOPC (V_CMP_* and V_CMPX_*) -- + // + // All ~100 V_CMP_*_{U,I,F}{16,32,64} and V_CMPX_*_{U,I,F}{16,32} pseudos + // collapse onto these two CanonicalOps; the actual {predicate, element type, + // width} triple is looked up from `VCmpMeta` keyed on the MC opcode. + // `V_CMP` writes an SGPR pair (or VCC, depending on the encoding). + // `V_CMPX` additionally ANDs the compare result into EXEC. + V_CMP, V_CMPX, + + // -- VOP3P -- + V_PK_ADD_F32, V_PK_MUL_F32, V_PK_FMA_F32, + V_PK_MAX_F32, V_PK_MIN_F32, V_PK_MOV_B32, + + // VOP3P packed-pair `<2 x i16>` int ops (gfx9+, available on both + // gfx942 and gfx1250 — same MC encoding family). Operand profile is + // VOP_V2I16_V2I16_V2I16: 32-bit dst / 32-bit src0 / 32-bit src1, each + // bitcast to `<2 x i16>` before the op and back to i32 for the + // write-back. Inline literals encode a packed `<2 x i16>` directly + // (lo i16 = bits[15:0], hi i16 = bits[31:16]) — NO broadcast + // analogue to the V_PK_F32 32-bit-element family, because the + // literal width matches the operand width here. 
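+  // A minimal sketch of that shape for the i16 add below, assuming the
+  // default op_sel packing (value names illustrative):
+  //   %a = bitcast i32 %src0 to <2 x i16>
+  //   %b = bitcast i32 %src1 to <2 x i16>
+  //   %r = add <2 x i16> %a, %b
+  //   %d = bitcast <2 x i16> %r to i32   ; written back to vdst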
+ // + // V_PK_ADD_U16: dst = src0 + src1 (lane-wise i16 add) + // V_PK_LSHLREV_B16: dst = src1 << (src0 & 15) (clshl_rev_16 + // SDAG: shift count is src0, value is src1, low 4 + // bits of the count select the shift amount per + // AMDGPU's hardware-clamp-to-element-width). + // + // op_sel / op_sel_hi modifiers select which i16 of each source feeds + // each output lane (defaults: op_sel=[0,0,0], op_sel_hi=[1,1,1] — + // natural lo->lo, hi->hi packing). + // + // Sibling V_PK_LSHRREV_B16 / V_PK_ASHRREV_I16 share the same handler + // shape (only the IR opcode differs: lshr / ashr); they are NOT + // enumerated here because the kerneldex corpus has zero producers + // for them today and adding them speculatively would violate the + // "no fallback / design what the corpus exercises" discipline. + V_PK_ADD_U16, V_PK_LSHLREV_B16, + + V_BITOP3_B32, V_BITOP3_B16, + + // GFX9 VOP3-only v_add/sub_i32 — plain add/sub when clamp=0, + // saddsat/ssubsat when clamp=1. + V_ADD_I32, V_SUB_I32, + + // -- 64-bit vector ops -- + V_LSHLREV_B64, + // gfx8+ VOP3 64-bit shifts. Same operand shape as V_LSHLREV_B64 + // (i64 dst, i32 shamt, i64 src1, reversed-operand convention: + // `dst = src1 >> shamt`). Lower to LLVM `lshr` (logical right) and + // `ashr` (arithmetic right) on the i64 src1, with the i32 shamt + // zext'd to i64 — the AMDGPU hardware masks the count to 6 bits so + // the LLVM behaviour matches as long as we feed a valid i32 (LLVM + // shifts >= bitwidth are poison, the hardware masks; we don't paper + // over the difference because corpus shifts always carry a finite + // immediate or a producer that already masks). + V_LSHRREV_B64, V_ASHRREV_I64, + V_LSHL_ADD_U64, V_ADD_NC_U64, V_SUB_NC_U64, + // gfx1250 VOP3 64-bit integer min/max. These are pure per-lane + // compare-and-select operations: signed forms use i64 ordering, unsigned + // forms use u64 ordering. They do not consult MODE and have no NaN, + // signed-zero, denorm/FTZ, or rounding behaviour. + V_MAX_I64, V_MAX_U64, V_MIN_I64, V_MIN_U64, + // gfx1250 VOP2 64-bit unsigned multiply (low 64 bits of s0 * s1). + V_MUL_U64, + V_MAD_U64_U32, V_MAD_CO_U64_U32, + // gfx1250 no-carry 64-bit multiply-add VOP3 opcodes (VOP3Only_Realtriple_gfx1250, + // VOP3Instructions.td:2129 / 2130: encodings 0x2fa / 0x2fb). Both widen + // two 32-bit sources into a 64-bit accumulator: + // V_MAD_NC_U64_U32: D.u64 = zext(S0.u32)*zext(S1.u32) + S2.u64 + // V_MAD_NC_I64_I32: D.i64 = sext(S0.i32)*sext(S1.i32) + S2.i64 + // Neither produces a carry/overflow output (hence the "nc" suffix). The + // backend's AMDGPUISelDAGToDAG.cpp::SelectMad64_32 pattern-matches the + // canonical `add(mul(zext/sext s0, zext/sext s1), s2_i64)` IR we emit + // back into v_mad_(nc|co|_i64_i32) on whichever target the raise writes + // to — identical to how V_MAD_U64_U32 lowers today (see handle_valu.cpp + // v_mad_u64_u32 arm and opcode_map.cpp's "LLVM no longer exposes a + // distinct carry-out variant" comment for historical context). 
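+  // Sketch of that emitted IR for the unsigned form (value names
+  // illustrative):
+  //   %a = zext i32 %s0 to i64
+  //   %b = zext i32 %s1 to i64
+  //   %m = mul i64 %a, %b
+  //   %d = add i64 %m, %s2   ; SelectMad64_32 folds this back into one mad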
+ V_MAD_NC_U64_U32, V_MAD_NC_I64_I32, + + // -- FLAT / GLOBAL / SCRATCH memory -- + FLAT_LOAD_UBYTE, FLAT_LOAD_SBYTE, FLAT_LOAD_USHORT, FLAT_LOAD_SSHORT, + FLAT_LOAD_DWORD, FLAT_LOAD_DWORDX2, FLAT_LOAD_DWORDX3, FLAT_LOAD_DWORDX4, + FLAT_STORE_BYTE, FLAT_STORE_SHORT, FLAT_STORE_SHORT_D16_HI, + FLAT_STORE_DWORD, FLAT_STORE_DWORDX2, FLAT_STORE_DWORDX3, FLAT_STORE_DWORDX4, + GLOBAL_LOAD_UBYTE, GLOBAL_LOAD_SBYTE, GLOBAL_LOAD_USHORT, GLOBAL_LOAD_SSHORT, + GLOBAL_LOAD_SHORT_D16_HI, + GLOBAL_LOAD_DWORD, GLOBAL_LOAD_DWORDX2, GLOBAL_LOAD_DWORDX3, GLOBAL_LOAD_DWORDX4, + GLOBAL_STORE_BYTE, GLOBAL_STORE_SHORT, GLOBAL_STORE_SHORT_D16_HI, + GLOBAL_STORE_DWORD, GLOBAL_STORE_DWORDX2, GLOBAL_STORE_DWORDX3, GLOBAL_STORE_DWORDX4, + SCRATCH_LOAD_DWORD, SCRATCH_LOAD_DWORDX2, SCRATCH_LOAD_DWORDX3, SCRATCH_LOAD_DWORDX4, + SCRATCH_STORE_DWORD, SCRATCH_STORE_DWORDX2, SCRATCH_STORE_DWORDX3, SCRATCH_STORE_DWORDX4, + + // -- FLAT atomics -- + FLAT_ATOMIC_ADD, FLAT_ATOMIC_SUB, + FLAT_ATOMIC_AND, FLAT_ATOMIC_OR, FLAT_ATOMIC_XOR, + FLAT_ATOMIC_SMIN, FLAT_ATOMIC_SMAX, FLAT_ATOMIC_UMIN, FLAT_ATOMIC_UMAX, + FLAT_ATOMIC_SWAP, FLAT_ATOMIC_CMPSWAP, + FLAT_ATOMIC_ADD_F32, + + // -- GLOBAL atomics -- + GLOBAL_ATOMIC_ADD, GLOBAL_ATOMIC_SUB, + GLOBAL_ATOMIC_AND, GLOBAL_ATOMIC_OR, GLOBAL_ATOMIC_XOR, + GLOBAL_ATOMIC_SMIN, GLOBAL_ATOMIC_SMAX, GLOBAL_ATOMIC_UMIN, GLOBAL_ATOMIC_UMAX, + GLOBAL_ATOMIC_SWAP, GLOBAL_ATOMIC_CMPSWAP, + GLOBAL_ATOMIC_ADD_F32, + GLOBAL_ATOMIC_PK_ADD_BF16, GLOBAL_ATOMIC_PK_ADD_F16, + + // -- SMEM atomics -- + // gfx8+ scalar-cache atomics. Lifted to `atomicrmw` IR via handle_smem.cpp; + // the SCOPE/GLC bits fold into AtomicOrdering (monotonic) and whether the + // return-value slot is written back. + // + // S_ATOMIC_DEC has wrap-at-zero semantics that do NOT match a plain + // `atomicrmw sub` — the hardware computes + // new = (old == 0 || old > src) ? src : old - 1 + // which is exactly LLVM's `AtomicRMWInst::UDecWrap` binop (landed in + // LLVM 19). The canonical split-k "last workgroup runs the epilogue" + // barrier counter is the overwhelming corpus use (every AITER + // `bf16gemm_*_splitk_clean.co` kernel), keyed on whether the returned + // pre-decrement value equals 1. Like S_ATOMIC_SWAP this op is classed + // NonCommutative for the Class-3 wave-size obstruction classifier in + // wave_size_obstruction.cpp (a lane-id-derived decrement sequence's + // outcome is replica-order-dependent under modulo-replication). + S_ATOMIC_SWAP, + S_ATOMIC_DEC, + + // -- DS -- + DS_LOAD_TR16_B128, + DS_READ_B64_TR_B16, + DS_READ_B64_TR_B8, + // gfx1250 spelling of the same 64-bit transposed LDS load that + // gfx950 disassembles as `ds_read_b64_tr_b8`. The hardware + // semantics are identical: each lane reads 64 bits (8 x i8) from + // its LDS base, then the data is transposed across 8-lane groups + // so each lane post-transpose holds 8 i8 values from 8 different + // source lanes at the same intra-group element offset (v2i32 + // packed). The two CanonicalOps are kept distinct because they are two + // distinct LLVM MC opcodes (DS_LOAD_TR8_B64 vs DS_READ_B64_TR_B8) + // with separate isel patterns and separate intrinsics + // (`int_amdgcn_ds_load_tr8_b64` gated isGFX1250Plus, + // `int_amdgcn_ds_read_tr8_b64` gated HasGFX950Insts); both lower + // through the same hand-rolled bpermute-based emulation in + // handle_ds.cpp because gfx942 (the transpiler's target ISA) has + // neither isel pattern and no in-tree pre-isel emulation. + DS_LOAD_TR8_B64, + DS_READ_B32, DS_READ_B64, + // 96-bit (3 x i32) LDS load. 
LLVM MC opcode `DS_READ_B96`; gfx11+ + // (gfx1100/gfx1200/gfx1250) renames the asm spelling to + // `ds_load_b96` (DSInstructions.td:1578 declares + // `defm DS_READ_B96 : DS_Real_gfx11_gfx12_gfx13<0x0fe, + // "ds_load_b96">`). Hardware reads 96 bits from the lane's LDS + // base; the lift is `load <3 x i32>` from addrspace(3). The + // gfx942 backend lowers the 3-dword vector load to either a + // native `ds_read_b96` (gfx9 inherits the `_vi` Real form) or + // splits it into 3x `ds_read_b32` with the appropriate + // increments — both are correct in-place lowerings. + // Inserted between DS_READ_B64 and DS_READ_B128 deliberately so + // the existing range checks (`sop >= DS_READ_B32 && + // sop <= DS_READ_I8` for reads, parallel for writes) continue to + // cover it without a special case. + DS_READ_B96, + DS_READ_B128, + DS_READ2_B32, DS_READ2_B64, + // gfx11+ stride-64 two-address LDS load forms + // (DSInstructions.td:1529,1542 — `ds_load_2addr_stride64_b{32,64}`). + // Semantics parallel DS_READ2_B{32,64}, but the per-access byte + // offset is `rawFieldValue * 256` (B32) or `* 512` (B64) instead of + // `* 4` / `* 8`, extending reach with the same 8-bit offset field + // at the cost of a 64-dword stride granularity. Handled jointly + // with the non-ST64 variants in handle_ds.cpp's dedicated + // READ2/WRITE2 block; placed adjacent in the enum so the existing + // `sop >= DS_READ_B32 && sop <= DS_READ_I8` range check continues + // to classify them as DS reads (the dedicated block intercepts + // before the single-offset generic handler ever sees them). + DS_READ2ST64_B32, DS_READ2ST64_B64, + DS_READ_U16, DS_READ_I16, DS_READ_U8, DS_READ_I8, + DS_WRITE_B32, DS_WRITE_B64, + // Symmetric write-side for `ds_load_b96`: gfx11+ asm spelling is + // `ds_store_b96` (DSInstructions.td:1576); the LLVM MC opcode + // remains `DS_WRITE_B96`. Lift is `store <3 x i32>` to + // addrspace(3). Inserted between DS_WRITE_B64 and DS_WRITE_B128 + // for the same range-check reason as DS_READ_B96 above. + DS_WRITE_B96, + DS_WRITE_B128, + DS_WRITE2_B32, DS_WRITE2_B64, + // gfx11+ stride-64 two-address LDS store forms (mirror the + // DS_READ2ST64 block above; see the read-side comment for the + // offset-scaling rationale and enum-placement reasoning). + DS_WRITE2ST64_B32, DS_WRITE2ST64_B64, + DS_WRITE_B16, DS_WRITE_B8, + // D16_HI partial-store family (gfx8+ HasD16LoadStore): + // store the upper 16 bits (B16_D16_HI) or bits [23:16] (B8_D16_HI) + // of the source VGPR to LDS. The "D16_HI" suffix names the + // *source* register half being stored, not a dest-merge — these + // are write-only and there is no tied dest_in operand. The + // companion D16 reads (DS_READ_U/I8_D16{,_HI}, DS_READ_U16_D16{,_HI}) + // are not yet on the worklist; if they surface, add them here as + // a separate set with their own tied-source dest_in handling. + DS_WRITE_B16_D16_HI, DS_WRITE_B8_D16_HI, + DS_BPERMUTE_B32, + // Class 2 DsSwizzle (hotswap/docs/wave-size-translation.md §6). + // Wave-width-specific cross-lane shuffle. The handler refuses with + // `unsupportedShape` until the P6 rewrite (lift through + // llvm.amdgcn.ds.swizzle — see wave-size-translation.md §5.3 row + // P6) lands; the wave-size classifier + // (wave_size_obstruction.cpp) flags it before the handler is even + // dispatched in the cross-wave case. 
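+  // For reference, the planned P6 lift shape is a single intrinsic call
+  // (the second i32 operand is the instruction's constant swizzle-pattern
+  // field; value names illustrative):
+  //   %r = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 %pattern_imm)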
+ DS_SWIZZLE_B32, + + // -- MUBUF -- + BUFFER_LOAD_DWORD, BUFFER_LOAD_DWORDX2, BUFFER_LOAD_DWORDX3, BUFFER_LOAD_DWORDX4, + BUFFER_LOAD_UBYTE, BUFFER_LOAD_SBYTE, BUFFER_LOAD_USHORT, BUFFER_LOAD_SSHORT, + BUFFER_LOAD_SHORT_D16, BUFFER_LOAD_SHORT_D16_HI, + // D16 byte variants — gfx9+ partial-write loads. The 8-bit datum is + // sign- or zero-extended to i16 and merged into the lo (`_D16`) or + // hi (`_D16_HI`) half of the destination VGPR; the other 16 bits + // are preserved (BUFInstructions.td:1155-1169, predicate + // `D16PreservesUnusedBits`). Mnemonic on gfx11+/gfx1250 is + // `buffer_load_d16_u8` / `_d16_i8` / `_d16_hi_u8` / `_d16_hi_i8`. + BUFFER_LOAD_UBYTE_D16, BUFFER_LOAD_UBYTE_D16_HI, + BUFFER_LOAD_SBYTE_D16, BUFFER_LOAD_SBYTE_D16_HI, + BUFFER_LOAD_DWORD_LDS, BUFFER_LOAD_DWORDX2_LDS, + BUFFER_LOAD_DWORDX4_LDS, BUFFER_STORE_DWORDX4_LDS, + BUFFER_STORE_DWORD, BUFFER_STORE_DWORDX2, BUFFER_STORE_DWORDX3, BUFFER_STORE_DWORDX4, + BUFFER_STORE_BYTE, BUFFER_STORE_SHORT, + + // -- MUBUF atomics -- + // Order is significant: handle_mubuf.cpp dispatches via the range + // check `[BUFFER_ATOMIC_ADD, BUFFER_ATOMIC_PK_ADD_F16]`. New + // BUFFER_ATOMIC_* CanonicalOps must stay inside this range so the range + // check picks them up; entries the handler does not explicitly + // case-match are caught by the switch's default branch with a + // `RaiseFailure::unsupportedShape("unsupported buffer atomic")`. + BUFFER_ATOMIC_ADD, BUFFER_ATOMIC_SUB, + BUFFER_ATOMIC_AND, BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, + // Class 3 non-commutative atomics (NonCommutativeAtomic), see + // hotswap/docs/wave-size-translation.md §6. + // The wave-size classifier flags these in the cross-wave case; + // handle_mubuf.cpp models them with raw-buffer atomics so same-wave + // and same-target lifts preserve descriptor-relative addressing. + BUFFER_ATOMIC_SWAP, BUFFER_ATOMIC_CMPSWAP, + BUFFER_ATOMIC_ADD_F32, + BUFFER_ATOMIC_PK_ADD_BF16, BUFFER_ATOMIC_PK_ADD_F16, + + // -- MFMA -- + // gfx950 scaled F8F6F4 variants share a per-shape intrinsic but take 9 + // src-format sub-variants each; those are collapsed onto these four CanonicalOps + // in kCanonTable. + V_MFMA_F32_16x16x128_F8F6F4, V_MFMA_SCALE_F32_16x16x128_F8F6F4, + V_MFMA_F32_32x32x64_F8F6F4, V_MFMA_SCALE_F32_32x32x64_F8F6F4, + // F32 <- F16/F32 (gfx908+). Each covers its pseudo's _e64/_vgprcd_/_mac_ + // variants via pseudoAlias stripping in OpcodeMap::canonicalize. + V_MFMA_F32_16x16x16_F16, V_MFMA_F32_32x32x8_F16, + V_MFMA_F32_16x16x4_F32, V_MFMA_F32_32x32x1_F32, V_MFMA_F32_32x32x2_F32, + V_MFMA_F32_4x4x1_F32, V_MFMA_F32_16x16x1_F32, + V_MFMA_F32_32x32x4_F16, V_MFMA_F32_16x16x4_F16, V_MFMA_F32_4x4x4_F16, + // I32 <- I8. + V_MFMA_I32_16x16x32_I8, V_MFMA_I32_32x32x16_I8, + V_MFMA_I32_32x32x4_I8, V_MFMA_I32_16x16x4_I8, V_MFMA_I32_4x4x4_I8, + // F32 <- XF32 (gfx940+). + V_MFMA_F32_16x16x8_XF32, V_MFMA_F32_32x32x4_XF32, + // F32 <- BF16 (gfx908 2-byte variants). + V_MFMA_F32_32x32x2_BF16, V_MFMA_F32_16x16x2_BF16, V_MFMA_F32_4x4x2_BF16, + // F32 <- BF16 "1K" shapes (gfx90a+). + V_MFMA_F32_16x16x16_BF16_1K, V_MFMA_F32_32x32x8_BF16_1K, + // F32 <- BF16/F16 wide shapes (gfx950). + V_MFMA_F32_16x16x32_BF16, V_MFMA_F32_32x32x16_BF16, + V_MFMA_F32_16x16x32_F16, + // F32 <- FP8/BF8 (gfx940+). 
+ V_MFMA_F32_16x16x32_FP8_FP8, V_MFMA_F32_16x16x32_FP8_BF8, + V_MFMA_F32_16x16x32_BF8_FP8, V_MFMA_F32_16x16x32_BF8_BF8, + V_MFMA_F32_32x32x16_FP8_FP8, V_MFMA_F32_32x32x16_FP8_BF8, + V_MFMA_F32_32x32x16_BF8_FP8, V_MFMA_F32_32x32x16_BF8_BF8, + + // -- WMMA (gfx1250) -- + // 16x16x32 WMMA with f32 accumulator and 16-bit element types. Both + // share the same per-lane fragment shape (A,B: <16 x t>, C/D: + // <8 x f32>) and same K-decomposition path through the gfx942 MFMA + // lowering — `emitWMMAtoMFMA` is parameterised on input element + // type and routes to the matching CDNA3 MFMA intrinsic + // (mfma_f32_16x16x16f16 vs mfma_f32_16x16x16bf16_1k). + V_WMMA_F32_16x16x32_F16, + V_WMMA_F32_16x16x32_BF16, + // 16x16x4 WMMA with f32 accumulator and 32-bit f32 element types + // for both A and B (gfx1250 RDNA4 VOP3P opcode 0x05D). Per-Wave32- + // lane fragment shape is A,B: <2 x f32> (only 4 K-elements split + // across 2 dwords per lane), C/D: <8 x f32>; this is structurally + // distinct from the 16-bit (K=32, A/B = <16 x t>) and 8-bit + // (K=64, A/B = <8 x i32>) families above and so does NOT share + // the `emitWMMAtoMFMA` decomposition (which is parameterised on + // 16-/8-bit element packing, not f32). The native intrinsic + // `amdgcn_wmma_f32_16x16x4_f32` is declared inside + // `AMDGPUWMMAIntrinsicsGFX1250` (gated by `isGFX125xOnly` in + // IntrinsicsAMDGPU.td:4113-4114) and is NOT part of the gfx12 + // RDNA4-base WMMA family (`AMDGPUWMMAIntrinsicsGFX12`, + // FeatureWMMA{128,256}bInsts), so the same-target lift gates on + // `ISAProfile::hasTensorOps` (FeatureGFX1250Insts) — matching + // the LLVM intrinsic's actual subtarget gating — rather than + // `hasWMMA12`. Call shape is `AMDGPUWmmaIntrinsicModsAllReuse`, + // 8 args: `(A_mod, A, B_mod, B, C_mod, C, reuse_a, reuse_b)`. + // Cross-target lift to gfx942 would need a new K=4 MFMA + // decomposition path (gfx942 has `mfma_f32_16x16x4f32`) that no + // kernel in the current corpus exercises, so we refuse loudly + // via `RaiseFailure::unsupportedShape` to surface the gap + // immediately rather than silently degrade. + V_WMMA_F32_16x16x4_F32, + // 16x16x64 WMMA with f32 accumulator and 8-bit element types + // (fp8/bf8). The four AB combinations are distinct opcodes (and + // distinct CDNA3 MFMA intrinsics on gfx942) but share the same + // per-lane fragment shape (A,B: <8 x i32> = 32 fp8/bf8 bytes per + // Wave32 lane, C/D: <8 x f32>) and the same gfx942 MFMA decomposition + // path through `emitWMMAtoMFMA`. The K=64 dimension splits into + // 2 chained K=32 MFMAs per Wave32 group, mirroring the K=32→2×K=16 + // split used for the 16-bit variants. The lane-redistribution math + // is byte-identical between the two K-families (32 bytes per lane + // either way), so the only divergence inside `emitWMMAtoMFMA` is the + // per-MFMA pack type (i64 vs <4 x half|i16>) and the dispatched + // intrinsic ID. See `WMMAInputType` in `wmma_lowering.h` for the + // full enumeration. + V_WMMA_F32_16x16x64_FP8_FP8, + V_WMMA_F32_16x16x64_FP8_BF8, + V_WMMA_F32_16x16x64_BF8_FP8, + V_WMMA_F32_16x16x64_BF8_BF8, + // 16x16x64 WMMA with i32 accumulator and unsigned/signed 8-bit + // integer inputs (the gfx1250 IU8 variant; the LLVM intrinsic + // uses `iu8` to denote that the per-input sign extension is + // selected at call site through the `neg_lo` modifier rather + // than the opcode itself). 
Per-Wave32-lane fragment shape is + // identical to the FP8 sibling (A,B: <8 x i32> = 32 packed i8 + // bytes per lane, C/D: <8 x i32> for integer accumulator). On + // gfx942 we lower through the same `emitWMMAtoMFMA` helper, + // dispatching the per-MFMA call to `mfma_i32_16x16x32_i8` + // (i64 packed A/B, <4 x i32> accumulator). The handler must + // also use a different native-WMMA12 intrinsic shape on gfx12 + // hardware: `AMDGPUWmmaIntrinsicModsABClamp` (8 args including + // a trailing clamp flag), distinct from the 16-bit AllReuse + // and the 8-bit FP8 ModsC shapes. + V_WMMA_I32_16x16x64_IU8, + + // 16x16x128 WMMA with f32 accumulator and per-matrix scale exponents, + // f8f6f4 mantissa-format family (gfx1250 RDNA4 VOP3P opcode 0x033 in + // VOP3PX2 form, pseudo `V_WMMA_SCALE_F32_16X16X128_F8F6F4_*_w32_*`). + // Each kernel encodes one of 9 opcode-suffix mantissa-pair variants + // (`{f4,f6,f8} × {f4,f6,f8}`), but the in-family element format + // (BF8 vs FP8 within f8; BF6 vs FP6 within f6) is selected at runtime + // by the `matrix_a_fmt` / `matrix_b_fmt` named-immediate operands + // (`enum MatrixFMT { FP8=0, BF8=1, FP6=2, BF6=3, FP4=4 }`, + // SIDefines.h:1052-1058). Per-Wave32-lane fragment shape is therefore + // format-dependent: A is `<16 x i32>` for f8 (32 packed bytes/lane), + // `<12 x i32>` for f6 (24 packed bytes/lane), and `<8 x i32>` for f4 + // (16 packed bytes/lane); B is independently `<16/12/8 x i32>` per + // its own format. C/D is `<8 x f32>`. We collapse all 18 MC pseudos + // (9 mantissa pairs × `_twoaddr` / `_threeaddr`) onto this single + // CanonicalOp and discriminate at the handler with `getNamedOperandIdx`, + // mirroring the F8F6F4 MFMA collapse rule in `kCanonTable`. + // + // The native intrinsic `int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4` + // (IntrinsicsAMDGPU.td:4138, class `AMDGPUWmmaScaleIntrinsicModsC + // `) takes 14 arguments: + // (i32 matrix_a_fmt, A, i32 matrix_b_fmt, B, + // i16 C_modifiers, <8 x f32> C, + // i32 matrix_a_scale, i32 matrix_a_scale_fmt, i32 scale_src0, + // i32 matrix_b_scale, i32 matrix_b_scale_fmt, i32 scale_src1, + // i1 matrix_a_reuse, i1 matrix_b_reuse) + // and is gated by `isGFX125xOnly` inside `AMDGPUWMMAIntrinsicsGFX1250` + // (IntrinsicsAMDGPU.td:4113). Handler decodes operands by name via + // `AMDGPU::getNamedOperandIdx` (`matrix_a_fmt`, `matrix_b_fmt`, + // `matrix_a_scale`, `matrix_b_scale`, `matrix_a_scale_fmt`, + // `matrix_b_scale_fmt`, `scale_src0`, `scale_src1`, `matrix_a_reuse`, + // `matrix_b_reuse`, `src2_modifiers`) so any future TableGen operand + // reshuffle flows in for free. + // + // === Same-target gfx1250 → gfx1250 contract === + // + // Lift directly to `int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4` with + // overloaded {<8 x f32>, <16 x i32>, <16 x i32>} type arguments + // (the overload widths match the f8 family used by the corpus + // kernels; the matrix_a_fmt / matrix_b_fmt immediates carry the + // BF8 vs FP8 distinction). The call is NOT wrapped in + // `emitUnderExec` because the WMMA intrinsic is `IntrConvergent` + // and operates on the wave's matrix fragment, not per-lane + // divergent values — wrapping would emit one matrix multiply per + // active lane. + // + // === Cross-target (gfx942 and earlier) contract === + // + // gfx942 has no scaled-WMMA hardware. 
The closest sibling on gfx942 + // is `mfma_scale_f32_16x16x128_f8f6f4` (already mapped via + // `V_MFMA_SCALE_F32_16x16x128_F8F6F4`), but the WMMA-to-MFMA lane + // redistribution for K=128 + per-matrix-fmt selection + the + // matrix_a/b_scale_fmt × scale_src0/src1 exponent application is + // not modelled in `wmma_lowering.cpp` (only K=32 / K=64 fp16/bf16/ + // fp8/bf8/iu8 paths exist). Per the user-rules (no silent + // fallbacks) and consistent with the gfx1250-only refusal contract + // applied to `V_WMMA_F32_16x16x4_F32` above, we refuse loudly via + // `RaiseFailure::unsupportedShape` to surface both the cross-target + // capability gap and the missing scaled-WMMA decomposition path. + V_WMMA_SCALE_F32_16x16x128_F8F6F4, + + // -- VOPD -- (handled via string parsing of fullText, not opcode) + VOPD_GENERIC, + + // -- VIMAGE TENSOR (gfx1250-only) -- + // Tensor descriptor memory ops driven by the gfx1250 TENSOR cnt unit + // (`MIMGInstructions.td:2049-2113`, `VIMAGE_TENSOR_Pseudo`, + // `let SubtargetPredicate = isGFX125xOnly`). Each opcode encodes + // up to four 128-/256-bit Tensor Descriptors (`D# group 0..3`), + // a `R128A16:$r128` flag, and a `CPol:$cpol` cachepolicy immediate. + // `_d2` is the up-to-2D form (passes NULL for D# group 2/3); `_d4` + // is the up-to-4D form. Both share the same CanonicalOp here because + // their semantic intent is identical and their refusal contract is + // identical too — the handler `handleVIMAGE` discriminates on + // `di.mnemonic` only when shape differentiation matters (e.g., + // a future native-target intrinsic-emit path that fills the + // 0-init D# operands for `_d2`). + // + // gfx942 has no equivalent hardware unit. The handler refuses + // loudly via `RaiseFailure::unsupportedShape` with a precise + // diagnostic explaining the cross-target gap, in line with the + // user-rules (no silent fallbacks). The matching LLVM intrinsics + // are `int_amdgcn_tensor_load_to_lds` / + // `int_amdgcn_tensor_store_from_lds` (IntrinsicsAMDGPU.td:4213). + TENSOR_LOAD_TO_LDS, + TENSOR_STORE_FROM_LDS, + + // -- gfx1250 async global → LDS load -- + // + // FLAT async global-to-LDS load, four widths per the b8 / b32 / b64 / + // b128 family. Each width has both a plain VGPR_64 vaddr form and a + // SADDR (SReg_64 base + VGPR_32 vaddr offset) form, both of which + // collapse to the same CanonicalOp per width; `handleFLAT` discriminates + // shape on `op.nSrcs()` exactly the same way `tensor_load_to_lds` + // discriminates `_d2` vs `_d4`. The pseudo InOperandList is + // documented in `FLATInstructions.td:391-417` + // (`FLAT_Global_Load_LDS_Pseudo<…, IsAsync=1>`): + // + // plain : (vdst:VGPR_32, vaddr:VGPR_64, offset, cpol) + // SADDR : (vdst:VGPR_32, saddr:SReg_64, vaddr:VGPR_32, offset, cpol) + // + // `vdst` here is the per-lane LDS i32 OFFSET (TableGen `vdst` slot + // is in the *input* list because `IsAsync=1` enables `has_vdst`), + // not a written register: each lane uses its own VGPR_32 value as + // the LDS-base address for the burst write. The intrinsics + // `int_amdgcn_global_load_async_to_lds_b{8,32,64,128}` + // (IntrinsicsAMDGPU.td:3939-3946) all share signature + // `AMDGPUAsyncGlobalLoadToLDS` (line 3904) and take the LDS + // pointer as the second operand (`local_ptr_ty`); we materialise + // it via `inttoptr i32 -> ptr addrspace(3)` from the per-lane + // VGPR_32. The width is encoded only in the intrinsic ID — the + // operand bank is identical across all four widths. 
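+  //
+  // Illustrative sketch of that materialisation for the b32 plain form
+  // (value names are placeholders; the call shape below merely restates
+  // the operand description above — the authoritative declaration is the
+  // `AMDGPUAsyncGlobalLoadToLDS` class in IntrinsicsAMDGPU.td):
+  //
+  //   %lds = inttoptr i32 %vdst to ptr addrspace(3)
+  //   call void @llvm.amdgcn.global.load.async.to.lds.b32(
+  //       ptr addrspace(1) %gaddr, ptr addrspace(3) %lds,
+  //       i32 <flat_offset>, i32 <cpol>)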
+  //
+  // Separate CanonicalOps per width (rather than a single
+  // `GLOBAL_LOAD_ASYNC_TO_LDS_BX` discriminated by mnemonic) so the
+  // CanonicalOp ↔ intrinsic mapping is direct and the handler is a small
+  // switch instead of string parsing — the canonical opcode_map
+  // collapses each `_gfx1250` real onto its width-specific pseudo.
+  //
+  // === Same-target gfx1250 → gfx1250 contract ===
+  //
+  // gfx1250 has the asynccnt unit and the native intrinsic; the
+  // handler emits a direct call inside an `emitUnderExec` diamond
+  // (per-lane operation: each lane fires its own LDS write, inactive
+  // lanes do not). `IntrInaccessibleMemOrArgMemOnly` on the
+  // intrinsic prevents downstream passes from CSEing or reordering
+  // the asynchronous fetch across other memory sites — the
+  // user-visible barrier semantics live in companion
+  // `s_wait_asynccnt` instructions, not in this op. The intrinsic's
+  // `offset` immediate corresponds to the FLAT instruction's
+  // `flat_offset` slot; `cpol` is the gfx12+ cachepolicy bitfield
+  // (th, scope) carried as the trailing immediate.
+  //
+  // === Cross-target (gfx942 and earlier) contract ===
+  //
+  // The asynccnt unit and `int_amdgcn_global_load_async_to_lds_b*`
+  // are gfx1250-only (`SubtargetPredicate = isGFX1250Plus` on the
+  // VFLAT reals, `FeatureGFX1250Insts`). gfx942 has no asynchronous
+  // global→LDS DMA channel and no equivalent burst path.
+  //
+  // We emit a **synchronous per-lane emulation**: for each active
+  // lane, `load T, ptr addrspace(1) %gptr` followed by
+  // `store T, ptr addrspace(3) %lptr`, width `T` chosen per the
+  // b8 / b32 / b64 / b128 CanonicalOp. The source ISA pragma
+  // (`instruction_manual.pdf §13.6.{9,10,11,12}`, verbatim):
+  //
+  //   pragma "vector" do
+  //     dsaddr = LDS_BASE.b32 + VGPR[laneId][VDST.u32] + INST_OFFSET.b32;
+  //     memaddr = ADDR;   // CalcGlobalAddr(VADDR, SADDR, IOFFSET)
+  //     LDS[dsaddr].bN = MEM[memaddr].bN   // (N = 8/32/64/128)
+  //   endpragma
+  //
+  // — a per-lane global→LDS copy, width-parametric, identical in
+  // every respect to what the synchronous `load` + `store` pair
+  // produces *per lane*. `INST_OFFSET` applies to BOTH the LDS
+  // address and the global address (confirmed by the explicit
+  // appearance in the `dsaddr` expression and by `CalcGlobalAddr`
+  // folding `IOFFSET` into `memaddr`); the emulation folds it onto
+  // both pointers via `i8`-GEP before the `load`/`store`, matching
+  // the same-target intrinsic's immarg behaviour.
+  //
+  // === Documented semantic trade-off ===
+  //
+  // The async intrinsic carries `IntrInaccessibleMemOrArgMemOnly`
+  // and the hardware schedules the DMA against a dedicated counter
+  // (`ASYNCcnt`; `programming_manual.pdf §4.9.9`). Completion is
+  // signalled via `S_WAIT_ASYNCCNT`. The same aggregate
+  // observable per-lane LDS state is produced by a synchronous
+  // `load`+`store` chain AFTER the corresponding `s_wait_asynccnt
+  // 0`; the ONLY information lost in the emulation is the
+  // **pipelining overlap** between in-flight async DMAs and
+  // unrelated VMEM / LDS operations in the wave's own stream. On
+  // gfx1250 the DMA unit can fire while the wave's ALU path runs;
+  // the synchronous emulation blocks the wave until the global
+  // `load` retires before the `store` publishes the data to LDS.
+  //
+  // This is a **throughput regression, not a correctness
+  // regression** — every lane's final LDS state is bit-identical
+  // to the async version's state observed after
+  // `s_wait_asynccnt 0`.
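+  //
+  // Concretely, the per-active-lane b32 emulation is roughly (illustrative
+  // sketch; value names are placeholders, and the sequence sits under the
+  // same active-lane guard described for the same-target arm):
+  //
+  //   %g  = getelementptr i8, ptr addrspace(1) %gaddr, i64 <inst_offset>
+  //   %l0 = inttoptr i32 %vdst to ptr addrspace(3)
+  //   %l  = getelementptr i8, ptr addrspace(3) %l0, i32 <inst_offset>
+  //   %v  = load i32, ptr addrspace(1) %g
+  //   store i32 %v, ptr addrspace(3) %l
+  //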
Kernels that depend on *observable + // effects under a partially-elapsed asynccnt* (e.g., a + // hand-written pipeliner polling asynccnt state out of the wave's + // instruction stream — not a pattern LLVM IR can express anyway) + // are NOT in the GPT-OSS corpus and remain explicitly out of + // scope. The GPT-OSS MoE expert-GEMM kernels (`matmul_ogs_*`) + // use the async DMA as a compiler-scheduled prefetch-into-LDS + // whose only user-visible contract is "data lands in LDS before + // the subsequent `s_wait_asynccnt 0` + `ds_read_*` chain reads + // it" — which the synchronous emulation preserves exactly. + // + // Companion `S_WAIT_ASYNCCNT` CanonicalOp (declared above) lowers to a + // raiser-level no-op on the cross-target arm: the synchronous + // `load`+`store` pair has already completed by the time the wait + // is reached, so the native wait has nothing to track. Dataflow + // dependencies from the emulated `store` through subsequent LDS + // reads carry the happens-before the source kernel relied on, and + // the gfx942 backend re-inserts the `s_waitcnt lgkmcnt(0)` before + // the reader. On the same-target arm, `S_WAIT_ASYNCCNT` is still + // a no-op at the raiser (like every other wait counter, per + // `handle_sopp.cpp`): the intrinsic's + // `IntrInaccessibleMemOrArgMemOnly` annotation prevents reorder + // across the wait site, and the backend re-emits the native + // `s_wait_asynccnt` from that IR-level ordering. + // + // Rationale for preferring a documented emulation over the + // previous loud refusal: every `matmul_ogs_*` variant in the + // GPT-OSS MoE expert-GEMM surface (4 kernels, runtime-dominant + // in inference) hits this opcode and was blocked end-to-end. + // The `matmul_fp16` / `matmul_fp16_16x16` path — the shared MFMA + // fragment redistribution surface — already works; unblocking the + // MoE GEMM was the single highest-impact change. The trade-off + // is scoped and documented, not hidden, and matches the posture + // the sibling TDM axis takes in `sync-translation.md §10` ("TDM + // emulation lowers to synchronous buffer loads"). + GLOBAL_LOAD_ASYNC_TO_LDS_B8, + GLOBAL_LOAD_ASYNC_TO_LDS_B32, + GLOBAL_LOAD_ASYNC_TO_LDS_B64, + GLOBAL_LOAD_ASYNC_TO_LDS_B128, + + // -- gfx1250 VMEM prefetch (FLAT, hint-class) -- + // + // FLAT advisory prefetch on a per-lane (divergent) VGPR pointer. The + // family is gated by `FeatureVmemPrefInsts` (AMDGPU.td:283) and is + // currently only enabled inside `FeatureISAVersion12_50_Common` + // (AMDGPU.td:2092), i.e. gfx1250 / RDNA4 — no earlier ISA exposes a + // matching VMEM-prefetch encoding. The companion + // `int_amdgcn_global_prefetch` intrinsic (IntrinsicsAMDGPU.td:3211) + // takes a global-address-space pointer + an i32 cachepolicy immarg + // (bits[2:0]=th, bits[4:3]=scope) and is annotated with + // `IntrInaccessibleMemOrArgMemOnly + IntrWillReturn + NoCapture + + // IntrNoCallback + IntrNoFree`, so the SDAG models it as a memory + // intrinsic that may overlap the load lattice but never publishes a + // value — exactly the hint contract the hardware promises. + // + // Operand layout (FLAT_Prefetch_Pseudo, FLATInstructions.td:525-553): + // + // plain (3 srcs): vaddr:VGPR_64, offset, cpol + // SADDR (4 srcs): saddr:SReg_64, vaddr:VGPR_32, offset, cpol + // + // Note `has_vdst = 0`, so there is no destination and no LDS slot — + // distinct from the GLOBAL_LOAD_ASYNC_TO_LDS_B* family above. 
Width + // is fixed at 8 bytes (the `_b8` mnemonic refers to the request + // granularity, not data); a single CanonicalOp suffices for both + // addressing-mode variants because `handleFLAT` discriminates on + // `op.nSrcs()` exactly the same way the async-to-LDS family does. + // + // === Same-target gfx1250 → gfx1250 contract === + // + // Lift directly to `int_amdgcn_global_prefetch(globalPtr, cpol)`. + // The FLAT `flat_offset` is folded onto the address by GEP'ing + // `globalPtr += offset` before the call (the intrinsic itself + // takes no offset operand). The call sits OUTSIDE + // `emitUnderExec` because the intrinsic carries the EXEC mask + // implicitly through `IntrInaccessibleMemOrArgMemOnly` — wrapping + // it in a per-lane diamond would emit one prefetch per active + // lane, gratuitously inflating the IR for what the hardware + // executes as a single broadcast hint. + // + // === Cross-target (gfx942 and earlier) contract === + // + // `int_amdgcn_global_prefetch` is gated by `HasVmemPrefInsts` + // (FLATInstructions.td:1367) and has no isel coverage on + // gfx942 — emitting the call would compile-fail downstream. + // The closest sibling, `int_amdgcn_s_prefetch_data` + // (IntrinsicsAMDGPU.td:3188), requires a UNIFORM (SGPR) pointer, + // which we cannot prove for the divergent VGPR address used here + // without divergence analysis. Per the user-rules (no silent + // fallbacks) and consistent with the gfx1250-only refusal contract + // applied to GLOBAL_LOAD_ASYNC_TO_LDS_B* and TENSOR_LOAD_TO_LDS + // above, we refuse loudly via `RaiseFailure::unsupportedShape`. + // Triton's TDM-pipelined GEMM kernels schedule prefetches at + // specific points in their software pipeline and a silent drop on + // gfx942 would mask both the cross-target capability gap and any + // performance-tuning regression downstream. + GLOBAL_PREFETCH_B8, + + // -- AGPR -- + V_ACCVGPR_READ_B32, V_ACCVGPR_WRITE_B32, + + CanonicalOp_COUNT +}; + +// Stable human-readable identifier for a CanonicalOp (the enum's spelling, +// e.g. `"V_CMPX"` for `CanonicalOp::V_CMPX`). Used in diagnostics — prefer +// this over `(int)sop` so errors name the instruction class rather +// than a raw enum position that drifts with enum edits. +const char *canonicalOpName(CanonicalOp Op); + +} // namespace COMGR::hotswap + +#endif