From bdf0f574836e459c7e0dbfd54ee120307ad08fe4 Mon Sep 17 00:00:00 2001 From: QiLin Gai Date: Sat, 9 May 2026 18:54:47 +0800 Subject: [PATCH] [BACKEND] Init mthreads backend in version 3.6.x --- CMakeLists.txt | 16 +- python/setup_tools/utils/mthreads.py | 123 + setup.py | 2 - third_party/mthreads/CMakeLists.txt | 23 + third_party/mthreads/backend/__init__.py | 0 third_party/mthreads/backend/compiler.py | 933 +++ third_party/mthreads/backend/driver.c | 349 + third_party/mthreads/backend/driver.py | 895 +++ .../mthreads/backend/lib/libdevice.31.bc | Bin 0 -> 372556 bytes third_party/mthreads/bin/CMakeLists.txt | 57 + .../mthreads/bin/RegisterTritonDialects.h | 129 + third_party/mthreads/bin/triton-llvm-opt.cpp | 121 + third_party/mthreads/bin/triton-opt.cpp | 11 + third_party/mthreads/bin/triton-reduce.cpp | 11 + .../mthreads/bin/triton-tensor-layout.cpp | 237 + third_party/mthreads/include/CMakeLists.txt | 1 + .../mthreads/include/triton/Analysis/Alias.h | 96 + .../include/triton/Analysis/Allocation.h | 265 + .../include/triton/Analysis/AxisInfo.h | 271 + .../include/triton/Analysis/BufferRegion.h | 166 + .../mthreads/include/triton/Analysis/Membar.h | 263 + .../include/triton/Analysis/Utility.h | 399 ++ .../mthreads/include/triton/CMakeLists.txt | 3 + .../include/triton/Conversion/CMakeLists.txt | 2 + .../include/triton/Conversion/MLIRTypes.h | 46 + .../AllocateSharedMemoryUtility.h | 17 + .../Conversion/TritonGPUToLLVM/AsmFormat.h | 27 + .../Conversion/TritonGPUToLLVM/CMakeLists.txt | 3 + .../TritonGPUToLLVM/ElementwiseOpToLLVMBase.h | 206 + .../TritonGPUToLLVM/FMADotUtility.h | 35 + .../Conversion/TritonGPUToLLVM/Passes.h | 25 + .../Conversion/TritonGPUToLLVM/Passes.td | 45 + .../PatternTritonGPUOpToLLVM.h | 117 + .../TritonGPUToLLVM/TargetInfoBase.h | 115 + .../TritonGPUToLLVM/TypeConverter.h | 39 + .../Conversion/TritonGPUToLLVM/Utility.h | 652 ++ .../TritonGPUToLLVM/WarpSpecializeUtility.h | 78 + .../TritonToTritonGPU/CMakeLists.txt | 3 + .../Conversion/TritonToTritonGPU/Passes.h | 15 + .../Conversion/TritonToTritonGPU/Passes.td | 56 + .../include/triton/Dialect/CMakeLists.txt | 7 + .../triton/Dialect/Gluon/CMakeCache.txt | 2 + .../triton/Dialect/Gluon/CMakeLists.txt | 2 + .../triton/Dialect/Gluon/IR/CMakeLists.txt | 17 + .../include/triton/Dialect/Gluon/IR/Dialect.h | 11 + .../triton/Dialect/Gluon/IR/GluonAttrDefs.td | 23 + .../triton/Dialect/Gluon/IR/GluonDialect.td | 22 + .../triton/Dialect/Gluon/IR/GluonOps.td | 32 + .../Dialect/Gluon/Transforms/CMakeLists.txt | 3 + .../Gluon/Transforms/InferLayoutUtils.h | 20 + .../triton/Dialect/Gluon/Transforms/Passes.h | 13 + .../triton/Dialect/Gluon/Transforms/Passes.td | 54 + .../triton/Dialect/NVGPU/CMakeLists.txt | 1 + .../triton/Dialect/NVGPU/IR/CMakeLists.txt | 18 + .../include/triton/Dialect/NVGPU/IR/Dialect.h | 47 + .../triton/Dialect/NVGPU/IR/NVGPUAttrDefs.td | 33 + .../triton/Dialect/NVGPU/IR/NVGPUDialect.td | 40 + .../triton/Dialect/NVGPU/IR/NVGPUOps.td | 134 + .../triton/Dialect/NVWS/CMakeLists.txt | 2 + .../triton/Dialect/NVWS/IR/CMakeLists.txt | 24 + .../include/triton/Dialect/NVWS/IR/Dialect.h | 55 + .../triton/Dialect/NVWS/IR/NVWSAttrDefs.td | 70 + .../triton/Dialect/NVWS/IR/NVWSDialect.td | 45 + .../Dialect/NVWS/IR/NVWSOpInterfaces.td | 37 + .../include/triton/Dialect/NVWS/IR/NVWSOps.td | 342 + .../triton/Dialect/NVWS/IR/NVWSTypes.td | 51 + .../Dialect/NVWS/Transforms/CMakeLists.txt | 3 + .../triton/Dialect/NVWS/Transforms/Passes.h | 42 + .../triton/Dialect/NVWS/Transforms/Passes.td | 190 + .../triton/Dialect/Triton/CMakeLists.txt | 2 + .../triton/Dialect/Triton/IR/CMakeLists.txt | 31 + .../triton/Dialect/Triton/IR/Dialect.h | 121 + .../Dialect/Triton/IR/DiscardableAttributes.h | 15 + .../triton/Dialect/Triton/IR/Interfaces.h | 45 + .../triton/Dialect/Triton/IR/OpInterfaces.h | 24 + .../include/triton/Dialect/Triton/IR/Traits.h | 132 + .../Dialect/Triton/IR/TritonAttrDefs.td | 154 + .../triton/Dialect/Triton/IR/TritonDialect.td | 60 + .../Dialect/Triton/IR/TritonInterfaces.td | 31 + .../Dialect/Triton/IR/TritonOpInterfaces.td | 118 + .../triton/Dialect/Triton/IR/TritonOps.td | 1413 ++++ .../Dialect/Triton/IR/TritonTypeInterfaces.td | 53 + .../triton/Dialect/Triton/IR/TritonTypes.td | 134 + .../include/triton/Dialect/Triton/IR/Types.h | 44 + .../triton/Dialect/Triton/IR/Utility.h | 214 + .../Triton/Transforms/ArithTypeConversion.h | 18 + .../Dialect/Triton/Transforms/CMakeLists.txt | 3 + .../Transforms/FunctionTypeConversion.h | 19 + .../Dialect/Triton/Transforms/LoopPeeling.h | 18 + .../triton/Dialect/Triton/Transforms/Passes.h | 19 + .../Dialect/Triton/Transforms/Passes.td | 93 + .../triton/Dialect/TritonGPU/CMakeLists.txt | 2 + .../triton/Dialect/TritonGPU/IR/Attributes.h | 13 + .../Dialect/TritonGPU/IR/CGAEncodingAttr.h | 11 + .../Dialect/TritonGPU/IR/CGAEncodingAttr.td | 43 + .../Dialect/TritonGPU/IR/CMakeLists.txt | 40 + .../triton/Dialect/TritonGPU/IR/Dialect.h | 321 + .../TritonGPU/IR/LinearLayoutConversions.h | 155 + .../triton/Dialect/TritonGPU/IR/Traits.h | 34 + .../Dialect/TritonGPU/IR/TritonGPUAttrBase.td | 55 + .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 1567 ++++ .../TritonGPU/IR/TritonGPUAttrImpls.td | 13 + .../Dialect/TritonGPU/IR/TritonGPUDialect.td | 41 + .../Dialect/TritonGPU/IR/TritonGPUEnums.td | 22 + .../TritonGPU/IR/TritonGPUInterfaces.h | 13 + .../TritonGPU/IR/TritonGPUOpInterfaces.td | 29 + .../Dialect/TritonGPU/IR/TritonGPUOps.td | 741 ++ .../TritonGPU/IR/TritonGPUTypeInterfaces.td | 23 + .../Dialect/TritonGPU/IR/TritonGPUTypes.td | 86 + .../triton/Dialect/TritonGPU/IR/Types.h | 14 + .../TritonGPU/Transforms/CMakeLists.txt | 3 + .../TritonGPU/Transforms/CoalesceUtils.h | 17 + .../Transforms/DecomposeScaledBlocked.h | 47 + .../Transforms/LayoutPropagationUtility.h | 21 + .../Transforms/MMAv5PipelineUtility.h | 83 + .../Dialect/TritonGPU/Transforms/Partition.h | 127 + .../TritonGPU/Transforms/PartitionBuilder.h | 49 + .../Transforms/PartitionSchedulingUtility.h | 460 ++ .../Dialect/TritonGPU/Transforms/Passes.h | 23 + .../Dialect/TritonGPU/Transforms/Passes.td | 343 + .../TritonGPU/Transforms/PipelineExpander.h | 111 + .../TritonGPU/Transforms/PipeliningUtility.h | 189 + .../Dialect/TritonGPU/Transforms/Schedule.h | 285 + .../Transforms/TritonGPUConversion.h | 63 + .../Dialect/TritonGPU/Transforms/Utility.h | 303 + .../TritonGPU/Transforms/WarpSpecialization.h | 24 + .../Dialect/TritonInstrument/CMakeLists.txt | 2 + .../TritonInstrument/IR/CMakeLists.txt | 15 + .../Dialect/TritonInstrument/IR/Dialect.h | 14 + .../TritonInstrument/IR/FunctionBuilder.h | 227 + .../TritonInstrument/IR/TritonInstrument.md | 86 + .../IR/TritonInstrumentAttrDefs.td | 15 + .../IR/TritonInstrumentDialect.td | 11 + .../IR/TritonInstrumentOps.td | 96 + .../Dialect/TritonInstrument/IR/Utility.h | 101 + .../Transforms/CMakeLists.txt | 3 + .../TritonInstrument/Transforms/Passes.h | 22 + .../TritonInstrument/Transforms/Passes.td | 16 + .../Dialect/TritonNvidiaGPU/CMakeLists.txt | 2 + .../Dialect/TritonNvidiaGPU/IR/CMakeLists.txt | 27 + .../Dialect/TritonNvidiaGPU/IR/Dialect.h | 147 + .../TritonNvidiaGPU/IR/TensorMemoryUtils.h | 37 + .../IR/TritonNvidiaGPUAttrDefs.td | 92 + .../IR/TritonNvidiaGPUDialect.td | 49 + .../IR/TritonNvidiaGPUOpInterfaces.td | 74 + .../TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td | 933 +++ .../IR/TritonNvidiaGPUTypes.td | 90 + .../TritonNvidiaGPU/Transforms/CMakeLists.txt | 3 + .../TritonNvidiaGPU/Transforms/Passes.h | 46 + .../TritonNvidiaGPU/Transforms/Passes.td | 187 + .../TritonNvidiaGPU/Transforms/TMAUtilities.h | 57 + .../include/triton/Target/CMakeLists.txt | 1 + .../triton/Target/LLVMIR/CMakeLists.txt | 3 + .../include/triton/Target/LLVMIR/Passes.h | 18 + .../include/triton/Target/LLVMIR/Passes.td | 21 + .../include/triton/Tools/GenericSwizzling.h | 56 + .../include/triton/Tools/LayoutUtils.h | 190 + .../include/triton/Tools/LinearLayout.h | 904 +++ .../include/triton/Tools/PluginUtils.h | 100 + .../mthreads/include/triton/Tools/StrUtil.h | 54 + .../include/triton/Tools/Sys/GetEnv.hpp | 124 + .../mthreads/language/musa/__init__.py | 3 + .../mthreads/language/musa/libdevice.py | 1742 +++++ third_party/mthreads/language/musa/utils.py | 86 + third_party/mthreads/lib/Analysis/Alias.cpp | 72 + .../mthreads/lib/Analysis/Allocation.cpp | 662 ++ .../mthreads/lib/Analysis/AxisInfo.cpp | 1422 ++++ .../mthreads/lib/Analysis/BufferRegion.cpp | 360 + .../mthreads/lib/Analysis/CMakeLists.txt | 23 + third_party/mthreads/lib/Analysis/Membar.cpp | 394 + third_party/mthreads/lib/Analysis/Utility.cpp | 1168 +++ third_party/mthreads/lib/CMakeLists.txt | 5 + .../mthreads/lib/Conversion/CMakeLists.txt | 3 + .../TritonGPUToLLVM/AllocateSharedMemory.cpp | 27 + .../AllocateSharedMemoryUtility.cpp | 34 + .../TritonGPUToLLVM/AllocateWarpGroups.cpp | 217 + .../TritonGPUToLLVM/AssertOpToLLVM.cpp | 106 + .../Conversion/TritonGPUToLLVM/CMakeLists.txt | 41 + .../TritonGPUToLLVM/ControlFlowOpToLLVM.cpp | 165 + .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 603 ++ .../TritonGPUToLLVM/DotOpToLLVM/FMA.cpp | 74 + .../DotOpToLLVM/FMADotUtility.cpp | 170 + .../TritonGPUToLLVM/ElementwiseOpToLLVM.cpp | 752 ++ .../TritonGPUToLLVM/FuncOpToLLVM.cpp | 149 + .../TritonGPUToLLVM/GatherOpToLLVM.cpp | 349 + .../GlobalScratchMemoryAllocation.cpp | 105 + .../TritonGPUToLLVM/HistogramOpToLLVM.cpp | 225 + .../TritonGPUToLLVM/MakeRangeOpToLLVM.cpp | 54 + .../TritonGPUToLLVM/MemoryOpToLLVM.cpp | 463 ++ .../TritonGPUToLLVM/PrintOpToLLVM.cpp | 243 + .../TritonGPUToLLVM/ReduceOpToLLVM.cpp | 391 + .../TritonGPUToLLVM/ReduceScanCommon.h | 163 + .../TritonGPUToLLVM/SPMDOpToLLVM.cpp | 37 + .../TritonGPUToLLVM/ScanOpToLLVM.cpp | 585 ++ .../TritonGPUToLLVM/TypeConverter.cpp | 77 + .../Conversion/TritonGPUToLLVM/Utility.cpp | 1732 +++++ .../TritonGPUToLLVM/ViewOpToLLVM.cpp | 645 ++ .../TritonGPUToLLVM/WarpSpecializeUtility.cpp | 376 + .../TritonInstrumentToLLVM/CMakeLists.txt | 12 + .../InstrumentationToLLVM.cpp | 371 + .../TritonToTritonGPU/CMakeLists.txt | 16 + .../TritonToTritonGPU/RelayoutTritonGPU.cpp | 132 + .../TritonToTritonGPU/TritonGPUConversion.cpp | 186 + .../TritonToTritonGPUPass.cpp | 839 +++ .../mthreads/lib/Dialect/CMakeLists.txt | 7 + .../mthreads/lib/Dialect/Gluon/CMakeLists.txt | 2 + .../lib/Dialect/Gluon/IR/CMakeLists.txt | 10 + .../mthreads/lib/Dialect/Gluon/IR/Dialect.cpp | 138 + .../Dialect/Gluon/Transforms/CMakeLists.txt | 17 + .../Dialect/Gluon/Transforms/Canonicalize.cpp | 64 + .../Transforms/InferCoalescedEncodings.cpp | 112 + .../Gluon/Transforms/InferLayoutUtils.cpp | 251 + .../lib/Dialect/Gluon/Transforms/Inline.cpp | 29 + .../Gluon/Transforms/ResolveAutoEncodings.cpp | 71 + .../Gluon/Transforms/SimplifyControlFlow.cpp | 49 + .../mthreads/lib/Dialect/NVGPU/CMakeLists.txt | 1 + .../lib/Dialect/NVGPU/IR/CMakeLists.txt | 10 + .../mthreads/lib/Dialect/NVGPU/IR/Dialect.cpp | 49 + .../mthreads/lib/Dialect/NVWS/CMakeLists.txt | 2 + .../lib/Dialect/NVWS/IR/CMakeLists.txt | 12 + .../mthreads/lib/Dialect/NVWS/IR/Dialect.cpp | 57 + .../mthreads/lib/Dialect/NVWS/IR/Ops.cpp | 185 + .../NVWS/Transforms/AssignStagePhase.cpp | 563 ++ .../Dialect/NVWS/Transforms/CMakeLists.txt | 19 + .../NVWS/Transforms/HoistTmemStore.cpp | 362 + .../Dialect/NVWS/Transforms/InsertAref.cpp | 641 ++ .../NVWS/Transforms/InsertTmemAref.cpp | 930 +++ .../lib/Dialect/NVWS/Transforms/LowerAref.cpp | 961 +++ .../NVWS/Transforms/LowerWarpGroup.cpp | 256 + .../lib/Dialect/NVWS/Transforms/Utilities.cpp | 65 + .../lib/Dialect/NVWS/Transforms/Utilities.h | 43 + .../lib/Dialect/Triton/CMakeLists.txt | 2 + .../lib/Dialect/Triton/IR/CMakeLists.txt | 23 + .../lib/Dialect/Triton/IR/Canonicalize.td | 17 + .../lib/Dialect/Triton/IR/Dialect.cpp | 77 + .../Triton/IR/DiscardableAttributes.cpp | 17 + .../lib/Dialect/Triton/IR/OpInterfaces.cpp | 77 + .../mthreads/lib/Dialect/Triton/IR/Ops.cpp | 1490 ++++ .../mthreads/lib/Dialect/Triton/IR/Traits.cpp | 265 + .../mthreads/lib/Dialect/Triton/IR/Types.cpp | 140 + .../lib/Dialect/Triton/IR/Utility.cpp | 204 + .../Triton/Transforms/ArithTypeConversion.cpp | 51 + .../Dialect/Triton/Transforms/CMakeLists.txt | 27 + .../lib/Dialect/Triton/Transforms/Combine.cpp | 298 + .../lib/Dialect/Triton/Transforms/Combine.td | 23 + .../Transforms/FunctionTypeConversion.cpp | 163 + .../Triton/Transforms/LoopAwareCSE.cpp | 178 + .../Transforms/LoopInvariantCodeMotion.cpp | 82 + .../Dialect/Triton/Transforms/LoopPeeling.cpp | 67 + .../Dialect/Triton/Transforms/LoopUnroll.cpp | 62 + .../Triton/Transforms/ReorderBroadcast.cpp | 230 + .../RewriteTensorDescriptorToPointer.cpp | 616 ++ .../Transforms/RewriteTensorPointer.cpp | 566 ++ .../lib/Dialect/TritonGPU/CMakeLists.txt | 2 + .../lib/Dialect/TritonGPU/IR/CMakeLists.txt | 18 + .../lib/Dialect/TritonGPU/IR/Dialect.cpp | 4561 ++++++++++++ .../TritonGPU/IR/LinearLayoutConversions.cpp | 1975 +++++ .../mthreads/lib/Dialect/TritonGPU/IR/Ops.cpp | 1412 ++++ .../lib/Dialect/TritonGPU/IR/Types.cpp | 226 + .../TritonGPU/Transforms/AccelerateMatmul.cpp | 1027 +++ .../TritonGPU/Transforms/CMakeLists.txt | 56 + .../Dialect/TritonGPU/Transforms/Coalesce.cpp | 125 + .../Transforms/CoalesceAsyncCopy.cpp | 214 + .../TritonGPU/Transforms/CoalesceUtils.cpp | 96 + .../Transforms/CombineTensorSelectAndIf.cpp | 176 + .../Transforms/DecomposeScaledBlocked.cpp | 261 + .../Dialect/TritonGPU/Transforms/F32DotTC.cpp | 241 + .../TritonGPU/Transforms/FuseNestedLoops.cpp | 1222 ++++ .../TritonGPU/Transforms/HoistTMEMAlloc.cpp | 586 ++ .../Transforms/LayoutPropagationUtility.cpp | 49 + .../Transforms/OptimizeAccumulatorInit.cpp | 311 + .../Transforms/OptimizeDotOperands.cpp | 358 + .../Transforms/OptimizeThreadLocality.cpp | 583 ++ .../Transforms/Pipeliner/AssignLatencies.cpp | 395 + .../Transforms/Pipeliner/LowerLoops.cpp | 1184 +++ .../Pipeliner/MMAv5PipelineUtility.cpp | 317 + .../Transforms/Pipeliner/PipelineExpander.cpp | 869 +++ .../Pipeliner/PipeliningUtility.cpp | 910 +++ .../Transforms/Pipeliner/Schedule.cpp | 419 ++ .../Transforms/Pipeliner/ScheduleLoops.cpp | 415 ++ .../Pipeliner/SoftwarePipeliner.cpp | 228 + .../Pipeliner/TMAStoresPipeline.cpp | 125 + .../Pipeliner/TestPipelineLowerLoop.cpp | 32 + .../Transforms/Pipeliner/WGMMAPipeline.cpp | 769 ++ .../Dialect/TritonGPU/Transforms/Prefetch.cpp | 457 ++ .../Transforms/ReduceDataDuplication.cpp | 68 + .../Transforms/RemoveLayoutConversions.cpp | 1701 +++++ .../Transforms/ReorderInstructions.cpp | 178 + .../Dialect/TritonGPU/Transforms/Utility.cpp | 1759 +++++ .../AutomaticWarpSpecialization.cpp | 83 + .../OptimizePartitionWarps.cpp | 318 + .../WarpSpecialization/Partition.cpp | 244 + .../WarpSpecialization/PartitionBuilder.cpp | 36 + .../WarpSpecialization/PartitionLoops.cpp | 545 ++ .../PartitionScheduling.cpp | 1605 +++++ .../PartitionSchedulingUtility.cpp | 370 + .../Dialect/TritonInstrument/CMakeLists.txt | 2 + .../TritonInstrument/IR/CMakeLists.txt | 14 + .../Dialect/TritonInstrument/IR/Dialect.cpp | 17 + .../TritonInstrument/IR/FunctionBuilder.cpp | 2110 ++++++ .../lib/Dialect/TritonInstrument/IR/Ops.cpp | 8 + .../Dialect/TritonInstrument/IR/Utility.cpp | 581 ++ .../Transforms/CMakeLists.txt | 16 + .../Transforms/ConcurrencySanitizer.cpp | 581 ++ .../Dialect/TritonNvidiaGPU/CMakeLists.txt | 2 + .../Dialect/TritonNvidiaGPU/IR/CMakeLists.txt | 15 + .../Dialect/TritonNvidiaGPU/IR/Dialect.cpp | 573 ++ .../lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp | 1160 +++ .../TritonNvidiaGPU/IR/TensorMemoryUtils.cpp | 309 + .../TritonNvidiaGPU/Transforms/CMakeLists.txt | 25 + .../Transforms/CheckMatmulTwoCTAs.cpp | 63 + .../Transforms/FenceInsertion.cpp | 151 + .../Transforms/InterleaveTMem.cpp | 283 + .../Transforms/MMALowering.cpp | 222 + .../Transforms/OptimizeDescriptorEncoding.cpp | 400 ++ .../Transforms/OptimizeTMemLayouts.cpp | 448 ++ .../TritonNvidiaGPU/Transforms/PlanCTA.cpp | 1038 +++ .../Transforms/PromoteLHSToTMem.cpp | 117 + .../Transforms/ProxyFenceInsertion.cpp | 193 + .../Transforms/RemoveTMEMTokens.cpp | 85 + .../Transforms/TMALowering.cpp | 204 + .../Transforms/TMAUtilities.cpp | 290 + .../Transforms/TensorMemoryAllocation.cpp | 436 ++ .../mthreads/lib/Target/CMakeLists.txt | 1 + .../mthreads/lib/Target/LLVMIR/CMakeLists.txt | 31 + .../lib/Target/LLVMIR/LLVMDILocalVariable.cpp | 289 + .../lib/Target/LLVMIR/LLVMDIScope.cpp | 193 + .../lib/Target/LLVMIR/LLVMDIUtils.cpp | 159 + .../mthreads/lib/Target/LLVMIR/LLVMDIUtils.h | 25 + .../Target/LLVMIR/LLVMIRBreakPhiStruct.cpp | 60 + .../mthreads/lib/Target/LLVMIR/LLVMPasses.h | 16 + third_party/mthreads/lib/Tools/CMakeLists.txt | 13 + .../mthreads/lib/Tools/GenericSwizzling.cpp | 713 ++ .../mthreads/lib/Tools/LayoutUtils.cpp | 582 ++ .../mthreads/lib/Tools/LinearLayout.cpp | 1407 ++++ .../mthreads/lib/Tools/PluginUtils.cpp | 162 + third_party/mthreads/musa/CMakeLists.txt | 2 + .../mthreads/musa/include/CMakeLists.txt | 4 + .../musa/include/Dialect/CMakeLists.txt | 2 + .../musa/include/Dialect/MTGPU/CMakeLists.txt | 1 + .../include/Dialect/MTGPU/IR/CMakeLists.txt | 17 + .../musa/include/Dialect/MTGPU/IR/Dialect.h | 28 + .../include/Dialect/MTGPU/IR/MTGPUDialect.td | 21 + .../musa/include/Dialect/MTGPU/IR/MTGPUOps.td | 125 + .../include/Dialect/MTGPU/IR/MTGPUTypes.td | 32 + .../musa/include/Dialect/MUSA/CMakeLists.txt | 1 + .../include/Dialect/MUSA/IR/CMakeLists.txt | 18 + .../musa/include/Dialect/MUSA/IR/Dialect.h | 23 + .../include/Dialect/MUSA/IR/MUSAAttrDefs.td | 78 + .../include/Dialect/MUSA/IR/MUSADialect.td | 19 + .../musa/include/Dialect/MUSA/IR/MUSAOps.td | 243 + .../musa/include/MTGPUToLLVM/CMakeLists.txt | 3 + .../include/MTGPUToLLVM/MTGPUToLLVMPass.h | 40 + .../musa/include/MTGPUToLLVM/Passes.h | 35 + .../musa/include/MTGPUToLLVM/Passes.td | 24 + .../include/TritonMUSACommon/BarrierUtils.h | 114 + .../TritonMUSACommon/MMAContractUtils.h | 376 + .../TritonMUSACommon/MMAEncodingUtils.h | 24 + .../TritonMUSACommon/MMAOperandUtils.h | 529 ++ .../include/TritonMUSACommon/MatmulPolicy.h | 4 + .../include/TritonMUSACommon/MemDescUtils.h | 479 ++ .../include/TritonMUSACommon/SqmmaAttrUtils.h | 67 + .../musa/include/TritonMUSACommon/TMEUtils.h | 779 ++ .../include/TritonMUSAGPUToLLVM/Allocation.h | 25 + .../TritonMUSAGPUToLLVM/CMakeLists.txt | 3 + .../musa/include/TritonMUSAGPUToLLVM/Passes.h | 37 + .../include/TritonMUSAGPUToLLVM/Passes.td | 37 + .../include/TritonMUSAGPUToLLVM/TargetInfo.h | 73 + .../include/TritonMUSAGPUToLLVM/Utility.h | 83 + .../TritonMUSAGPUTransforms/CMakeLists.txt | 3 + .../include/TritonMUSAGPUTransforms/Passes.h | 20 + .../include/TritonMUSAGPUTransforms/Passes.td | 193 + third_party/mthreads/musa/lib/CMakeLists.txt | 4 + .../mthreads/musa/lib/Dialect/CMakeLists.txt | 2 + .../musa/lib/Dialect/MTGPU/CMakeLists.txt | 1 + .../musa/lib/Dialect/MTGPU/IR/CMakeLists.txt | 10 + .../musa/lib/Dialect/MTGPU/IR/Dialect.cpp | 287 + .../musa/lib/Dialect/MUSA/CMakeLists.txt | 1 + .../musa/lib/Dialect/MUSA/IR/CMakeLists.txt | 10 + .../musa/lib/Dialect/MUSA/IR/Dialect.cpp | 577 ++ .../musa/lib/MTGPUToLLVM/CMakeLists.txt | 14 + .../musa/lib/MTGPUToLLVM/MTGPUToLLVMPass.cpp | 163 + .../AllocateSharedMemory.cpp | 269 + .../TritonMUSAGPUToLLVM/BarrierOpToLLVM.cpp | 57 + .../lib/TritonMUSAGPUToLLVM/CMakeLists.txt | 33 + .../ConvertLayoutOpToLLVM.cpp | 551 ++ .../lib/TritonMUSAGPUToLLVM/DotOpToLLVM.cpp | 48 + .../DotOpToLLVM/DotOpToLLVM.h | 35 + .../TritonMUSAGPUToLLVM/DotOpToLLVM/SQMMA.cpp | 889 +++ .../TritonMUSAGPUToLLVM/DotOpToLLVM/WMMA.cpp | 638 ++ .../ElementwiseOpToLLVM.cpp | 891 +++ .../TritonMUSAGPUToLLVM/LoadStoreOpToLLVM.cpp | 1561 ++++ .../lib/TritonMUSAGPUToLLVM/MUSAOpsToLLVM.cpp | 741 ++ .../PatternTritonGPUOpToLLVM.h | 62 + .../lib/TritonMUSAGPUToLLVM/SPMDOpToLLVM.cpp | 49 + .../lib/TritonMUSAGPUToLLVM/TargetInfo.cpp | 232 + .../TensorPtrOpsToLLVM.cpp | 60 + .../TritonMUSAGPUToLLVM/ThreadIdOpToLLVM.cpp | 47 + .../TritonMUSAGPUToLLVM/TritonGPUToLLVM.cpp | 504 ++ .../musa/lib/TritonMUSAGPUToLLVM/Utility.cpp | 437 ++ .../TritonMUSAGPUToLLVM/WarpIdOpToLLVM.cpp | 54 + .../AccelerateMUSAMatmul.cpp | 977 +++ .../TritonMUSAGPUTransforms/CMakeLists.txt | 33 + .../CanonicalizeSqmmaResultConversions.cpp | 104 + .../ConvertSqmmaToMTGPU.cpp | 339 + .../FinalizeBarriers.cpp | 28 + .../IssueBarrierInsertion.cpp | 129 + .../MarkInplaceLoads.cpp | 110 + .../OptimizeAccumulatorInit.cpp | 212 + .../OptimizeDescriptorEncoding.cpp | 587 ++ .../OptimizeDotOperands.cpp | 237 + .../OptimizeSqmmaAccumulatorLayout.cpp | 157 + .../lib/TritonMUSAGPUTransforms/Pipeline.cpp | 1409 ++++ .../SqmmaPipelineUtils.cpp | 508 ++ .../SqmmaPipelineUtils.h | 12 + .../TritonMUSAGPUTransforms/TMELowering.cpp | 203 + .../TMEPipelineUtils.cpp | 278 + .../TMEPipelineUtils.h | 16 + third_party/mthreads/proton/.gitignore | 6 + third_party/mthreads/proton/CMakeLists.txt | 84 + .../mthreads/proton/Dialect/CMakeLists.txt | 17 + .../include/Analysis/ScopeIdAllocation.h | 91 + .../proton/Dialect/include/CMakeLists.txt | 2 + .../Dialect/include/Conversion/CMakeLists.txt | 2 + .../Conversion/ProtonGPUToLLVM/CMakeLists.txt | 6 + .../Conversion/ProtonGPUToLLVM/Passes.h | 28 + .../Conversion/ProtonGPUToLLVM/Passes.td | 34 + .../PatternProtonGPUOpToLLVM.h | 22 + .../AMDPatternProtonGPUOpToLLVM.h | 20 + .../ProtonAMDGPUToLLVM/CMakeLists.txt | 3 + .../ProtonAMDGPUToLLVM/Passes.h | 30 + .../ProtonAMDGPUToLLVM/Passes.td | 31 + .../ProtonAMDGPUToLLVM/TargetInfo.h | 40 + .../ProtonNvidiaGPUToLLVM/CMakeLists.txt | 3 + .../NvidiaPatternProtonGPUOpToLLVM.h | 20 + .../ProtonNvidiaGPUToLLVM/Passes.h | 31 + .../ProtonNvidiaGPUToLLVM/Passes.td | 34 + .../ProtonNvidiaGPUToLLVM/TargetInfo.h | 34 + .../ProtonGPUToLLVM/TargetInfoBase.h | 43 + .../Conversion/ProtonGPUToLLVM/Utility.h | 55 + .../ProtonToProtonGPU/CMakeLists.txt | 3 + .../Conversion/ProtonToProtonGPU/Passes.h | 31 + .../Conversion/ProtonToProtonGPU/Passes.td | 80 + .../Dialect/include/Dialect/CMakeLists.txt | 2 + .../include/Dialect/Proton/CMakeLists.txt | 1 + .../include/Dialect/Proton/IR/CMakeLists.txt | 18 + .../include/Dialect/Proton/IR/Dialect.h | 14 + .../Dialect/Proton/IR/ProtonAttrDefs.td | 46 + .../Dialect/Proton/IR/ProtonDialect.td | 20 + .../include/Dialect/Proton/IR/ProtonOps.td | 45 + .../include/Dialect/ProtonGPU/CMakeLists.txt | 2 + .../Dialect/ProtonGPU/IR/CMakeLists.txt | 23 + .../include/Dialect/ProtonGPU/IR/Dialect.h | 35 + .../Dialect/ProtonGPU/IR/ProtonGPUAttrDefs.td | 71 + .../Dialect/ProtonGPU/IR/ProtonGPUDialect.td | 28 + .../Dialect/ProtonGPU/IR/ProtonGPUOps.td | 204 + .../Dialect/ProtonGPU/IR/ProtonGPUTypes.td | 43 + .../include/Dialect/ProtonGPU/IR/Types.h | 15 + .../ProtonGPU/Transforms/CMakeLists.txt | 3 + .../Dialect/ProtonGPU/Transforms/Passes.h | 17 + .../Dialect/ProtonGPU/Transforms/Passes.td | 15 + .../Dialect/include/compat/PTXAsmFormat.h | 347 + .../Dialect/include/compat/TargetInfo.h | 86 + .../proton/Dialect/include/compat/Utility.h | 71 + .../Dialect/lib/Analysis/CMakeLists.txt | 10 + .../lib/Analysis/ScopeIdAllocation.cpp | 403 ++ .../proton/Dialect/lib/CMakeLists.txt | 5 + .../proton/Dialect/lib/Dialect/CMakeLists.txt | 2 + .../Dialect/lib/Dialect/Proton/CMakeLists.txt | 1 + .../lib/Dialect/Proton/IR/CMakeLists.txt | 8 + .../Dialect/lib/Dialect/Proton/IR/Dialect.cpp | 34 + .../Dialect/lib/Dialect/Proton/IR/Ops.cpp | 12 + .../lib/Dialect/ProtonGPU/CMakeLists.txt | 2 + .../lib/Dialect/ProtonGPU/IR/CMakeLists.txt | 14 + .../lib/Dialect/ProtonGPU/IR/Dialect.cpp | 33 + .../Dialect/lib/Dialect/ProtonGPU/IR/Ops.cpp | 65 + .../lib/Dialect/ProtonGPU/IR/Types.cpp | 23 + .../ProtonGPU/Transforms/CMakeLists.txt | 8 + .../Transforms/ProtonGPUTransformsPass.cpp | 52 + .../AllocateProtonGlobalScratchBuffer.cpp | 51 + .../AllocateProtonSharedMemory.cpp | 60 + .../lib/ProtonGPUToLLVM/CMakeLists.txt | 19 + .../PatternProtonGPUOpToLLVM.cpp | 827 +++ .../AMDPatternProtonGPUOpToLLVM.cpp | 68 + .../ProtonAMDGPUToLLVM/AddSchedBarriers.cpp | 64 + .../ProtonAMDGPUToLLVM/CMakeLists.txt | 15 + .../ConvertProtonGPUToLLVM.cpp | 104 + .../ProtonAMDGPUToLLVM/TargetInfo.cpp | 160 + .../ProtonNvidiaGPUToLLVM/CMakeLists.txt | 38 + .../ConvertProtonGPUToLLVM.cpp | 106 + .../NvidiaPatternProtonGPUOpToLLVM.cpp | 106 + .../ProtonNvidiaGPUToLLVM/TargetInfo.cpp | 81 + .../Dialect/lib/ProtonGPUToLLVM/Utility.cpp | 179 + .../lib/ProtonToProtonGPU/CMakeLists.txt | 11 + .../ProtonToProtonGPUPass.cpp | 416 ++ .../Dialect/lib/compat/PTXAsmFormat.cpp | 237 + .../proton/Dialect/lib/compat/TargetInfo.cpp | 617 ++ .../proton/Dialect/lib/compat/Utility.cpp | 439 ++ .../mthreads/proton/Dialect/triton_proton.cc | 118 + third_party/mthreads/proton/README.md | 424 ++ .../mthreads/proton/common/CMakeLists.txt | 1 + .../mthreads/proton/common/include/Device.h | 48 + .../common/include/TraceDataIO/ByteSpan.h | 53 + .../TraceDataIO/CircularLayoutParser.h | 98 + .../common/include/TraceDataIO/EntryDecoder.h | 77 + .../common/include/TraceDataIO/Parser.h | 58 + .../common/include/TraceDataIO/TraceWriter.h | 71 + .../mthreads/proton/common/lib/CMakeLists.txt | 1 + .../common/lib/TraceDataIO/ByteSpan.cpp | 77 + .../common/lib/TraceDataIO/CMakeLists.txt | 7 + .../lib/TraceDataIO/CircularLayoutParser.cpp | 254 + .../common/lib/TraceDataIO/EntryDecoder.cpp | 34 + .../proton/common/lib/TraceDataIO/Parser.cpp | 25 + .../common/lib/TraceDataIO/TraceWriter.cpp | 249 + .../mthreads/proton/csrc/CMakeLists.txt | 5 + third_party/mthreads/proton/csrc/Proton.cpp | 209 + .../proton/csrc/include/Context/Context.h | 155 + .../proton/csrc/include/Context/Python.h | 21 + .../proton/csrc/include/Context/Shadow.h | 49 + .../mthreads/proton/csrc/include/Data/Data.h | 196 + .../proton/csrc/include/Data/Metric.h | 528 ++ .../proton/csrc/include/Data/PhaseStore.h | 111 + .../proton/csrc/include/Data/TraceData.h | 58 + .../proton/csrc/include/Data/TreeData.h | 72 + .../proton/csrc/include/Driver/Dispatch.h | 168 + .../proton/csrc/include/Driver/GPU/CudaApi.h | 72 + .../proton/csrc/include/Driver/GPU/CuptiApi.h | 129 + .../proton/csrc/include/Driver/GPU/HipApi.h | 79 + .../proton/csrc/include/Driver/GPU/HsaApi.h | 23 + .../proton/csrc/include/Driver/GPU/NvtxApi.h | 20 + .../csrc/include/Driver/GPU/RoctracerApi.h | 95 + .../include/Profiler/Cupti/CuptiPCSampling.h | 142 + .../include/Profiler/Cupti/CuptiProfiler.h | 22 + .../csrc/include/Profiler/GPUProfiler.h | 292 + .../proton/csrc/include/Profiler/Graph.h | 124 + .../Instrumentation/InstrumentationProfiler.h | 78 + .../Profiler/Instrumentation/Metadata.h | 30 + .../proton/csrc/include/Profiler/Profiler.h | 141 + .../Profiler/Roctracer/RoctracerProfiler.h | 22 + .../mthreads/proton/csrc/include/Proton.h | 9 + .../proton/csrc/include/Runtime/CudaRuntime.h | 42 + .../proton/csrc/include/Runtime/HipRuntime.h | 42 + .../proton/csrc/include/Runtime/Runtime.h | 66 + .../proton/csrc/include/Session/Session.h | 241 + .../proton/csrc/include/Utility/Atomic.h | 39 + .../proton/csrc/include/Utility/Env.h | 40 + .../proton/csrc/include/Utility/Errors.h | 15 + .../proton/csrc/include/Utility/Map.h | 107 + .../csrc/include/Utility/MsgPackWriter.h | 33 + .../proton/csrc/include/Utility/Numeric.h | 21 + .../proton/csrc/include/Utility/Set.h | 45 + .../proton/csrc/include/Utility/Singleton.h | 22 + .../proton/csrc/include/Utility/String.h | 69 + .../proton/csrc/include/Utility/Table.h | 83 + .../proton/csrc/include/Utility/Traits.h | 33 + .../proton/csrc/include/Utility/Vector.h | 82 + .../mthreads/proton/csrc/lib/CMakeLists.txt | 7 + .../proton/csrc/lib/Context/CMakeLists.txt | 5 + .../proton/csrc/lib/Context/Context.cpp | 12 + .../proton/csrc/lib/Context/Python.cpp | 83 + .../proton/csrc/lib/Context/Shadow.cpp | 52 + .../proton/csrc/lib/Data/CMakeLists.txt | 6 + .../mthreads/proton/csrc/lib/Data/Data.cpp | 120 + .../mthreads/proton/csrc/lib/Data/Metric.cpp | 199 + .../proton/csrc/lib/Data/TraceData.cpp | 515 ++ .../proton/csrc/lib/Data/TreeData.cpp | 740 ++ .../proton/csrc/lib/Driver/CMakeLists.txt | 9 + .../proton/csrc/lib/Driver/Device.cpp | 28 + .../proton/csrc/lib/Driver/GPU/CudaApi.cpp | 98 + .../proton/csrc/lib/Driver/GPU/CuptiApi.cpp | 111 + .../proton/csrc/lib/Driver/GPU/HipApi.cpp | 129 + .../proton/csrc/lib/Driver/GPU/HsaApi.cpp | 36 + .../proton/csrc/lib/Driver/GPU/NvtxApi.cpp | 39 + .../csrc/lib/Driver/GPU/RoctracerApi.cpp | 95 + .../proton/csrc/lib/Profiler/CMakeLists.txt | 10 + .../lib/Profiler/Cupti/CuptiPCSampling.cpp | 456 ++ .../csrc/lib/Profiler/Cupti/CuptiProfiler.cpp | 816 +++ .../proton/csrc/lib/Profiler/GPUProfiler.cpp | 280 + .../proton/csrc/lib/Profiler/Graph.cpp | 191 + .../InstrumentationProfiler.cpp | 274 + .../lib/Profiler/Instrumentation/Metadata.cpp | 28 + .../proton/csrc/lib/Profiler/Profiler.cpp | 7 + .../Profiler/RocTracer/RoctracerProfiler.cpp | 452 ++ .../proton/csrc/lib/Runtime/CMakeLists.txt | 4 + .../proton/csrc/lib/Runtime/CudaRuntime.cpp | 120 + .../proton/csrc/lib/Runtime/HipRuntime.cpp | 113 + .../proton/csrc/lib/Session/CMakeLists.txt | 3 + .../proton/csrc/lib/Session/Session.cpp | 361 + .../proton/csrc/lib/Utility/CMakeLists.txt | 3 + .../proton/csrc/lib/Utility/MsgPackWriter.cpp | 118 + .../mthreads/proton/proton/__init__.py | 12 + third_party/mthreads/proton/proton/context.py | 18 + third_party/mthreads/proton/proton/data.py | 96 + third_party/mthreads/proton/proton/flags.py | 28 + .../mthreads/proton/proton/hooks/__init__.py | 4 + .../mthreads/proton/proton/hooks/hook.py | 128 + .../proton/proton/hooks/instrumentation.py | 348 + .../mthreads/proton/proton/hooks/launch.py | 121 + .../mthreads/proton/proton/language.py | 65 + third_party/mthreads/proton/proton/metric.py | 91 + third_party/mthreads/proton/proton/mode.py | 123 + third_party/mthreads/proton/proton/profile.py | 262 + third_party/mthreads/proton/proton/proton.py | 88 + third_party/mthreads/proton/proton/scope.py | 133 + third_party/mthreads/proton/proton/specs.py | 69 + third_party/mthreads/proton/proton/state.py | 69 + third_party/mthreads/proton/proton/viewer.py | 428 ++ .../mthreads/proton/scripts/dump_ttgir.sh | 21 + .../mthreads/proton/test/CMakeLists.txt | 3 + third_party/mthreads/proton/test/conftest.py | 12 + .../mthreads/proton/test/examples/cuda.json | 86 + .../mthreads/proton/test/examples/frame.json | 58 + .../mthreads/proton/test/examples/hip.json | 86 + .../proton/test/examples/leaf_nodes.json | 168 + .../mthreads/proton/test/examples/triton.json | 73 + third_party/mthreads/proton/test/helper.py | 40 + .../mthreads/proton/test/helper_kernels.py | 45 + .../mthreads/proton/test/override_helper.py | 54 + third_party/mthreads/proton/test/test_api.py | 435 ++ third_party/mthreads/proton/test/test_cmd.py | 30 + .../proton/test/test_instrumentation.py | 1003 +++ third_party/mthreads/proton/test/test_lib.py | 95 + .../mthreads/proton/test/test_override.py | 101 + .../mthreads/proton/test/test_profile.py | 1187 ++++ .../mthreads/proton/test/test_viewer.py | 199 + .../proton/test/unittest/CMakeLists.txt | 1 + .../unittest/TraceDataIO/ByteSpanTest.cpp | 76 + .../test/unittest/TraceDataIO/CMakeLists.txt | 15 + .../TraceDataIO/ChromeTraceWriterTest.cpp | 211 + .../TraceDataIO/CircularLayoutParserTest.cpp | 275 + .../test/unittest/TraceDataIO/DecoderTest.cpp | 20 + .../proton/test/unittest/util/loop.bin | Bin 0 -> 568 bytes .../proton/test/unittest/util/seq.bin | Bin 0 -> 824 bytes .../proton/test/unittest/util/trace_gen.py | 74 + .../mthreads/proton/tutorials/dynamic-net.py | 103 + .../proton/tutorials/intra_kernel/README.md | 123 + .../tutorials/intra_kernel/example_dsl.py | 317 + .../intra_kernel/example_override.py | 98 + .../intra_kernel/insert_proton_records | 112 + .../mthreads/proton/tutorials/matmul.py | 318 + third_party/mthreads/python/src/gluon_ir.cc | 1189 ++++ .../mthreads/python/src/interpreter.cc | 740 ++ third_party/mthreads/python/src/ir.cc | 2094 ++++++ third_party/mthreads/python/src/ir.h | 100 + .../mthreads/python/src/linear_layout.cc | 223 + third_party/mthreads/python/src/llvm.cc | 943 +++ third_party/mthreads/python/src/main.cc | 62 + third_party/mthreads/python/src/passes.cc | 161 + third_party/mthreads/python/src/passes.h | 43 + third_party/mthreads/python/src/specialize.cc | 584 ++ third_party/mthreads/python/test/conftest.py | 66 + .../python/test/unit/language/print_helper.py | 170 + .../test/unit/language/test_annotations.py | 85 + .../test/unit/language/test_block_pointer.py | 118 + .../test/unit/language/test_compile_errors.py | 567 ++ .../test/unit/language/test_compile_only.py | 201 + .../test/unit/language/test_conversions.py | 444 ++ .../python/test/unit/language/test_core.py | 6325 +++++++++++++++++ .../test/unit/language/test_decorator.py | 50 + .../test/unit/language/test_frontend.py | 618 ++ .../test/unit/language/test_libdevice.py | 52 + .../test/unit/language/test_line_info.py | 447 ++ .../python/test/unit/language/test_module.py | 6 + .../test/unit/language/test_musa_ut_056.py | 152 + .../python/test/unit/language/test_mxfp.py | 127 + .../python/test/unit/language/test_random.py | 273 + .../test/unit/language/test_reproducer.py | 38 + .../test/unit/language/test_standard.py | 193 + .../python/test/unit/language/test_tuple.py | 359 + .../test/unit/runtime/test_autotuner.py | 436 ++ .../python/test/unit/runtime/test_bindings.py | 112 + .../python/test/unit/runtime/test_blaslt.py | 196 + .../python/test/unit/runtime/test_build.py | 91 + .../python/test/unit/runtime/test_cache.py | 893 +++ .../unit/runtime/test_compilation_listener.py | 66 + .../python/test/unit/runtime/test_driver.py | 150 + .../python/test/unit/runtime/test_launch.py | 234 + .../unit/runtime/test_out_of_resources.py | 96 + .../triton/_C/libtriton/linear_layout.pyi | 80 + .../mthreads/python/triton/__init__.py | 83 + .../mthreads/python/triton/_filecheck.py | 116 + .../python/triton/_internal_testing.py | 293 + third_party/mthreads/python/triton/_utils.py | 157 + .../python/triton/backends/__init__.py | 66 + .../python/triton/backends/compiler.py | 92 + .../mthreads/python/triton/backends/driver.py | 66 + .../python/triton/compiler/__init__.py | 7 + .../python/triton/compiler/code_generator.py | 1670 +++++ .../python/triton/compiler/compiler.py | 513 ++ .../mthreads/python/triton/compiler/errors.py | 51 + .../python/triton/compiler/make_launcher.py | 0 third_party/mthreads/python/triton/errors.py | 5 + .../python/triton/experimental/__init__.py | 0 .../triton/experimental/gluon/__init__.py | 6 + .../triton/experimental/gluon/_compiler.py | 0 .../triton/experimental/gluon/_runtime.py | 102 + .../triton/experimental/gluon/amd/__init__.py | 3 + .../triton/experimental/gluon/amd/gfx1250.py | 45 + .../experimental/gluon/language/__init__.py | 137 + .../experimental/gluon/language/_core.py | 642 ++ .../experimental/gluon/language/_layouts.py | 704 ++ .../experimental/gluon/language/_math.py | 20 + .../experimental/gluon/language/_semantic.py | 607 ++ .../experimental/gluon/language/_standard.py | 81 + .../gluon/language/amd/__init__.py | 8 + .../gluon/language/amd/_layouts.py | 190 + .../experimental/gluon/language/amd/_ops.py | 77 + .../gluon/language/amd/cdna3/__init__.py | 238 + .../gluon/language/amd/cdna4/__init__.py | 130 + .../gluon/language/amd/cdna4/async_copy.py | 170 + .../gluon/language/amd/gfx1250/__init__.py | 98 + .../gluon/language/amd/gfx1250/async_copy.py | 78 + .../gluon/language/amd/gfx1250/cluster.py | 21 + .../gluon/language/amd/gfx1250/mbarrier.py | 67 + .../gluon/language/amd/gfx1250/tdm.py | 250 + .../gluon/language/amd/rdna3/__init__.py | 17 + .../gluon/language/amd/rdna4/__init__.py | 17 + .../gluon/language/amd/warp_pipeline.py | 62 + .../gluon/language/extra/__init__.py | 3 + .../gluon/language/nvidia/__init__.py | 4 + .../gluon/language/nvidia/ampere/__init__.py | 32 + .../language/nvidia/ampere/async_copy.py | 74 + .../gluon/language/nvidia/ampere/mbarrier.py | 121 + .../language/nvidia/blackwell/__init__.py | 571 ++ .../gluon/language/nvidia/blackwell/float2.py | 172 + .../gluon/language/nvidia/blackwell/tma.py | 74 + .../gluon/language/nvidia/hopper/__init__.py | 141 + .../gluon/language/nvidia/hopper/cluster.py | 25 + .../gluon/language/nvidia/hopper/mbarrier.py | 66 + .../gluon/language/nvidia/hopper/tma.py | 218 + .../experimental/gluon/nvidia/__init__.py | 4 + .../experimental/gluon/nvidia/blackwell.py | 3 + .../experimental/gluon/nvidia/hopper.py | 62 + third_party/mthreads/python/triton/knobs.py | 652 ++ .../python/triton/language/__init__.py | 360 + .../mthreads/python/triton/language/core.py | 3561 ++++++++++ .../python/triton/language/extra/__init__.py | 26 + .../python/triton/language/extra/libdevice.py | 790 ++ .../mthreads/python/triton/language/math.py | 249 + .../mthreads/python/triton/language/random.py | 218 + .../python/triton/language/semantic.py | 1996 ++++++ .../python/triton/language/standard.py | 547 ++ .../python/triton/language/target_info.py | 54 + .../python/triton/runtime/__init__.py | 23 + .../python/triton/runtime/_allocation.py | 64 + .../python/triton/runtime/_async_compile.py | 67 + .../python/triton/runtime/autotuner.py | 488 ++ .../mthreads/python/triton/runtime/build.py | 97 + .../mthreads/python/triton/runtime/cache.py | 317 + .../mthreads/python/triton/runtime/driver.py | 51 + .../mthreads/python/triton/runtime/errors.py | 46 + .../python/triton/runtime/interpreter.py | 1483 ++++ .../mthreads/python/triton/runtime/jit.py | 1134 +++ third_party/mthreads/python/triton/testing.py | 571 ++ .../mthreads/python/triton/tools/__init__.py | 1 + .../python/triton/tools/build_extern.py | 365 + .../mthreads/python/triton/tools/compile.py | 211 + .../mthreads/python/triton/tools/disasm.py | 143 + .../triton/tools/experimental_descriptor.py | 52 + .../mthreads/python/triton/tools/link.py | 335 + .../mthreads/python/triton/tools/mxfp.py | 301 + .../python/triton/tools/ragged_tma.py | 108 + .../python/triton/tools/tensor_descriptor.py | 36 + .../triton_to_gluon_translater/translator.py | 383 + .../translator_helpers.py | 618 ++ third_party/mthreads/triton_mthreads.cc | 189 + 766 files changed, 168021 insertions(+), 6 deletions(-) create mode 100644 python/setup_tools/utils/mthreads.py create mode 100644 third_party/mthreads/CMakeLists.txt create mode 100644 third_party/mthreads/backend/__init__.py create mode 100644 third_party/mthreads/backend/compiler.py create mode 100644 third_party/mthreads/backend/driver.c create mode 100644 third_party/mthreads/backend/driver.py create mode 100644 third_party/mthreads/backend/lib/libdevice.31.bc create mode 100644 third_party/mthreads/bin/CMakeLists.txt create mode 100644 third_party/mthreads/bin/RegisterTritonDialects.h create mode 100644 third_party/mthreads/bin/triton-llvm-opt.cpp create mode 100644 third_party/mthreads/bin/triton-opt.cpp create mode 100644 third_party/mthreads/bin/triton-reduce.cpp create mode 100644 third_party/mthreads/bin/triton-tensor-layout.cpp create mode 100644 third_party/mthreads/include/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Analysis/Alias.h create mode 100644 third_party/mthreads/include/triton/Analysis/Allocation.h create mode 100644 third_party/mthreads/include/triton/Analysis/AxisInfo.h create mode 100644 third_party/mthreads/include/triton/Analysis/BufferRegion.h create mode 100644 third_party/mthreads/include/triton/Analysis/Membar.h create mode 100644 third_party/mthreads/include/triton/Analysis/Utility.h create mode 100644 third_party/mthreads/include/triton/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Conversion/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Conversion/MLIRTypes.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/AsmFormat.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/FMADotUtility.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/Passes.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/Passes.td create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/Utility.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonGPUToLLVM/WarpSpecializeUtility.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonToTritonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Conversion/TritonToTritonGPU/Passes.h create mode 100644 third_party/mthreads/include/triton/Conversion/TritonToTritonGPU/Passes.td create mode 100644 third_party/mthreads/include/triton/Dialect/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/CMakeCache.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/IR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/IR/Dialect.h create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/IR/GluonAttrDefs.td create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/IR/GluonDialect.td create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/IR/GluonOps.td create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/Transforms/InferLayoutUtils.h create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/Transforms/Passes.h create mode 100644 third_party/mthreads/include/triton/Dialect/Gluon/Transforms/Passes.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVGPU/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/NVGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/NVGPU/IR/Dialect.h create mode 100644 third_party/mthreads/include/triton/Dialect/NVGPU/IR/NVGPUAttrDefs.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVGPU/IR/NVGPUDialect.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVGPU/IR/NVGPUOps.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/IR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/IR/Dialect.h create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/IR/NVWSAttrDefs.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/IR/NVWSDialect.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/IR/NVWSOpInterfaces.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/IR/NVWSOps.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/IR/NVWSTypes.td create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/Transforms/Passes.h create mode 100644 third_party/mthreads/include/triton/Dialect/NVWS/Transforms/Passes.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/Dialect.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/DiscardableAttributes.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/Interfaces.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/OpInterfaces.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/Traits.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/TritonAttrDefs.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/TritonDialect.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/TritonInterfaces.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/TritonOpInterfaces.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/TritonOps.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/TritonTypes.td create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/Types.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/IR/Utility.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/Transforms/ArithTypeConversion.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/Transforms/FunctionTypeConversion.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/Transforms/LoopPeeling.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/Transforms/Passes.h create mode 100644 third_party/mthreads/include/triton/Dialect/Triton/Transforms/Passes.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/Attributes.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/CGAEncodingAttr.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/CGAEncodingAttr.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/Dialect.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/Traits.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrBase.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrImpls.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUEnums.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUOpInterfaces.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/IR/Types.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/CoalesceUtils.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/LayoutPropagationUtility.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/Partition.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/PartitionSchedulingUtility.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/Passes.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/Passes.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/PipelineExpander.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/Schedule.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/Utility.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonGPU/Transforms/WarpSpecialization.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/Dialect.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/FunctionBuilder.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/TritonInstrument.md create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/TritonInstrumentAttrDefs.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/TritonInstrumentDialect.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/TritonInstrumentOps.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/IR/Utility.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/Transforms/Passes.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonInstrument/Transforms/Passes.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUDialect.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUTypes.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td create mode 100644 third_party/mthreads/include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h create mode 100644 third_party/mthreads/include/triton/Target/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Target/LLVMIR/CMakeLists.txt create mode 100644 third_party/mthreads/include/triton/Target/LLVMIR/Passes.h create mode 100644 third_party/mthreads/include/triton/Target/LLVMIR/Passes.td create mode 100644 third_party/mthreads/include/triton/Tools/GenericSwizzling.h create mode 100644 third_party/mthreads/include/triton/Tools/LayoutUtils.h create mode 100644 third_party/mthreads/include/triton/Tools/LinearLayout.h create mode 100644 third_party/mthreads/include/triton/Tools/PluginUtils.h create mode 100644 third_party/mthreads/include/triton/Tools/StrUtil.h create mode 100644 third_party/mthreads/include/triton/Tools/Sys/GetEnv.hpp create mode 100644 third_party/mthreads/language/musa/__init__.py create mode 100644 third_party/mthreads/language/musa/libdevice.py create mode 100644 third_party/mthreads/language/musa/utils.py create mode 100644 third_party/mthreads/lib/Analysis/Alias.cpp create mode 100644 third_party/mthreads/lib/Analysis/Allocation.cpp create mode 100644 third_party/mthreads/lib/Analysis/AxisInfo.cpp create mode 100644 third_party/mthreads/lib/Analysis/BufferRegion.cpp create mode 100644 third_party/mthreads/lib/Analysis/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Analysis/Membar.cpp create mode 100644 third_party/mthreads/lib/Analysis/Utility.cpp create mode 100644 third_party/mthreads/lib/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Conversion/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/AllocateSharedMemory.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/AllocateWarpGroups.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/FMA.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/FMADotUtility.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/GatherOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/GlobalScratchMemoryAllocation.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/HistogramOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/MakeRangeOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/PrintOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/ReduceScanCommon.h create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/SPMDOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/Utility.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonGPUToLLVM/WarpSpecializeUtility.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonInstrumentToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Conversion/TritonInstrumentToLLVM/InstrumentationToLLVM.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonToTritonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Conversion/TritonToTritonGPU/RelayoutTritonGPU.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonToTritonGPU/TritonGPUConversion.cpp create mode 100644 third_party/mthreads/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp create mode 100644 third_party/mthreads/lib/Dialect/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/Gluon/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/Gluon/IR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/Gluon/IR/Dialect.cpp create mode 100644 third_party/mthreads/lib/Dialect/Gluon/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/Gluon/Transforms/Canonicalize.cpp create mode 100644 third_party/mthreads/lib/Dialect/Gluon/Transforms/InferCoalescedEncodings.cpp create mode 100644 third_party/mthreads/lib/Dialect/Gluon/Transforms/InferLayoutUtils.cpp create mode 100644 third_party/mthreads/lib/Dialect/Gluon/Transforms/Inline.cpp create mode 100644 third_party/mthreads/lib/Dialect/Gluon/Transforms/ResolveAutoEncodings.cpp create mode 100644 third_party/mthreads/lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVGPU/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/NVGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/NVGPU/IR/Dialect.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/NVWS/IR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/NVWS/IR/Dialect.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/IR/Ops.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/AssignStagePhase.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/HoistTmemStore.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/InsertAref.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/InsertTmemAref.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/LowerAref.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/LowerWarpGroup.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/Utilities.cpp create mode 100644 third_party/mthreads/lib/Dialect/NVWS/Transforms/Utilities.h create mode 100644 third_party/mthreads/lib/Dialect/Triton/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/Canonicalize.td create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/Dialect.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/DiscardableAttributes.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/OpInterfaces.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/Ops.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/Traits.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/Types.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/IR/Utility.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/ArithTypeConversion.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/Combine.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/Combine.td create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/FunctionTypeConversion.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/LoopAwareCSE.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/LoopInvariantCodeMotion.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/LoopPeeling.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/LoopUnroll.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/ReorderBroadcast.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp create mode 100644 third_party/mthreads/lib/Dialect/Triton/Transforms/RewriteTensorPointer.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/IR/Dialect.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/IR/Ops.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/IR/Types.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/CoalesceUtils.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/CombineTensorSelectAndIf.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/F32DotTC.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/LayoutPropagationUtility.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/OptimizeThreadLocality.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/MMAv5PipelineUtility.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipelineExpander.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/TestPipelineLowerLoop.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/Utility.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/OptimizePartitionWarps.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/Partition.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionScheduling.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionSchedulingUtility.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/IR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/IR/Dialect.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/IR/FunctionBuilder.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/IR/Ops.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/IR/Utility.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonInstrument/Transforms/ConcurrencySanitizer.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/IR/Dialect.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/CheckMatmulTwoCTAs.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/FenceInsertion.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/InterleaveTMem.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeDescriptorEncoding.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeTMemLayouts.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/PlanCTA.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/PromoteLHSToTMem.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/ProxyFenceInsertion.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/RemoveTMEMTokens.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.cpp create mode 100644 third_party/mthreads/lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp create mode 100644 third_party/mthreads/lib/Target/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Target/LLVMIR/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Target/LLVMIR/LLVMDILocalVariable.cpp create mode 100644 third_party/mthreads/lib/Target/LLVMIR/LLVMDIScope.cpp create mode 100644 third_party/mthreads/lib/Target/LLVMIR/LLVMDIUtils.cpp create mode 100644 third_party/mthreads/lib/Target/LLVMIR/LLVMDIUtils.h create mode 100644 third_party/mthreads/lib/Target/LLVMIR/LLVMIRBreakPhiStruct.cpp create mode 100644 third_party/mthreads/lib/Target/LLVMIR/LLVMPasses.h create mode 100644 third_party/mthreads/lib/Tools/CMakeLists.txt create mode 100644 third_party/mthreads/lib/Tools/GenericSwizzling.cpp create mode 100644 third_party/mthreads/lib/Tools/LayoutUtils.cpp create mode 100644 third_party/mthreads/lib/Tools/LinearLayout.cpp create mode 100644 third_party/mthreads/lib/Tools/PluginUtils.cpp create mode 100644 third_party/mthreads/musa/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/Dialect/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/Dialect/MTGPU/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/Dialect/MTGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/Dialect/MTGPU/IR/Dialect.h create mode 100644 third_party/mthreads/musa/include/Dialect/MTGPU/IR/MTGPUDialect.td create mode 100644 third_party/mthreads/musa/include/Dialect/MTGPU/IR/MTGPUOps.td create mode 100644 third_party/mthreads/musa/include/Dialect/MTGPU/IR/MTGPUTypes.td create mode 100644 third_party/mthreads/musa/include/Dialect/MUSA/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/Dialect/MUSA/IR/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/Dialect/MUSA/IR/Dialect.h create mode 100644 third_party/mthreads/musa/include/Dialect/MUSA/IR/MUSAAttrDefs.td create mode 100644 third_party/mthreads/musa/include/Dialect/MUSA/IR/MUSADialect.td create mode 100644 third_party/mthreads/musa/include/Dialect/MUSA/IR/MUSAOps.td create mode 100644 third_party/mthreads/musa/include/MTGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/MTGPUToLLVM/MTGPUToLLVMPass.h create mode 100644 third_party/mthreads/musa/include/MTGPUToLLVM/Passes.h create mode 100644 third_party/mthreads/musa/include/MTGPUToLLVM/Passes.td create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/BarrierUtils.h create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/MMAContractUtils.h create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/MMAEncodingUtils.h create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/MMAOperandUtils.h create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/MatmulPolicy.h create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/MemDescUtils.h create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/SqmmaAttrUtils.h create mode 100644 third_party/mthreads/musa/include/TritonMUSACommon/TMEUtils.h create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUToLLVM/Allocation.h create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUToLLVM/Passes.h create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUToLLVM/Passes.td create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUToLLVM/TargetInfo.h create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUToLLVM/Utility.h create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUTransforms/CMakeLists.txt create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUTransforms/Passes.h create mode 100644 third_party/mthreads/musa/include/TritonMUSAGPUTransforms/Passes.td create mode 100644 third_party/mthreads/musa/lib/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/Dialect/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/Dialect/MTGPU/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/Dialect/MTGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/Dialect/MTGPU/IR/Dialect.cpp create mode 100644 third_party/mthreads/musa/lib/Dialect/MUSA/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/Dialect/MUSA/IR/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/Dialect/MUSA/IR/Dialect.cpp create mode 100644 third_party/mthreads/musa/lib/MTGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/MTGPUToLLVM/MTGPUToLLVMPass.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/AllocateSharedMemory.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/BarrierOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/ConvertLayoutOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/DotOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/DotOpToLLVM/DotOpToLLVM.h create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/DotOpToLLVM/SQMMA.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/DotOpToLLVM/WMMA.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/ElementwiseOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/LoadStoreOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/MUSAOpsToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/PatternTritonGPUOpToLLVM.h create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/SPMDOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/TargetInfo.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/TensorPtrOpsToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/ThreadIdOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/TritonGPUToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/Utility.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUToLLVM/WarpIdOpToLLVM.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/AccelerateMUSAMatmul.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/CMakeLists.txt create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/CanonicalizeSqmmaResultConversions.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/ConvertSqmmaToMTGPU.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/FinalizeBarriers.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/IssueBarrierInsertion.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/MarkInplaceLoads.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/OptimizeAccumulatorInit.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/OptimizeDescriptorEncoding.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/OptimizeDotOperands.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/OptimizeSqmmaAccumulatorLayout.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/Pipeline.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/SqmmaPipelineUtils.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/SqmmaPipelineUtils.h create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/TMELowering.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/TMEPipelineUtils.cpp create mode 100644 third_party/mthreads/musa/lib/TritonMUSAGPUTransforms/TMEPipelineUtils.h create mode 100644 third_party/mthreads/proton/.gitignore create mode 100644 third_party/mthreads/proton/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Analysis/ScopeIdAllocation.h create mode 100644 third_party/mthreads/proton/Dialect/include/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/Passes.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/Passes.td create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/PatternProtonGPUOpToLLVM.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/AMDPatternProtonGPUOpToLLVM.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/Passes.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/Passes.td create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/TargetInfo.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/NvidiaPatternProtonGPUOpToLLVM.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/Passes.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/Passes.td create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/TargetInfo.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/TargetInfoBase.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonGPUToLLVM/Utility.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonToProtonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonToProtonGPU/Passes.h create mode 100644 third_party/mthreads/proton/Dialect/include/Conversion/ProtonToProtonGPU/Passes.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/Proton/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/Proton/IR/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/Proton/IR/Dialect.h create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/Proton/IR/ProtonDialect.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/Proton/IR/ProtonOps.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/IR/Dialect.h create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/IR/ProtonGPUAttrDefs.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/IR/ProtonGPUDialect.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/IR/ProtonGPUOps.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/IR/ProtonGPUTypes.td create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/IR/Types.h create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/Transforms/Passes.h create mode 100644 third_party/mthreads/proton/Dialect/include/Dialect/ProtonGPU/Transforms/Passes.td create mode 100644 third_party/mthreads/proton/Dialect/include/compat/PTXAsmFormat.h create mode 100644 third_party/mthreads/proton/Dialect/include/compat/TargetInfo.h create mode 100644 third_party/mthreads/proton/Dialect/include/compat/Utility.h create mode 100644 third_party/mthreads/proton/Dialect/lib/Analysis/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Analysis/ScopeIdAllocation.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/Proton/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/Proton/IR/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/Proton/IR/Dialect.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/Proton/IR/Ops.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/ProtonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/ProtonGPU/IR/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/ProtonGPU/IR/Dialect.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/ProtonGPU/IR/Ops.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/ProtonGPU/IR/Types.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/ProtonGPU/Transforms/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/Dialect/ProtonGPU/Transforms/ProtonGPUTransformsPass.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/AllocateProtonGlobalScratchBuffer.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/AllocateProtonSharedMemory.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/PatternProtonGPUOpToLLVM.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/AMDPatternProtonGPUOpToLLVM.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/AddSchedBarriers.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/ConvertProtonGPUToLLVM.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/TargetInfo.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/ConvertProtonGPUToLLVM.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/NvidiaPatternProtonGPUOpToLLVM.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/TargetInfo.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonGPUToLLVM/Utility.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonToProtonGPU/CMakeLists.txt create mode 100644 third_party/mthreads/proton/Dialect/lib/ProtonToProtonGPU/ProtonToProtonGPUPass.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/compat/PTXAsmFormat.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/compat/TargetInfo.cpp create mode 100644 third_party/mthreads/proton/Dialect/lib/compat/Utility.cpp create mode 100644 third_party/mthreads/proton/Dialect/triton_proton.cc create mode 100644 third_party/mthreads/proton/README.md create mode 100644 third_party/mthreads/proton/common/CMakeLists.txt create mode 100644 third_party/mthreads/proton/common/include/Device.h create mode 100644 third_party/mthreads/proton/common/include/TraceDataIO/ByteSpan.h create mode 100644 third_party/mthreads/proton/common/include/TraceDataIO/CircularLayoutParser.h create mode 100644 third_party/mthreads/proton/common/include/TraceDataIO/EntryDecoder.h create mode 100644 third_party/mthreads/proton/common/include/TraceDataIO/Parser.h create mode 100644 third_party/mthreads/proton/common/include/TraceDataIO/TraceWriter.h create mode 100644 third_party/mthreads/proton/common/lib/CMakeLists.txt create mode 100644 third_party/mthreads/proton/common/lib/TraceDataIO/ByteSpan.cpp create mode 100644 third_party/mthreads/proton/common/lib/TraceDataIO/CMakeLists.txt create mode 100644 third_party/mthreads/proton/common/lib/TraceDataIO/CircularLayoutParser.cpp create mode 100644 third_party/mthreads/proton/common/lib/TraceDataIO/EntryDecoder.cpp create mode 100644 third_party/mthreads/proton/common/lib/TraceDataIO/Parser.cpp create mode 100644 third_party/mthreads/proton/common/lib/TraceDataIO/TraceWriter.cpp create mode 100644 third_party/mthreads/proton/csrc/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/Proton.cpp create mode 100644 third_party/mthreads/proton/csrc/include/Context/Context.h create mode 100644 third_party/mthreads/proton/csrc/include/Context/Python.h create mode 100644 third_party/mthreads/proton/csrc/include/Context/Shadow.h create mode 100644 third_party/mthreads/proton/csrc/include/Data/Data.h create mode 100644 third_party/mthreads/proton/csrc/include/Data/Metric.h create mode 100644 third_party/mthreads/proton/csrc/include/Data/PhaseStore.h create mode 100644 third_party/mthreads/proton/csrc/include/Data/TraceData.h create mode 100644 third_party/mthreads/proton/csrc/include/Data/TreeData.h create mode 100644 third_party/mthreads/proton/csrc/include/Driver/Dispatch.h create mode 100644 third_party/mthreads/proton/csrc/include/Driver/GPU/CudaApi.h create mode 100644 third_party/mthreads/proton/csrc/include/Driver/GPU/CuptiApi.h create mode 100644 third_party/mthreads/proton/csrc/include/Driver/GPU/HipApi.h create mode 100644 third_party/mthreads/proton/csrc/include/Driver/GPU/HsaApi.h create mode 100644 third_party/mthreads/proton/csrc/include/Driver/GPU/NvtxApi.h create mode 100644 third_party/mthreads/proton/csrc/include/Driver/GPU/RoctracerApi.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/GPUProfiler.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/Graph.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/Instrumentation/InstrumentationProfiler.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/Instrumentation/Metadata.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/Profiler.h create mode 100644 third_party/mthreads/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h create mode 100644 third_party/mthreads/proton/csrc/include/Proton.h create mode 100644 third_party/mthreads/proton/csrc/include/Runtime/CudaRuntime.h create mode 100644 third_party/mthreads/proton/csrc/include/Runtime/HipRuntime.h create mode 100644 third_party/mthreads/proton/csrc/include/Runtime/Runtime.h create mode 100644 third_party/mthreads/proton/csrc/include/Session/Session.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Atomic.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Env.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Errors.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Map.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/MsgPackWriter.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Numeric.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Set.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Singleton.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/String.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Table.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Traits.h create mode 100644 third_party/mthreads/proton/csrc/include/Utility/Vector.h create mode 100644 third_party/mthreads/proton/csrc/lib/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Context/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Context/Context.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Context/Python.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Context/Shadow.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Data/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Data/Data.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Data/Metric.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Data/TraceData.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Data/TreeData.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/Device.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/GPU/CudaApi.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/GPU/CuptiApi.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/GPU/HipApi.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/GPU/HsaApi.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/GPU/NvtxApi.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Driver/GPU/RoctracerApi.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/GPUProfiler.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/Graph.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/Instrumentation/InstrumentationProfiler.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/Instrumentation/Metadata.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/Profiler.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Runtime/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Runtime/CudaRuntime.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Runtime/HipRuntime.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Session/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Session/Session.cpp create mode 100644 third_party/mthreads/proton/csrc/lib/Utility/CMakeLists.txt create mode 100644 third_party/mthreads/proton/csrc/lib/Utility/MsgPackWriter.cpp create mode 100644 third_party/mthreads/proton/proton/__init__.py create mode 100644 third_party/mthreads/proton/proton/context.py create mode 100644 third_party/mthreads/proton/proton/data.py create mode 100644 third_party/mthreads/proton/proton/flags.py create mode 100644 third_party/mthreads/proton/proton/hooks/__init__.py create mode 100644 third_party/mthreads/proton/proton/hooks/hook.py create mode 100644 third_party/mthreads/proton/proton/hooks/instrumentation.py create mode 100644 third_party/mthreads/proton/proton/hooks/launch.py create mode 100644 third_party/mthreads/proton/proton/language.py create mode 100644 third_party/mthreads/proton/proton/metric.py create mode 100644 third_party/mthreads/proton/proton/mode.py create mode 100644 third_party/mthreads/proton/proton/profile.py create mode 100644 third_party/mthreads/proton/proton/proton.py create mode 100644 third_party/mthreads/proton/proton/scope.py create mode 100644 third_party/mthreads/proton/proton/specs.py create mode 100644 third_party/mthreads/proton/proton/state.py create mode 100644 third_party/mthreads/proton/proton/viewer.py create mode 100755 third_party/mthreads/proton/scripts/dump_ttgir.sh create mode 100644 third_party/mthreads/proton/test/CMakeLists.txt create mode 100644 third_party/mthreads/proton/test/conftest.py create mode 100644 third_party/mthreads/proton/test/examples/cuda.json create mode 100644 third_party/mthreads/proton/test/examples/frame.json create mode 100644 third_party/mthreads/proton/test/examples/hip.json create mode 100644 third_party/mthreads/proton/test/examples/leaf_nodes.json create mode 100644 third_party/mthreads/proton/test/examples/triton.json create mode 100644 third_party/mthreads/proton/test/helper.py create mode 100644 third_party/mthreads/proton/test/helper_kernels.py create mode 100644 third_party/mthreads/proton/test/override_helper.py create mode 100644 third_party/mthreads/proton/test/test_api.py create mode 100644 third_party/mthreads/proton/test/test_cmd.py create mode 100644 third_party/mthreads/proton/test/test_instrumentation.py create mode 100644 third_party/mthreads/proton/test/test_lib.py create mode 100644 third_party/mthreads/proton/test/test_override.py create mode 100644 third_party/mthreads/proton/test/test_profile.py create mode 100644 third_party/mthreads/proton/test/test_viewer.py create mode 100644 third_party/mthreads/proton/test/unittest/CMakeLists.txt create mode 100644 third_party/mthreads/proton/test/unittest/TraceDataIO/ByteSpanTest.cpp create mode 100644 third_party/mthreads/proton/test/unittest/TraceDataIO/CMakeLists.txt create mode 100644 third_party/mthreads/proton/test/unittest/TraceDataIO/ChromeTraceWriterTest.cpp create mode 100644 third_party/mthreads/proton/test/unittest/TraceDataIO/CircularLayoutParserTest.cpp create mode 100644 third_party/mthreads/proton/test/unittest/TraceDataIO/DecoderTest.cpp create mode 100644 third_party/mthreads/proton/test/unittest/util/loop.bin create mode 100644 third_party/mthreads/proton/test/unittest/util/seq.bin create mode 100644 third_party/mthreads/proton/test/unittest/util/trace_gen.py create mode 100644 third_party/mthreads/proton/tutorials/dynamic-net.py create mode 100644 third_party/mthreads/proton/tutorials/intra_kernel/README.md create mode 100644 third_party/mthreads/proton/tutorials/intra_kernel/example_dsl.py create mode 100644 third_party/mthreads/proton/tutorials/intra_kernel/example_override.py create mode 100755 third_party/mthreads/proton/tutorials/intra_kernel/insert_proton_records create mode 100644 third_party/mthreads/proton/tutorials/matmul.py create mode 100644 third_party/mthreads/python/src/gluon_ir.cc create mode 100644 third_party/mthreads/python/src/interpreter.cc create mode 100644 third_party/mthreads/python/src/ir.cc create mode 100644 third_party/mthreads/python/src/ir.h create mode 100644 third_party/mthreads/python/src/linear_layout.cc create mode 100644 third_party/mthreads/python/src/llvm.cc create mode 100644 third_party/mthreads/python/src/main.cc create mode 100644 third_party/mthreads/python/src/passes.cc create mode 100644 third_party/mthreads/python/src/passes.h create mode 100644 third_party/mthreads/python/src/specialize.cc create mode 100644 third_party/mthreads/python/test/conftest.py create mode 100644 third_party/mthreads/python/test/unit/language/print_helper.py create mode 100644 third_party/mthreads/python/test/unit/language/test_annotations.py create mode 100644 third_party/mthreads/python/test/unit/language/test_block_pointer.py create mode 100644 third_party/mthreads/python/test/unit/language/test_compile_errors.py create mode 100644 third_party/mthreads/python/test/unit/language/test_compile_only.py create mode 100644 third_party/mthreads/python/test/unit/language/test_conversions.py create mode 100644 third_party/mthreads/python/test/unit/language/test_core.py create mode 100644 third_party/mthreads/python/test/unit/language/test_decorator.py create mode 100644 third_party/mthreads/python/test/unit/language/test_frontend.py create mode 100644 third_party/mthreads/python/test/unit/language/test_libdevice.py create mode 100644 third_party/mthreads/python/test/unit/language/test_line_info.py create mode 100644 third_party/mthreads/python/test/unit/language/test_module.py create mode 100644 third_party/mthreads/python/test/unit/language/test_musa_ut_056.py create mode 100644 third_party/mthreads/python/test/unit/language/test_mxfp.py create mode 100644 third_party/mthreads/python/test/unit/language/test_random.py create mode 100644 third_party/mthreads/python/test/unit/language/test_reproducer.py create mode 100644 third_party/mthreads/python/test/unit/language/test_standard.py create mode 100644 third_party/mthreads/python/test/unit/language/test_tuple.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_autotuner.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_bindings.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_blaslt.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_build.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_cache.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_compilation_listener.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_driver.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_launch.py create mode 100644 third_party/mthreads/python/test/unit/runtime/test_out_of_resources.py create mode 100644 third_party/mthreads/python/triton/_C/libtriton/linear_layout.pyi create mode 100644 third_party/mthreads/python/triton/__init__.py create mode 100644 third_party/mthreads/python/triton/_filecheck.py create mode 100644 third_party/mthreads/python/triton/_internal_testing.py create mode 100644 third_party/mthreads/python/triton/_utils.py create mode 100644 third_party/mthreads/python/triton/backends/__init__.py create mode 100644 third_party/mthreads/python/triton/backends/compiler.py create mode 100644 third_party/mthreads/python/triton/backends/driver.py create mode 100644 third_party/mthreads/python/triton/compiler/__init__.py create mode 100644 third_party/mthreads/python/triton/compiler/code_generator.py create mode 100644 third_party/mthreads/python/triton/compiler/compiler.py create mode 100644 third_party/mthreads/python/triton/compiler/errors.py create mode 100644 third_party/mthreads/python/triton/compiler/make_launcher.py create mode 100644 third_party/mthreads/python/triton/errors.py create mode 100644 third_party/mthreads/python/triton/experimental/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/_compiler.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/_runtime.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/amd/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/amd/gfx1250.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/_core.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/_layouts.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/_math.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/_semantic.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/_standard.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/_layouts.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/_ops.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/cdna3/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/cdna4/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/cdna4/async_copy.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/gfx1250/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/gfx1250/async_copy.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/gfx1250/cluster.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/gfx1250/mbarrier.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/gfx1250/tdm.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/rdna3/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/rdna4/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/amd/warp_pipeline.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/extra/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/ampere/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/ampere/async_copy.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/ampere/mbarrier.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/blackwell/float2.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/blackwell/tma.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/hopper/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/hopper/cluster.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/hopper/mbarrier.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/language/nvidia/hopper/tma.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/nvidia/__init__.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/nvidia/blackwell.py create mode 100644 third_party/mthreads/python/triton/experimental/gluon/nvidia/hopper.py create mode 100644 third_party/mthreads/python/triton/knobs.py create mode 100644 third_party/mthreads/python/triton/language/__init__.py create mode 100644 third_party/mthreads/python/triton/language/core.py create mode 100644 third_party/mthreads/python/triton/language/extra/__init__.py create mode 100644 third_party/mthreads/python/triton/language/extra/libdevice.py create mode 100644 third_party/mthreads/python/triton/language/math.py create mode 100644 third_party/mthreads/python/triton/language/random.py create mode 100644 third_party/mthreads/python/triton/language/semantic.py create mode 100644 third_party/mthreads/python/triton/language/standard.py create mode 100644 third_party/mthreads/python/triton/language/target_info.py create mode 100644 third_party/mthreads/python/triton/runtime/__init__.py create mode 100644 third_party/mthreads/python/triton/runtime/_allocation.py create mode 100644 third_party/mthreads/python/triton/runtime/_async_compile.py create mode 100644 third_party/mthreads/python/triton/runtime/autotuner.py create mode 100644 third_party/mthreads/python/triton/runtime/build.py create mode 100644 third_party/mthreads/python/triton/runtime/cache.py create mode 100644 third_party/mthreads/python/triton/runtime/driver.py create mode 100644 third_party/mthreads/python/triton/runtime/errors.py create mode 100644 third_party/mthreads/python/triton/runtime/interpreter.py create mode 100644 third_party/mthreads/python/triton/runtime/jit.py create mode 100644 third_party/mthreads/python/triton/testing.py create mode 100644 third_party/mthreads/python/triton/tools/__init__.py create mode 100644 third_party/mthreads/python/triton/tools/build_extern.py create mode 100644 third_party/mthreads/python/triton/tools/compile.py create mode 100644 third_party/mthreads/python/triton/tools/disasm.py create mode 100644 third_party/mthreads/python/triton/tools/experimental_descriptor.py create mode 100644 third_party/mthreads/python/triton/tools/link.py create mode 100644 third_party/mthreads/python/triton/tools/mxfp.py create mode 100644 third_party/mthreads/python/triton/tools/ragged_tma.py create mode 100644 third_party/mthreads/python/triton/tools/tensor_descriptor.py create mode 100644 third_party/mthreads/python/triton/tools/triton_to_gluon_translater/translator.py create mode 100644 third_party/mthreads/python/triton/tools/triton_to_gluon_translater/translator_helpers.py create mode 100644 third_party/mthreads/triton_mthreads.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index cb0c19d7ee..1d6b909fe7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,8 @@ elseif(FLAGTREE_BACKEND STREQUAL "mthreads") set(ENV{PATH} "$ENV{LLVM_SYSPATH}/bin:$ENV{PATH}") set(CMAKE_C_COMPILER clang) set(CMAKE_CXX_COMPILER clang++) - set(ENV{FLAGTREE_PLUGIN} $ENV{FLAGTREE_BACKEND}) + set(FLAGTREE_TLE OFF) + remove_definitions(-D__TLE__) elseif(FLAGTREE_BACKEND STREQUAL "aipu") set(CMAKE_C_COMPILER clang-16) set(CMAKE_CXX_COMPILER clang++-16) @@ -281,14 +282,19 @@ if(TRITON_BUILD_PYTHON_MODULE) include_directories(${PROJECT_BINARY_DIR}/third_party/${FLAGTREE_BACKEND}) add_subdirectory(third_party/hcu/proton/Dialect) add_subdirectory(third_party/nvidia) + elseif(FLAGTREE_BACKEND AND FLAGTREE_BACKEND STREQUAL "mthreads") + include_directories(${PROJECT_BINARY_DIR}/third_party/${FLAGTREE_BACKEND}) + add_subdirectory(third_party/mthreads/proton/Dialect) else() list(APPEND TRITON_PLUGIN_NAMES "proton") add_subdirectory(third_party/proton/Dialect) endif() # Add TLE plugin - list(APPEND TRITON_PLUGIN_NAMES "tle") - add_subdirectory(third_party/tle) + if(FLAGTREE_TLE) + list(APPEND TRITON_PLUGIN_NAMES "tle") + add_subdirectory(third_party/tle) + endif() if (DEFINED TRITON_PLUGIN_DIRS) foreach(PLUGIN_DIR ${TRITON_PLUGIN_DIRS}) @@ -499,7 +505,9 @@ if(NOT TRITON_BUILD_PYTHON_MODULE) endforeach() add_subdirectory(third_party/proton/Dialect) # flagtree tle - add_subdirectory(third_party/tle) + if(FLAGTREE_TLE) + add_subdirectory(third_party/tle) + endif() endif() find_package(Threads REQUIRED) diff --git a/python/setup_tools/utils/mthreads.py b/python/setup_tools/utils/mthreads.py new file mode 100644 index 0000000000..13ed22615b --- /dev/null +++ b/python/setup_tools/utils/mthreads.py @@ -0,0 +1,123 @@ +import sys +import shutil +import inspect +from pathlib import Path + +from setuptools import find_packages + +MTHREADS_PYTHON_ROOT = "third_party/mthreads/python" +FLAGTREE_PYTHON_ROOT = "python" +TLE_PACKAGE = "triton.experimental.tle" + + +def skip_package_dir(package): + return package == "triton" or package.startswith("triton.") + + +def get_package_dir(): + return { + "": MTHREADS_PYTHON_ROOT, + } + + +def _is_backend_package(package): + return package == "triton.backends" or package.startswith("triton.backends.") + + +def _is_language_extra_package(package): + return package == "triton.language.extra" or package.startswith("triton.language.extra.") + + +def _merge_mthreads_packages(existing_packages): + packages = [] + seen = set() + + def add(package): + if package not in seen: + packages.append(package) + seen.add(package) + + for package in find_packages(where=MTHREADS_PYTHON_ROOT, include=["triton", "triton.*"]): + add(package) + + for package in find_packages(where=FLAGTREE_PYTHON_ROOT, include=[TLE_PACKAGE, f"{TLE_PACKAGE}.*"]): + add(package) + + for package in existing_packages: + if (not package.startswith("triton.") or _is_backend_package(package) or _is_language_extra_package(package) + or package == "triton.profiler" or package.startswith("triton.profiler.")): + add(package) + + return packages + + +def _merge_mthreads_package_dir(existing_package_dir): + package_dir = dict(existing_package_dir or {}) + package_dir[""] = MTHREADS_PYTHON_ROOT + + for package in find_packages(where=MTHREADS_PYTHON_ROOT, include=["triton", "triton.*"]): + rel_package_path = package.replace(".", "/") + package_dir[package] = f"{MTHREADS_PYTHON_ROOT}/{rel_package_path}" + + for package in find_packages(where=FLAGTREE_PYTHON_ROOT, include=[TLE_PACKAGE, f"{TLE_PACKAGE}.*"]): + rel_package_path = package.replace(".", "/") + package_dir[package] = f"{FLAGTREE_PYTHON_ROOT}/{rel_package_path}" + + return package_dir + + +def _patch_mthreads_cmdclass(existing_cmdclass): + cmdclass = dict(existing_cmdclass or {}) + original_build_py = cmdclass.get("build_py") + if original_build_py is None: + return cmdclass + + class MthreadsBuildPy(original_build_py): + + def run(self): + self.force = True + build_triton_dir = Path(self.build_lib) / "triton" + if build_triton_dir.exists(): + shutil.rmtree(build_triton_dir) + return super().run() + + cmdclass["build_py"] = MthreadsBuildPy + return cmdclass + + +def _wrap_setup(original_setup): + if getattr(original_setup, "_mthreads_python_root_patched", False): + return original_setup + + def setup_with_mthreads_python_root(*args, **kwargs): + kwargs["packages"] = _merge_mthreads_packages(kwargs.get("packages", [])) + kwargs["package_dir"] = _merge_mthreads_package_dir(kwargs.get("package_dir", {})) + kwargs["cmdclass"] = _patch_mthreads_cmdclass(kwargs.get("cmdclass", {})) + return original_setup(*args, **kwargs) + + setup_with_mthreads_python_root._mthreads_python_root_patched = True + setup_with_mthreads_python_root._mthreads_original_setup = original_setup + return setup_with_mthreads_python_root + + +def _patch_setup_for_mthreads_python_root(): + patched = False + + frame = inspect.currentframe() + while frame is not None: + setup_func = frame.f_globals.get("setup") + if callable(setup_func): + frame.f_globals["setup"] = _wrap_setup(setup_func) + patched = True + frame = frame.f_back + + main_module = sys.modules.get("__main__") + if main_module is not None and hasattr(main_module, "setup"): + main_module.setup = _wrap_setup(main_module.setup) + patched = True + + if not patched: + raise RuntimeError("mthreads setup hook could not find setup() to patch") + + +_patch_setup_for_mthreads_python_root() diff --git a/setup.py b/setup.py index 4ea6801692..dee1fed33e 100644 --- a/setup.py +++ b/setup.py @@ -704,8 +704,6 @@ def get_packages(): if helper.flagtree_backend == "xpu": yield f"triton.language.extra.xpu" - elif helper.flagtree_backend == "mthreads": - yield f"triton/language/extra/musa" if check_env_flag("TRITON_BUILD_PROTON", "ON"): # Default ON yield "triton.profiler" diff --git a/third_party/mthreads/CMakeLists.txt b/third_party/mthreads/CMakeLists.txt new file mode 100644 index 0000000000..61a004cd8d --- /dev/null +++ b/third_party/mthreads/CMakeLists.txt @@ -0,0 +1,23 @@ +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/musa/include) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/musa/include) +add_subdirectory(include) +add_subdirectory(lib) +add_subdirectory(musa) +if(TRITON_BUILD_PYTHON_MODULE) + add_triton_plugin(TritonMthreads ${CMAKE_CURRENT_SOURCE_DIR}/triton_mthreads.cc + LINK_LIBS TritonMUSAGPUToLLVM MTGPUToLLVM TritonMUSAGPUTransforms) + add_dependencies(TritonMthreads + MUSATableGen + MUSAAttrDefsIncGen + MTGPUTableGen + MTGPUTypesIncGen + MTGPUConversionPassIncGen + TritonMUSAGPUConversionPassIncGen + TritonMUSAGPUTransformsIncGen) + target_link_libraries(TritonMthreads PRIVATE Python3::Module pybind11::headers) +endif() +add_subdirectory(bin) diff --git a/third_party/mthreads/backend/__init__.py b/third_party/mthreads/backend/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/mthreads/backend/compiler.py b/third_party/mthreads/backend/compiler.py new file mode 100644 index 0000000000..5492e596fb --- /dev/null +++ b/third_party/mthreads/backend/compiler.py @@ -0,0 +1,933 @@ +from triton.backends.compiler import BaseBackend, GPUTarget, Language +from triton._C.libtriton import ir, passes, mthreads +from triton import knobs + +from dataclasses import dataclass +from pathlib import Path +import functools +from typing import Any, Dict, Tuple, Optional +import hashlib +import os +import re +import shutil +import shlex +import subprocess +import tempfile + + +def min_dot_size(target: GPUTarget): + + def check_dot_compatibility(lhs_type, rhs_type) -> Tuple[int, int, int]: + lhs_bitwidth = lhs_type.scalar.primitive_bitwidth + rhs_bitwidth = rhs_type.scalar.primitive_bitwidth + assert lhs_bitwidth == rhs_bitwidth, "lhs and rhs bitwidth must be the same" + return (1, 1, 1) + + return check_dot_compatibility + + +def _module_text(mod) -> str: + try: + return str(mod) + except Exception: + return "" + + +def _module_uses_sqmma(mod) -> bool: + text = _module_text(mod) + return "mtgpu.sqmma" in text + + +@functools.lru_cache() +def get_musa_version() -> str: + if env_ver := os.getenv("TRITON_MUSA_VERSION"): + return env_ver + try: + import torch_musa # type: ignore + return getattr(torch_musa, "__version__", "unknown") + except Exception: + return "unknown" + + +@functools.lru_cache(None) +def file_hash(path: str) -> str: + with open(path, "rb") as f: + return hashlib.sha256(f.read()).hexdigest() + + +@functools.lru_cache(None) +def _tool_version_signature(path: str) -> str: + norm = _normalize_path(path) + if not norm: + return "" + tool_path = str(Path(norm).expanduser()) + version_text = "" + try: + out = subprocess.check_output([tool_path, "--version"], stderr=subprocess.STDOUT, text=True) + version_text = out.strip() + except Exception: + version_text = "" + binary_hash = "" + try: + if Path(tool_path).exists(): + binary_hash = file_hash(tool_path) + except Exception: + binary_hash = "" + return f"{tool_path}|{version_text}|{binary_hash}" + + +def _normalize_arch(arch: object) -> str: + if isinstance(arch, int): + return str(arch) + return str(arch).lower() + + +def _capability_from_arch(arch: object) -> int: + if isinstance(arch, int): + return arch + arch_str = _normalize_arch(arch) + if arch_str.isdigit(): + return int(arch_str) + if arch_str.startswith("ph1"): + return 31 + raise ValueError(f"Unsupported MUSA arch: {arch}") + + +def _normalize_path(path: Optional[str]) -> Optional[str]: + if not path: + return None + return str(Path(path).expanduser()) + + +def _maybe_tool_path(tool) -> Optional[str]: + try: + return _normalize_path(tool.path) + except Exception: + return None + + +def _select_tool_path(explicit_path: Optional[str], tool) -> Optional[str]: + path = _normalize_path(explicit_path) + if path: + return path + return _maybe_tool_path(tool) + + +def _resolve_toolchain_paths(options: "MUSAOptions") -> Tuple[str, str, Optional[str]]: + toolchain_path = _normalize_path(options.toolchain_path) + llc_path = _normalize_path(options.llc_path) + lld_path = _normalize_path(options.lld_path) + llc_asm_path = _normalize_path(options.llc_asm_path) + + if not toolchain_path: + mtcc_bin_path = os.getenv("MTCC_BIN_PATH") + if mtcc_bin_path: + toolchain_path = str(Path(mtcc_bin_path).expanduser()) + if not toolchain_path: + musa_home = os.getenv("MUSA_HOME") + if musa_home: + toolchain_path = str(Path(musa_home).expanduser() / "bin") + + if not llc_path and toolchain_path: + llc_path = str(Path(toolchain_path) / "llc") + if not lld_path and toolchain_path: + lld_path = str(Path(toolchain_path) / "ld.lld") + + return llc_path or "", lld_path or "", llc_asm_path + + +@functools.lru_cache(None) +def _detect_llvm_major_version(llc_path: str) -> Optional[int]: + llc = str(Path(llc_path).expanduser()) if llc_path else "" + if not llc: + return None + try: + out = subprocess.check_output([llc, "--version"], stderr=subprocess.STDOUT, text=True) + except Exception: + return None + match = re.search(r"LLVM version\s+(\d+)\.", out) + if not match: + return None + try: + return int(match.group(1)) + except Exception: + return None + + +def _tool_output(stdout: Optional[str], stderr: Optional[str]) -> str: + chunks = [] + if stdout and stdout.strip(): + chunks.append(stdout.strip()) + if stderr and stderr.strip(): + chunks.append(stderr.strip()) + return "\n".join(chunks) + + +def _run_tool_command(tool_name: str, cmd: list[str], *, repro_dir: Path, dump_log: bool = False) -> None: + proc = subprocess.run(cmd, check=False, text=True, capture_output=True) + output = _tool_output(proc.stdout, proc.stderr) + if dump_log and output: + print(f"// -----// MUSA {tool_name} Log //----- //") + print(output) + if proc.returncode == 0: + return + error = (f"`{tool_name}` failed with error code {proc.returncode}\n" + f"`{tool_name}` output:\n{output or ''}\n" + f"Repro command: {shlex.join(cmd)}\n" + f"Artifacts kept in: {repro_dir}") + raise RuntimeError(error) + + +def _should_apply_llvm_compat(llc_major: Optional[int]) -> bool: + return llc_major is None or llc_major < 19 + + +def _llc_opaque_pointer_options(llc_major: Optional[int]) -> list[str]: + return ["--opaque-pointers"] if llc_major is not None and llc_major < 15 else [] + + +def _strip_range_attributes(ir_text: str) -> str: + out = ir_text + pos = 0 + call_ret_re = re.compile(r"[^,\n@][^,\n@]*\s+@[A-Za-z_$.][A-Za-z0-9_$.]*\s*\(") + while True: + start = out.find("range(", pos) + if start < 0: + break + cur = start + len("range(") + depth = 1 + while cur < len(out) and depth > 0: + ch = out[cur] + if ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + cur += 1 + if depth != 0: + pos = start + 1 + continue + end = cur + while end < len(out) and out[end].isspace(): + end += 1 + tail = out[end:] + if end < len(out) and (out[end] == "%" or call_ret_re.match(tail)): + out = out[:start] + out[end:] + pos = start + else: + pos = end + return out + + +def _rewrite_bare_splat_operands(ir_text: str) -> str: + vec_re = re.compile(r"<\s*(\d+)\s+x\s*([A-Za-z0-9_.]+)\s*>") + bare_splat_re = re.compile(r"splat\s*\(\s*([A-Za-z0-9_.]+)\s+([^)]+)\s*\)") + + out_lines = [] + for line in ir_text.splitlines(): + search_pos = 0 + while search_pos < len(line): + match = bare_splat_re.search(line, search_pos) + if match is None: + break + elem_ty = match.group(1) + elem_val = match.group(2).strip() + prefix = line[:match.start()] + + lane_count = -1 + for vec_match in vec_re.finditer(prefix): + if vec_match.group(2) == elem_ty: + lane_count = int(vec_match.group(1)) + + if lane_count <= 0: + search_pos = match.end() + continue + + lane_str = str(lane_count) + vec_ty = f"<{lane_str} x {elem_ty}>" + mask_ty = f"<{lane_str} x i32>" + insert_expr = (f"insertelement ({vec_ty} undef, {elem_ty} {elem_val}, i32 0)") + replacement = (f"shufflevector ({vec_ty} {insert_expr}, {vec_ty} undef, " + f"{mask_ty} zeroinitializer)") + + line = line[:match.start()] + replacement + line[match.end():] + search_pos = match.start() + len(replacement) + + out_lines.append(line) + return "\n".join(out_lines) + ("\n" if ir_text.endswith("\n") else "") + + +def _rewrite_musa_isspacep_shared(ir_text: str) -> str: + call_re = re.compile( + r"^([ \t]*)(%[A-Za-z0-9_.]+|%\d+)\s*=\s*(?:tail\s+)?call\s+i1\s+" + r"@llvm\.musa\.isspacep\.shared\s*\(\s*ptr(?:\s+[^()%]+)*\s+(%[A-Za-z0-9_.]+|%\d+)\s*\)\s*(,.*)?$") + + def _tmp_name(base_pred: str, kind: str) -> str: + if re.fullmatch(r"%\d+", base_pred): + return f"%musa_isspacep_{kind}_{base_pred[1:]}" + return f"{base_pred}.isspacep.{kind}" + + out_lines = [] + for line in ir_text.splitlines(): + m = call_re.match(line) + if m is None: + out_lines.append(line) + continue + + indent, pred_name, ptr_name, dbg_suffix = m.groups() + dbg_suffix = dbg_suffix or "" + ptr_i64 = _tmp_name(pred_name, "i64") + ptr_hi32 = _tmp_name(pred_name, "hi32") + out_lines.append(f"{indent}{ptr_i64} = ptrtoint ptr {ptr_name} to i64{dbg_suffix}") + out_lines.append(f"{indent}{ptr_hi32} = lshr i64 {ptr_i64}, 32{dbg_suffix}") + out_lines.append(f"{indent}{pred_name} = icmp eq i64 {ptr_hi32}, 0{dbg_suffix}") + + out = "\n".join(out_lines) + if ir_text.endswith("\n"): + out += "\n" + + out = re.sub( + r"(?m)^[ \t]*declare\s+i1\s+@llvm\.musa\.isspacep\.shared\s*\(\s*ptr\s*\)\s*(?:#\d+)?\s*\n?", + "", + out, + ) + return out + + +def _rewrite_musa_ptr_gen_to_addrspace(ir_text: str) -> str: + specs = [("global", 1), ("shared", 3)] + ptr_as_map: Dict[str, int] = {} + out_lines = [] + + for line in ir_text.splitlines(): + rewritten = False + for space_name, as_id in specs: + call_re = re.compile( + rf"^([ \t]*)(%[A-Za-z0-9_.]+|%\d+)\s*=\s*(?:tail\s+)?call\s+ptr\s+" + rf"@llvm\.musa\.ptr\.gen\.to\.{space_name}\s*\(\s*ptr(?:\s+[^()%]+)*\s+(%[A-Za-z0-9_.]+|%\d+)\s*\)\s*(,.*)?$" + ) + m = call_re.match(line) + if m is None: + continue + indent, out_ptr, in_ptr, dbg_suffix = m.groups() + dbg_suffix = dbg_suffix or "" + out_lines.append(f"{indent}{out_ptr} = addrspacecast ptr {in_ptr} to ptr addrspace({as_id}){dbg_suffix}") + ptr_as_map[out_ptr] = as_id + rewritten = True + break + if not rewritten: + out_lines.append(line) + + out = "\n".join(out_lines) + if ir_text.endswith("\n"): + out += "\n" + for space_name, _ in specs: + out = re.sub( + rf"(?m)^[ \t]*declare\s+ptr\s+@llvm\.musa\.ptr\.gen\.to\.{space_name}\s*\(\s*ptr\s*\)\s*(?:#\d+)?\s*\n?", + "", + out, + ) + + for ptr_name, as_id in ptr_as_map.items(): + out = re.sub( + rf"\bcmpxchg\s+ptr\s+{re.escape(ptr_name)}\b", + f"cmpxchg ptr addrspace({as_id}) {ptr_name}", + out, + ) + return out + + +def _rewrite_llvm_is_fpclass_f32(ir_text: str) -> str: + call_re = re.compile(r"^([ \t]*)(%[A-Za-z0-9_.]+|%\d+)\s*=\s*(?:tail\s+)?call\s+i1\s+" + r"@llvm\.is\.fpclass\.f32\s*\(\s*float\s+([^,]+)\s*,\s*i32\s+64\s*\)\s*(,.*)?$") + out_lines = [] + changed = False + for line in ir_text.splitlines(): + m = call_re.match(line) + if m is None: + out_lines.append(line) + continue + indent, pred_name, val, dbg_suffix = m.groups() + dbg_suffix = dbg_suffix or "" + out_lines.append(f"{indent}{pred_name} = fcmp oeq float {val.strip()}, 0.000000e+00{dbg_suffix}") + changed = True + + out = "\n".join(out_lines) + if ir_text.endswith("\n"): + out += "\n" + if not changed: + return out + + out = re.sub( + r"(?m)^[ \t]*declare\s+i1\s+@llvm\.is\.fpclass\.f32\s*" + r"\(\s*float\s*,\s*i32\s+immarg\s*\)\s*(?:#\d+)?\s*\n?", + "", + out, + ) + return out + + +def _rewrite_lifetime_intrinsics_for_llvm14(ir_text: str) -> str: + out = ir_text + + out = re.sub( + r"(?m)^([ \t]*(?:tail\s+|musttail\s+|notail\s+)?call\s+void\s+@llvm\.lifetime\.(start|end)\.p0)" + r"\(\s*ptr(\s+[^()%,]+(?:\s+[^()%,]+)*)?\s+([^,)]+)\s*\)", + r"\1(i64 -1, ptr\3 \4)", + out, + ) + + out = re.sub( + r"(?m)^([ \t]*declare\s+void\s+@llvm\.lifetime\.(start|end)\.p0)" + r"\(\s*ptr(\s+[^()%,]+(?:\s+[^()%,]+)*)?\s*\)", + r"\1(i64 immarg, ptr\3)", + out, + ) + + return out + + +_SCMP_UCMP_CALL_RE = re.compile(r"^(\s*)(%\w+)\s*=\s*(?:tail\s+|musttail\s+|notail\s+)?call\s+(?Pi\d+)\s+" + r"@llvm\.(?Pscmp|ucmp)\.(?Pi\d+)\.(?P=opty)\s*" + r"\(\s*(?P=opty)\s+(?P[^,]+)\s*,\s*(?P=opty)\s+(?P[^)]+)\)\s*(?P.*)$") + + +def _rewrite_llvm_scmp_ucmp_to_icmp(ir_text: str) -> str: + pred = {"scmp": ("slt", "sgt"), "ucmp": ("ult", "ugt")} + out_lines: list[str] = [] + counter = 0 + for line in ir_text.splitlines(): + m = _SCMP_UCMP_CALL_RE.match(line) + if not m: + out_lines.append(line) + continue + counter += 1 + indent = m.group(1) + result = m.group(2) + ret_ty = m.group("ret") + kind = m.group("kind") + opty = m.group("opty") + a = m.group("a").strip() + b = m.group("b").strip() + tail = m.group("tail").rstrip() + p_lo, p_hi = pred[kind] + lt = f"%.musa_scmp_lt_{counter}" + gt = f"%.musa_scmp_gt_{counter}" + mid = f"%.musa_scmp_mid_{counter}" + out_lines.append(f"{indent}{lt} = icmp {p_lo} {opty} {a}, {b}") + out_lines.append(f"{indent}{gt} = icmp {p_hi} {opty} {a}, {b}") + out_lines.append(f"{indent}{mid} = select i1 {gt}, {ret_ty} 1, {ret_ty} 0") + last = f"{indent}{result} = select i1 {lt}, {ret_ty} -1, {ret_ty} {mid}" + if tail: + last = f"{last} {tail}" + out_lines.append(last) + + out = "\n".join(out_lines) + if ir_text.endswith("\n"): + out += "\n" + out = re.sub( + r"(?m)^[ \t]*declare\s+i\d+\s+@llvm\.(?:scmp|ucmp)\.i\d+\.i\d+\s*" + r"\(\s*i\d+\s*,\s*i\d+\s*\)[^\n]*\n", + "", + out, + ) + return out + + +def _llvm_compat(ir_text: str) -> str: + replacements = [ + ("memory\\(none\\)", "readnone"), + ("memory\\(read\\)", "readonly"), + ("memory\\(write\\)", "writeonly"), + ("memory\\(argmem: readwrite\\)", "argmemonly"), + ("memory\\(argmem: read\\)", "argmemonly readonly"), + ("memory\\(argmem: write\\)", "argmemonly writeonly"), + ("memory\\(inaccessiblemem: readwrite\\)", "inaccessiblememonly"), + ("memory\\(inaccessiblemem: read\\)", "inaccessiblememonly readonly"), + ("memory\\(inaccessiblemem: write\\)", "inaccessiblememonly writeonly"), + ("memory\\(argmem: readwrite, inaccessiblemem: readwrite\\)", "inaccessiblemem_or_argmemonly"), + ("memory\\(argmem: read, inaccessiblemem: read\\)", "inaccessiblemem_or_argmemonly readonly"), + ("memory\\(argmem: write, inaccessiblemem: write\\)", "inaccessiblemem_or_argmemonly writeonly"), + ] + out = ir_text + for new, old in replacements: + out = re.sub(new, old, out) + + out = re.sub(r"\bicmp\s+samesign\b", "icmp", out) + + splat_re = re.compile(r"<(\d+)\s+x\s+([^>]+)>\s+splat\s*\(\s*\2\s+([^)]+)\)") + + def _expand_splat(match: re.Match) -> str: + count = int(match.group(1)) + ty = match.group(2) + val = match.group(3) + elems = ", ".join([f"{ty} {val}"] * count) + return f"<{count} x {ty}> <{elems}>" + + out = splat_re.sub(_expand_splat, out) + out = _rewrite_bare_splat_operands(out) + out = _strip_range_attributes(out) + out = re.sub(r"\s+captures\(\s*none\s*\)", " nocapture", out) + out = re.sub(r"\s+captures\([^)]*\)", "", out) + out = re.sub(r"\bor\s+disjoint\s+", "or ", out) + out = re.sub(r"\bzext\s+nneg\s+", "zext ", out) + out = re.sub(r"\bsext\s+nneg\s+", "sext ", out) + out = re.sub(r"\buitofp\s+nneg\s+", "uitofp ", out) + out = re.sub(r"\bsitofp\s+nneg\s+", "sitofp ", out) + out = re.sub(r"\btrunc\s+nuw\s+nsw\s+", "trunc ", out) + out = re.sub(r"\btrunc\s+nsw\s+nuw\s+", "trunc ", out) + out = re.sub(r"\btrunc\s+nuw\s+", "trunc ", out) + out = re.sub(r"\btrunc\s+nsw\s+", "trunc ", out) + out = re.sub(r"\bgetelementptr\s+inbounds\s+nusw\s+", "getelementptr inbounds ", out) + out = re.sub(r"\bgetelementptr\s+inbounds\s+nuw\s+", "getelementptr inbounds ", out) + out = re.sub(r"\bgetelementptr\s+inbounds\s+nsw\s+", "getelementptr inbounds ", out) + out = re.sub(r"\bgetelementptr\s+nusw\s+", "getelementptr ", out) + out = re.sub(r"\bgetelementptr\s+nuw\s+", "getelementptr ", out) + out = re.sub(r"\bgetelementptr\s+nsw\s+", "getelementptr ", out) + out = _rewrite_musa_isspacep_shared(out) + out = _rewrite_musa_ptr_gen_to_addrspace(out) + out = _rewrite_llvm_is_fpclass_f32(out) + out = _rewrite_lifetime_intrinsics_for_llvm14(out) + for attr in ("nocallback", "nocreateundeforpoison", "mustprogress", "speculatable", "willreturn"): + out = re.sub(rf"(? str: + for line in ir_text.splitlines(): + if "nvvm.annotations" in line and "\"kernel\"" in line and "@" in line: + m = re.search(r"@([A-Za-z_][A-Za-z0-9_\\.]+)", line) + if m: + return m.group(1) + + matches = re.findall(r"^define\s+[^@]*@([A-Za-z_][A-Za-z0-9_\.]*)", ir_text, flags=re.MULTILINE) + if matches: + return matches[0] + raise RuntimeError("Unable to determine kernel name from LLVM IR") + + +def _llc_extra_options(metadata: Dict[str, object], options: "MUSAOptions") -> list[str]: + uses_mulhi = bool(metadata.get("uses_mulhi_helper")) + const_calc_opt = [] if uses_mulhi else ["-mtgpu-enable-const-calc=1"] + + uses_sqmma = bool(metadata.get("uses_sqmma")) + enable_backend_opt = bool(options.enable_llc_opt or options.enable_backend_opt) + llc_options_map = { + (False, False): [*const_calc_opt], + (True, False): { + *const_calc_opt, + "-mtgpu-alloc-shared-memory-from-zero=1", + }, + (False, True): [ + "-mtgpu-enable-const-calc=1", + "-mtgpu-tiny-offset-hint=1", + "-mtgpu-combine-instr-with-burst=1", + "-mtgpu-combine-fop-instr=1", + ], + (True, True): [ + "-mtgpu-opt-level=1", + "-mtgpu-combine-instr-with-burst=1", + "-mtgpu-combine-fop-instr=1", + "-misched=mtgpu-max-ilp", + ], + } + opts = llc_options_map[(uses_sqmma, enable_backend_opt)] + if options.llc_options: + opts.extend(shlex.split(options.llc_options)) + return opts + + +@dataclass(frozen=True) +class MUSAOptions: + num_warps: int = 4 + num_ctas: int = 1 + num_stages: int = 3 + warp_size: int = 32 + maxnreg: Optional[int] = None + enable_fp_fusion: bool = True + launch_cooperative_grid: bool = False + supported_fp8_dtypes: Tuple[str, ...] = ("fp8e5", ) + supported_fp8_storage_dtypes: Tuple[str, ...] = ("fp8e5", ) + custom_fp8_dtypes: Tuple[str, ...] = () + deprecated_fp8_dot_operand_dtypes: Tuple[str, ...] = () + default_dot_input_precision: str = "ieee" + allowed_dot_input_precisions: Tuple[str, ...] = ("ieee", "tf32", "tf32x3", "bf16x3", "bf16x6") + max_num_imprecise_acc_default: int = 0 + sanitize_overflow: bool = True + toolchain_path: Optional[str] = None + llc_path: Optional[str] = None + lld_path: Optional[str] = None + llc_asm_path: Optional[str] = None + llc_options: Optional[str] = None + enable_llc_opt: bool = False + enable_backend_opt: bool = False + enable_fp8_burst2: bool = False + enable_llvm_compat: bool = True + extern_libs: Optional[tuple] = None + debug: bool = False + backend_name: str = "musa" + supports_noinline: bool = True + arch: Optional[str] = None + instrumentation_mode: str = "" + + def __post_init__(self): + default_libdir = Path(__file__).parent / "lib" + extern_libs = {} if self.extern_libs is None else dict(self.extern_libs) + if not extern_libs.get("libdevice", None): + extern_libs["libdevice"] = knobs.musa.libdevice_path or str(default_libdir / "libdevice.31.bc") + object.__setattr__(self, "extern_libs", tuple(extern_libs.items())) + assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \ + "num_warps must be a power of 2" + + def hash(self): + hash_dict = dict(self.__dict__) + llc_path, lld_path, llc_asm_path = _resolve_toolchain_paths(self) + hash_dict["effective_llc_path"] = llc_path + hash_dict["effective_lld_path"] = lld_path + hash_dict["effective_llc_asm_path"] = llc_asm_path or "" + hash_dict["effective_llc_major"] = _detect_llvm_major_version(llc_path) + hash_dict["llc_tool_signature"] = _tool_version_signature(llc_path) + hash_dict["lld_tool_signature"] = _tool_version_signature(lld_path) + hash_dict["llc_asm_tool_signature"] = _tool_version_signature(llc_asm_path or "") + if hash_dict["extern_libs"]: + hash_dict["extern_libs"] = tuple((k, file_hash(v)) for k, v in sorted(hash_dict["extern_libs"])) + key = "_".join([f"{name}-{val}" for name, val in sorted(hash_dict.items())]) + return hashlib.sha256(key.encode("utf-8")).hexdigest() + + +class MUSABackend(BaseBackend): + + @staticmethod + def supports_target(target: GPUTarget): + return target.backend == "musa" + + def __init__(self, target: GPUTarget) -> None: + super().__init__(target) + self.binary_ext = "mubin" + + def parse_options(self, opts) -> Any: + opts = dict(opts) + arch = knobs.runtime.override_arch or opts.get("arch", None) or self.target.arch + args = {"arch": _normalize_arch(arch)} + capability = _capability_from_arch(args["arch"]) + if opts.get("num_ctas", 1) > 1 and capability == 31: + raise ValueError("num_ctas > 1 requires MUSA cluster launch support. " + f"Current target is {args['arch']} (capability {capability}).") + if "enable_fp_fusion" not in opts: + args["enable_fp_fusion"] = knobs.language.default_fp_fusion + if "supported_fp8_dtypes" not in opts: + supported_fp8_dtypes = {"fp8e5"} + if capability >= 31: + supported_fp8_dtypes.add("fp8e4nv") + args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes)) + if "supported_fp8_storage_dtypes" not in opts: + supported_fp8_storage_dtypes = set(args.get("supported_fp8_dtypes", ())) + if capability >= 31: + supported_fp8_storage_dtypes.update({"fp8e4b15", "fp8e4b8", "fp8e5b16"}) + args["supported_fp8_storage_dtypes"] = tuple(sorted(supported_fp8_storage_dtypes)) + if "custom_fp8_dtypes" not in opts: + custom_fp8_dtypes = set() + if capability >= 31: + custom_fp8_dtypes.update({"fp8e4b15", "fp8e4b8", "fp8e5b16"}) + args["custom_fp8_dtypes"] = tuple(sorted(custom_fp8_dtypes)) + if "deprecated_fp8_dot_operand_dtypes" not in opts: + args["deprecated_fp8_dot_operand_dtypes"] = () + if "toolchain_path" not in opts: + toolchain_path = knobs.musa.toolchain_path + if not toolchain_path: + mtcc_bin_path = os.getenv("MTCC_BIN_PATH") + if mtcc_bin_path: + toolchain_path = mtcc_bin_path + else: + musa_home = os.getenv("MUSA_HOME") + toolchain_path = str(Path(musa_home) / "bin") if musa_home else None + args["toolchain_path"] = _normalize_path(toolchain_path) + if "llc_path" not in opts: + args["llc_path"] = _select_tool_path(knobs.musa.llc_path, knobs.musa.llc) + if "lld_path" not in opts: + args["lld_path"] = _select_tool_path(knobs.musa.lld_path, knobs.musa.lld) + if "llc_asm_path" not in opts: + args["llc_asm_path"] = _normalize_path(knobs.musa.llc_asm_path) + if "llc_options" not in opts: + args["llc_options"] = knobs.musa.llc_options + if "enable_llc_opt" not in opts: + args["enable_llc_opt"] = knobs.musa.enable_llc_opt + if "enable_fp8_burst2" not in opts: + args["enable_fp8_burst2"] = knobs.musa.enable_fp8_burst2 + if "enable_llvm_compat" not in opts: + args["enable_llvm_compat"] = knobs.musa.enable_llvm_compat + args.update({k: opts[k] for k in MUSAOptions.__dataclass_fields__.keys() if k in opts and opts[k] is not None}) + if "warp_size" not in args: + target_warp_size = getattr(self.target, "warp_size", None) + args["warp_size"] = int(target_warp_size) if target_warp_size else 32 + return MUSAOptions(**args) + + def pack_metadata(self, metadata): + return ( + metadata.num_warps, + metadata.num_ctas, + metadata.shared, + ) + + def get_codegen_implementation(self, options): + from triton.language.extra.musa import utils as musa_utils + + return { + "convert_custom_types": musa_utils.convert_custom_float8, + "min_dot_size": min_dot_size(self.target), + } + + def get_module_map(self) -> Dict[str, object]: + try: + from triton.language.extra.musa import libdevice as musa_libdevice # type: ignore + libdevice = musa_libdevice + except Exception: + from triton.language.extra import libdevice + return {"triton.language.extra.libdevice": libdevice} + + def load_dialects(self, ctx): + mthreads.load_dialects(ctx) + + @staticmethod + def make_ttir(mod, metadata, opt): + pm = ir.pass_manager(mod.context) + pm.enable_debug() + passes.common.add_inliner(pm) + passes.ttir.add_rewrite_tensor_pointer(pm) + passes.common.add_canonicalizer(pm) + passes.ttir.add_combine(pm) + passes.ttir.add_reorder_broadcast(pm) + passes.common.add_cse(pm) + passes.common.add_symbol_dce(pm) + passes.ttir.add_loop_unroll(pm) + pm.run(mod, "make_ttir") + return mod + + @staticmethod + def make_ttgir(mod, metadata, opt, arch, capability): + if opt.maxnreg is not None: + mod.set_attr("ttg.maxnreg", ir.builder(mod.context).get_int32_attr(opt.maxnreg)) + + pm = ir.pass_manager(mod.context) + dump_enabled = pm.enable_debug() + emu_tf32 = capability >= 31 + + passes.ttir.add_convert_to_ttgpuir(pm, f"musa:{arch}", opt.num_warps, opt.warp_size, opt.num_ctas) + passes.ttgpuir.add_coalesce(pm) + passes.ttgpuir.add_f32_dot_tc(pm, emu_tf32) + passes.ttgpuir.add_remove_layout_conversions(pm) + passes.ttgpuir.add_optimize_thread_locality(pm) + + mthreads.passes.ttgpuir.add_accelerate_matmul(pm) + passes.ttgpuir.add_remove_layout_conversions(pm) + mthreads.passes.ttgpuir.add_optimize_dot_operands(pm) + mthreads.passes.ttgpuir.add_optimize_descriptor_encoding(pm) + passes.ttir.add_loop_aware_cse(pm) + + if capability >= 31: + passes.ttgpuir.add_fuse_nested_loops(pm) + passes.common.add_canonicalizer(pm) + passes.ttir.add_triton_licm(pm) + passes.common.add_canonicalizer(pm) + mthreads.passes.ttgpuir.add_optimize_accumulator_init(pm) + passes.ttgpuir.add_combine_tensor_select_and_if(pm) + mthreads.passes.ttgpuir.add_optimize_sqmma_accumulator_layout(pm) + passes.ttgpuir.add_assign_latencies(pm, opt.num_stages) + passes.ttgpuir.add_schedule_loops(pm) + mthreads.passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled) + else: + passes.ttir.add_triton_licm(pm) + + passes.common.add_canonicalizer(pm) + passes.ttir.add_loop_aware_cse(pm) + passes.ttgpuir.add_prefetch(pm) + mthreads.passes.ttgpuir.add_optimize_dot_operands(pm) + passes.ttgpuir.add_coalesce_async_copy(pm) + mthreads.passes.ttgpuir.add_tme_lowering(pm) + mthreads.passes.ttgpuir.add_optimize_sqmma_accumulator_layout(pm) + mthreads.passes.ttgpuir.add_canonicalize_sqmma_result_conversions(pm) + passes.ttgpuir.add_remove_layout_conversions(pm) + mthreads.passes.ttgpuir.add_issue_barrier_insertion(pm) + passes.ttgpuir.add_reduce_data_duplication(pm) + passes.ttgpuir.add_reorder_instructions(pm) + mthreads.passes.ttgpuir.add_convert_sqmma_to_mtgpu(pm) + passes.ttir.add_loop_aware_cse(pm) + passes.common.add_symbol_dce(pm) + passes.common.add_sccp(pm) + passes.common.add_cse(pm) + passes.common.add_canonicalizer(pm) + if capability == 31: + mthreads.passes.ttgpuir.add_mark_inplace_loads(pm) + mthreads.passes.ttgpuir.add_finalize_barriers(pm) + pm.run(mod, "make_ttgir") + metadata["uses_sqmma"] = _module_uses_sqmma(mod) + metadata["tensordesc_meta"] = mod.get_tensordesc_metadata() + return mod + + @staticmethod + def make_llir(src, metadata, options, arch): + from triton._C.libtriton import llvm + + mod = src + pm = ir.pass_manager(mod.context) + pm.enable_debug() + + passes.convert.add_scf_to_cf(pm) + passes.convert.add_index_to_llvmir(pm) + mthreads.passes.ttgpuir.add_allocate_shared_memory(pm, _capability_from_arch(arch)) + mthreads.passes.ttgpuir.add_mtgpu_to_llvm(pm, _capability_from_arch(arch)) + mthreads.passes.ttgpuir.add_to_llvmir(pm, _capability_from_arch(arch)) + passes.common.add_canonicalizer(pm) + passes.common.add_cse(pm) + passes.convert.add_cf_to_llvmir(pm) + passes.convert.add_arith_to_llvmir(pm) + passes.common.add_canonicalizer(pm) + passes.common.add_cse(pm) + passes.common.add_symbol_dce(pm) + + if not knobs.compilation.disable_line_info and not knobs.compilation.dump_ir_extract_di_local_variables: + passes.llvmir.add_di_scope(pm) + + pm.run(mod, "make_llir") + + llvm.init_targets() + context = llvm.context() + llvm_mod = llvm.to_module(mod, context) + mthreads.attach_datalayout(llvm_mod) + + if options.extern_libs: + paths = [path for (name, path) in options.extern_libs] + llvm.link_extern_libs(llvm_mod, paths) + + llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3) + maxntidx = max(1, int(options.num_warps) * int(options.warp_size)) + kernel_name_hint = src.get_entry_func_name() if hasattr(src, "get_entry_func_name") else "" + mthreads.decorate_kernel_abi(llvm_mod, kernel_name_hint, maxntidx) + metadata["uses_mulhi_helper"] = mthreads.module_uses_mulhi_helper(llvm_mod) + + metadata["shared"] = src.get_int_attr("ttg.shared") + + ret = str(llvm_mod) + del llvm_mod + del context + return ret + + @staticmethod + def make_mubin(src, metadata, opt, arch): + if not isinstance(src, str): + raise TypeError("Expected LLVM IR as a string for MUSA codegen") + + llc_path, lld_path, llc_asm_path = _resolve_toolchain_paths(opt) + if not llc_path or not lld_path: + raise RuntimeError("MUSA toolchain not configured. Set TRITON_MUSA_TOOLCHAIN_PATH " + "or TRITON_MUSA_LLC_PATH/TRITON_MUSA_LLD_PATH (or MUSA_HOME).") + + ir_text = src + llc_major = _detect_llvm_major_version(llc_path) + if opt.enable_llvm_compat: + if _should_apply_llvm_compat(llc_major): + ir_text = _llvm_compat(ir_text) + ir_text = _rewrite_llvm_scmp_ucmp_to_icmp(ir_text) + + if knobs.musa.dump_llir: + print("// -----// MUSA LLVMIR Dump //----- //") + print(ir_text) + + capability = _capability_from_arch(arch) + llc_opt_level = "-O2" + llc_opts = [ + "-march=mtgpu", + f"-mcpu=mp_{capability}", + *_llc_opaque_pointer_options(llc_major), + llc_opt_level, + "-filetype=obj", + ] + llc_opts.extend(_llc_extra_options(metadata, opt)) + + tmp_dir = tempfile.mkdtemp(prefix="triton-musa-") + tmp_path = Path(tmp_dir) + keep_artifacts = True + try: + tmp_path = Path(tmp_dir) + ll_file = tmp_path / "kernel.ll" + obj_file = tmp_path / "kernel.o" + mubin_file = tmp_path / "kernel.mubin" + + ll_file.write_text(ir_text) + + replace_llir = knobs.musa.replace_llir + if replace_llir and Path(replace_llir).exists(): + ll_file = Path(replace_llir) + kernel_name = _extract_kernel_name(ll_file.read_text()) + + if llc_asm_path: + llc_asm_major = _detect_llvm_major_version(llc_asm_path) + asm_file = tmp_path / "kernel.s" + asm_cmd = [ + llc_asm_path, + str(ll_file), + "-march=mtgpu", + f"-mcpu=mp_{capability}", + *_llc_opaque_pointer_options(llc_asm_major), + llc_opt_level, + "-filetype=asm", + "-o", + str(asm_file), + ] + asm_cmd.extend(_llc_extra_options(metadata, opt)) + _run_tool_command( + "llc-asm", + asm_cmd, + repro_dir=tmp_path, + dump_log=knobs.musa.dump_toolchain_log, + ) + if knobs.musa.dump_muasm: + print("// -----// MUASM Dump //----- //") + print(asm_file.read_text()) + + llc_cmd = [llc_path, str(ll_file), *llc_opts, "-o", str(obj_file)] + _run_tool_command( + "llc", + llc_cmd, + repro_dir=tmp_path, + dump_log=knobs.musa.dump_toolchain_log, + ) + + lld_cmd = [lld_path, "-flavor", "gnu", "-shared", str(obj_file), "-o", str(mubin_file)] + _run_tool_command( + "ld.lld", + lld_cmd, + repro_dir=tmp_path, + dump_log=knobs.musa.dump_toolchain_log, + ) + + replace_mubin = knobs.musa.replace_mubin + if replace_mubin and Path(replace_mubin).exists(): + mubin_file = Path(replace_mubin) + + metadata["name"] = kernel_name + result = mubin_file.read_bytes() + keep_artifacts = False + return result + finally: + if not keep_artifacts: + shutil.rmtree(tmp_dir, ignore_errors=True) + + def add_stages(self, stages, options, language): + arch = options.arch + capability = _capability_from_arch(arch) + if language == Language.TRITON: + stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options) + stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, arch, capability) + elif language == Language.GLUON: + raise RuntimeError("MUSA backend does not support GLUON yet") + stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, arch) + stages["mubin"] = lambda src, metadata: self.make_mubin(src, metadata, options, arch) + if knobs.runtime.add_stages_inspection_hook is not None: + knobs.runtime.add_stages_inspection_hook(self, stages, options, language, arch) + + @functools.lru_cache() + def hash(self): + version = get_musa_version() + return f"{version}-{self.target.arch}" diff --git a/third_party/mthreads/backend/driver.c b/third_party/mthreads/backend/driver.c new file mode 100644 index 0000000000..6b9bcc6c4d --- /dev/null +++ b/third_party/mthreads/backend/driver.c @@ -0,0 +1,349 @@ +#include "musa.h" +#include +#include +#include +#include +#define PY_SSIZE_T_CLEAN +#include + +// Raises a Python exception and returns false if code is not MUSA_SUCCESS. +static bool gpuAssert(MUresult code, const char *file, int line) { + if (code == MUSA_SUCCESS) + return true; + + const char *prefix = "Triton Error [MUSA]: "; + const char *str; + muGetErrorString(code, &str); + char err[1024] = {0}; + strcat(err, prefix); + strcat(err, str); + PyGILState_STATE gil_state; + gil_state = PyGILState_Ensure(); + PyErr_SetString(PyExc_RuntimeError, err); + PyGILState_Release(gil_state); + return false; +} + +// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block. +#define MUSA_CHECK_AND_RETURN_NULL(ans) \ + do { \ + if (!gpuAssert((ans), __FILE__, __LINE__)) \ + return NULL; \ + } while (0) + +// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block. +#define MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans) \ + do { \ + if (!gpuAssert((ans), __FILE__, __LINE__)) { \ + PyEval_RestoreThread(_save); \ + return NULL; \ + } \ + } while (0) + +static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { + int device_id; + if (!PyArg_ParseTuple(args, "i", &device_id)) + return NULL; + MUdevice device; + muDeviceGet(&device, device_id); + + int max_shared_mem; + int max_num_regs; + int multiprocessor_count; + int warp_size; + int sm_clock_rate; + int mem_clock_rate; + int mem_bus_width; + MUSA_CHECK_AND_RETURN_NULL(muDeviceGetAttribute( + &max_shared_mem, MU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + device)); + MUSA_CHECK_AND_RETURN_NULL(muDeviceGetAttribute( + &max_num_regs, MU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device)); + MUSA_CHECK_AND_RETURN_NULL(muDeviceGetAttribute( + &multiprocessor_count, MU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); + MUSA_CHECK_AND_RETURN_NULL( + muDeviceGetAttribute(&warp_size, MU_DEVICE_ATTRIBUTE_WARP_SIZE, device)); + MUSA_CHECK_AND_RETURN_NULL(muDeviceGetAttribute( + &sm_clock_rate, MU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); + MUSA_CHECK_AND_RETURN_NULL(muDeviceGetAttribute( + &mem_clock_rate, MU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); + MUSA_CHECK_AND_RETURN_NULL(muDeviceGetAttribute( + &mem_bus_width, MU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); + + return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", + max_shared_mem, "max_num_regs", max_num_regs, + "multiprocessor_count", multiprocessor_count, "warpSize", + warp_size, "sm_clock_rate", sm_clock_rate, + "mem_clock_rate", mem_clock_rate, "mem_bus_width", + mem_bus_width); +} + +static PyObject *loadBinary(PyObject *self, PyObject *args) { + const char *name; + const char *data; + Py_ssize_t data_size; + int shared; + int device; + if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, + &device)) { + return NULL; + } + if (data_size == 0) { + PyErr_SetString(PyExc_RuntimeError, + "Empty MUSA binary: codegen is not available yet."); + return NULL; + } + MUfunction fun; + MUmodule mod; + int32_t n_regs = 0; + int32_t n_spills = 0; + int32_t n_max_threads = 0; + MUcontext pctx = 0; + + Py_BEGIN_ALLOW_THREADS; + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muCtxGetCurrent(&pctx)); + if (!pctx) { + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muDevicePrimaryCtxRetain(&pctx, device)); + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muCtxSetCurrent(pctx)); + } + + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muModuleLoadData(&mod, data)); + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muModuleGetFunction(&fun, mod, name)); + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muFuncGetAttribute(&n_regs, MU_FUNC_ATTRIBUTE_NUM_REGS, fun)); + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muFuncGetAttribute(&n_spills, MU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun)); + n_spills /= 4; + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muFuncGetAttribute( + &n_max_threads, MU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun)); + + int shared_optin = 0; + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muDeviceGetAttribute( + &shared_optin, MU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + device)); + + int shared_static = 0; + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muFuncGetAttribute( + &shared_static, MU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun)); + int max_dynamic_shared = shared_optin - shared_static; + if (max_dynamic_shared < 0) + max_dynamic_shared = 0; + int requested_dynamic_shared = shared; + if (requested_dynamic_shared > max_dynamic_shared) + requested_dynamic_shared = max_dynamic_shared; + if (requested_dynamic_shared > 0) { + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muFuncSetAttribute(fun, MU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + requested_dynamic_shared)); + } + Py_END_ALLOW_THREADS; + + if (PyErr_Occurred()) { + return NULL; + } + return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs, + n_spills, n_max_threads); +} + +static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) { + long size; + if (!PyArg_ParseTuple(args, "l", &size)) { + return NULL; + } + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative"); + return NULL; + } + + Py_BEGIN_ALLOW_THREADS; + + MUcontext ctx = NULL; + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muCtxGetCurrent(&ctx)); + if (!ctx) { + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muDevicePrimaryCtxRetain(&ctx, /*device=*/0)); + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(muCtxSetCurrent(ctx)); + } + + size_t oldSize = 0; + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muCtxGetLimit(&oldSize, MU_LIMIT_PRINTF_FIFO_SIZE)); + if (oldSize != (size_t)size) { + MUSA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( + muCtxSetLimit(MU_LIMIT_PRINTF_FIFO_SIZE, size)); + } + + Py_END_ALLOW_THREADS; + Py_INCREF(Py_None); + return Py_None; +} + +static bool getTensorDescriptorDataType(int elementSize, + MUtensorDescriptorDataType *type) { + switch (elementSize) { + case 1: + *type = MU_TENSOR_DESCRIPTOR_DATA_TYPE_UINT8; + return true; + case 2: + *type = MU_TENSOR_DESCRIPTOR_DATA_TYPE_UINT16; + return true; + case 4: + *type = MU_TENSOR_DESCRIPTOR_DATA_TYPE_UINT32; + return true; + default: + PyErr_SetString(PyExc_ValueError, "element_size must be 1, 2, or 4 bytes"); + return false; + } +} + +static bool validateTMEDescriptorBlockBytes(unsigned rank, + const uint32_t *block_dims, + int element_size) { + uint64_t block_bytes = (uint64_t)element_size; + for (unsigned i = 0; i < rank; ++i) + block_bytes *= (uint64_t)block_dims[i]; + if (block_bytes >= 32) + return true; + + char err[64] = {0}; + snprintf(err, sizeof(err), "%uD block bytes must be >= 32", rank); + PyErr_SetString(PyExc_ValueError, err); + return false; +} + +static PyObject * +fillTMEDescriptorImpl(unsigned rank, unsigned long long global_address, + const uint64_t *dims, const uint32_t *block_dims, + int element_size, unsigned long long desc_address) { + MUtensorDescriptorDataType type; + if (!getTensorDescriptorDataType(element_size, &type)) + return NULL; + if (!validateTMEDescriptorBlockBytes(rank, block_dims, element_size)) + return NULL; + + uint64_t global_strides[5] = {0}; + global_strides[0] = dims[0] * (uint64_t)element_size; + for (unsigned i = 1; i < rank; ++i) + global_strides[i] = global_strides[i - 1] * dims[i]; + + MUtensorDescriptor desc; + MUSA_CHECK_AND_RETURN_NULL(muTensorDescriptorEncode( + &desc, type, /*tensorRank=*/rank, (void *)global_address, dims, + global_strides, MU_TENSOR_DESCRIPTOR_INTERLEAVE_NONE, /*swizzle=*/0)); + MUSA_CHECK_AND_RETURN_NULL( + muMemcpyHtoD((MUdeviceptr)desc_address, &desc, sizeof(desc))); + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject *fill1DTMEDescriptor(PyObject *self, PyObject *args) { + unsigned long long global_address = 0; + uint64_t dims[1]; + uint32_t block_dims[1]; + int element_size = 0; + unsigned long long desc_address = 0; + if (!PyArg_ParseTuple(args, "KKiiK", &global_address, &dims[0], + &block_dims[0], &element_size, &desc_address)) + return NULL; + + return fillTMEDescriptorImpl(/*rank=*/1, global_address, dims, block_dims, + element_size, desc_address); +} + +static PyObject *fill2DTMEDescriptor(PyObject *self, PyObject *args) { + unsigned long long global_address = 0; + uint64_t dims[2]; + uint32_t block_dims[2]; + int element_size = 0; + unsigned long long desc_address = 0; + if (!PyArg_ParseTuple(args, "KKKiiiK", &global_address, &dims[1], &dims[0], + &block_dims[1], &block_dims[0], &element_size, + &desc_address)) + return NULL; + + return fillTMEDescriptorImpl(/*rank=*/2, global_address, dims, block_dims, + element_size, desc_address); +} + +static PyObject *fill3DTMEDescriptor(PyObject *self, PyObject *args) { + unsigned long long global_address = 0; + uint64_t dims[3]; + uint32_t block_dims[3]; + int element_size = 0; + unsigned long long desc_address = 0; + if (!PyArg_ParseTuple(args, "KKKKiiiiK", &global_address, &dims[2], &dims[1], + &dims[0], &block_dims[2], &block_dims[1], + &block_dims[0], &element_size, &desc_address)) + return NULL; + + return fillTMEDescriptorImpl(/*rank=*/3, global_address, dims, block_dims, + element_size, desc_address); +} + +static PyObject *fill4DTMEDescriptor(PyObject *self, PyObject *args) { + unsigned long long global_address = 0; + uint64_t dims[4]; + uint32_t block_dims[4]; + int element_size = 0; + unsigned long long desc_address = 0; + if (!PyArg_ParseTuple(args, "KKKKKiiiiiK", &global_address, &dims[3], + &dims[2], &dims[1], &dims[0], &block_dims[3], + &block_dims[2], &block_dims[1], &block_dims[0], + &element_size, &desc_address)) + return NULL; + + return fillTMEDescriptorImpl(/*rank=*/4, global_address, dims, block_dims, + element_size, desc_address); +} + +static PyObject *fill5DTMEDescriptor(PyObject *self, PyObject *args) { + unsigned long long global_address = 0; + uint64_t dims[5]; + uint32_t block_dims[5]; + int element_size = 0; + unsigned long long desc_address = 0; + if (!PyArg_ParseTuple(args, "KKKKKKiiiiiiK", &global_address, &dims[4], + &dims[3], &dims[2], &dims[1], &dims[0], &block_dims[4], + &block_dims[3], &block_dims[2], &block_dims[1], + &block_dims[0], &element_size, &desc_address)) + return NULL; + + return fillTMEDescriptorImpl(/*rank=*/5, global_address, dims, block_dims, + element_size, desc_address); +} + +static PyMethodDef ModuleMethods[] = { + {"load_binary", loadBinary, METH_VARARGS, + "Load provided mubin into MUSA driver"}, + {"get_device_properties", getDeviceProperties, METH_VARARGS, + "Get the properties for a given device"}, + {"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS, + "Set printf FIFO size"}, + {"fill_1d_tma_descriptor", fill1DTMEDescriptor, METH_VARARGS, + "Fill a 1D TMA descriptor"}, + {"fill_2d_tma_descriptor", fill2DTMEDescriptor, METH_VARARGS, + "Fill a 2D TMA descriptor"}, + {"fill_3d_tma_descriptor", fill3DTMEDescriptor, METH_VARARGS, + "Fill a 3D TMA descriptor"}, + {"fill_4d_tma_descriptor", fill4DTMEDescriptor, METH_VARARGS, + "Fill a 4D TMA descriptor"}, + {"fill_5d_tma_descriptor", fill5DTMEDescriptor, METH_VARARGS, + "Fill a 5D TMA descriptor"}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "musa_utils", + NULL, // documentation + -1, // size + ModuleMethods}; + +PyMODINIT_FUNC PyInit_musa_utils(void) { + PyObject *m = PyModule_Create(&ModuleDef); + if (m == NULL) { + return NULL; + } + PyModule_AddFunctions(m, ModuleMethods); + return m; +} diff --git a/third_party/mthreads/backend/driver.py b/third_party/mthreads/backend/driver.py new file mode 100644 index 0000000000..f5e496a0ac --- /dev/null +++ b/third_party/mthreads/backend/driver.py @@ -0,0 +1,895 @@ +import functools +import os +import subprocess +import weakref +from collections import OrderedDict +from pathlib import Path + +from triton import knobs +from triton.backends.compiler import GPUTarget +from triton.backends.driver import DriverBase +from triton.runtime.build import compile_module_from_src + +dirname = os.path.dirname(os.path.realpath(__file__)) +_TENSORDESC_CACHE_LIMIT = 1024 + + +def _split_paths(value: str): + return [p for p in value.split(":") if p] + + +@functools.lru_cache() +def _musa_home_dirs(): + candidates = [] + for key in ("MUSA_HOME", "MUSA_ROOT"): + if val := os.getenv(key): + candidates.append(val) + return candidates + + +@functools.lru_cache() +def _musa_include_dirs(): + include_dirs = [os.path.join(dirname, "include")] + if env_inc := os.getenv("TRITON_MUSA_INCLUDE_PATH"): + include_dirs.append(env_inc) + for home in _musa_home_dirs(): + include_dirs.append(os.path.join(home, "include")) + + # Validate that musa.h exists in one of the include dirs. + for inc in include_dirs: + if os.path.exists(os.path.join(inc, "musa.h")): + return include_dirs + raise RuntimeError("Cannot find musa.h. Set TRITON_MUSA_INCLUDE_PATH or MUSA_HOME/MUSA_ROOT to a valid MUSA SDK.") + + +@functools.lru_cache() +def _libmusa_dirs(): + + def has_libmusa(path: str) -> bool: + return (os.path.exists(os.path.join(path, "libmusa.so")) or os.path.exists(os.path.join(path, "libmusa.so.1"))) + + paths = [] + + if env_lib := os.getenv("TRITON_LIBMUSA_PATH") or os.getenv("TRITON_MUSA_LIB_PATH"): + if os.path.isfile(env_lib): + paths.append(os.path.dirname(env_lib)) + else: + paths.append(env_lib) + + for home in _musa_home_dirs(): + paths.append(os.path.join(home, "lib")) + paths.append(os.path.join(home, "lib64")) + + env_ld = os.getenv("LD_LIBRARY_PATH") + if env_ld: + paths.extend(_split_paths(env_ld)) + + # Try ldconfig cache + try: + libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode(errors="ignore") + locs = [line.split()[-1] for line in libs.splitlines() if "libmusa.so" in line] + paths.extend([os.path.dirname(loc) for loc in locs]) + except Exception: + pass + + # Filter to existing directories that contain libmusa. + valid = [p for p in paths if has_libmusa(p)] + if not valid: + raise RuntimeError( + "libmusa.so/libmusa.so.1 not found. Set TRITON_LIBMUSA_PATH/TRITON_MUSA_LIB_PATH or MUSA_HOME/MUSA_ROOT, " + "or update LD_LIBRARY_PATH.") + return valid + + +def _library_dirs(): + return [os.path.join(dirname, "lib"), *_libmusa_dirs()] + + +# ------------------------ +# Utils +# ------------------------ + + +class MusaUtils(object): + + def __new__(cls): + if not hasattr(cls, "instance"): + cls.instance = super(MusaUtils, cls).__new__(cls) + return cls.instance + + def __init__(self): + src = Path(os.path.join(dirname, "driver.c")).read_text() + mod = compile_module_from_src( + src=src, + name="musa_utils", + include_dirs=_musa_include_dirs(), + library_dirs=_library_dirs(), + libraries=["musa"], + ) + self.load_binary = mod.load_binary + self.get_device_properties = mod.get_device_properties + self.set_printf_fifo_size = mod.set_printf_fifo_size + for name in ("fill_1d_tma_descriptor", "fill_2d_tma_descriptor", "fill_3d_tma_descriptor", + "fill_4d_tma_descriptor", "fill_5d_tma_descriptor"): + if hasattr(mod, name): + setattr(self, name, getattr(mod, name)) + + +# ------------------------ +# Launcher +# ------------------------ + + +def ty_to_cpp(ty): + # Align ABI mapping with NVIDIA/AMD for host-side signatures. + if ty[0] == '*': + return "MUdeviceptr" + if ty.startswith("tensordesc<"): + return "MUdeviceptr" + return { + "i1": "int32_t", + "i8": "int8_t", + "i16": "int16_t", + "i32": "int32_t", + "i64": "int64_t", + "u1": "uint32_t", + "u8": "uint8_t", + "u16": "uint16_t", + "u32": "uint32_t", + "u64": "uint64_t", + "fp16": "double", + "bf16": "double", + "fp32": "double", + "f32": "double", + "fp64": "double", + "constexpr": "int64_t", + }[ty] + + +def ty_to_cpp_param(ty): + if ty[0] == '*': + return "MUdeviceptr" + if ty.startswith("tensordesc<"): + return "MUdeviceptr" + return { + "i1": "int32_t", + "i8": "int8_t", + "i16": "int16_t", + "i32": "int32_t", + "i64": "int64_t", + "u1": "uint32_t", + "u8": "uint8_t", + "u16": "uint16_t", + "u32": "uint32_t", + "u64": "uint64_t", + "fp16": "uint16_t", + "bf16": "uint16_t", + "fp32": "float", + "f32": "float", + "fp64": "double", + "constexpr": "int64_t", + }[ty] + + +def _parse_tensordesc_type(ty: str): + if not isinstance(ty, str) or not ty.startswith("tensordesc<") or not ty.endswith(">"): + return None + body = ty[len("tensordesc<"):-1] + dtype, sep, shape = body.partition("[") + if not sep or not shape.endswith("]"): + return None + dims = [dim.strip() for dim in shape[:-1].split(",") if dim.strip()] + if not dtype or not dims: + return None + return dtype.strip(), len(dims) + + +def _get_tensordesc_abi_expanded_args(rank: int, metadata): + full_abi_args = 1 + 2 * rank + if metadata is None: + return full_abi_args + abi_args = int(metadata.get("abi_expanded_args", full_abi_args)) + if abi_args not in (1, full_abi_args): + raise ValueError( + f"unsupported MUSA tensor descriptor ABI expansion: expected 1 or {full_abi_args}, got {abi_args}") + return abi_args + + +def _expand_tensordesc_signature(signature_types, tensordesc_meta=None): + expanded_types = [] + expanded_index = {} + tensordesc_idx = 0 + for i, ty in enumerate(signature_types): + desc_info = _parse_tensordesc_type(ty) + if desc_info is None: + expanded_index[i] = [len(expanded_types)] + expanded_types.append(ty) + continue + + dtype, rank = desc_info + desc_meta = None + if tensordesc_meta is not None and tensordesc_idx < len(tensordesc_meta): + desc_meta = tensordesc_meta[tensordesc_idx] + abi_expanded_args = _get_tensordesc_abi_expanded_args(rank, desc_meta) + mapped = [] + expanded_types.append(f"*{dtype}") + mapped.append(len(expanded_types) - 1) + if abi_expanded_args != 1: + for _ in range(rank): + expanded_types.append("i32") + mapped.append(len(expanded_types) - 1) + for _ in range(rank): + expanded_types.append("i64") + mapped.append(len(expanded_types) - 1) + expanded_index[i] = mapped + tensordesc_idx += 1 + + return expanded_types, expanded_index + + +def _normalize_arg_path(key): + if isinstance(key, int): + return (key, ) + if isinstance(key, tuple): + return key + raise TypeError(f"unsupported signature path key: {key!r}") + + +def _expand_signature_tree(signature_types, tensordesc_meta=None): + expanded_types = [] + expanded_index = {} + tensordesc_idx = 0 + + def visit(ty, path): + nonlocal tensordesc_idx + + if isinstance(ty, tuple): + mapped = [] + for child_idx, child_ty in enumerate(ty): + mapped.extend(visit(child_ty, path + (child_idx, ))) + expanded_index[path] = mapped + return mapped + + desc_info = _parse_tensordesc_type(ty) + if desc_info is None: + expanded_index[path] = [len(expanded_types)] + expanded_types.append(ty) + return expanded_index[path] + + _, rank = desc_info + desc_meta = None + if tensordesc_meta is not None and tensordesc_idx < len(tensordesc_meta): + desc_meta = tensordesc_meta[tensordesc_idx] + abi_expanded_args = _get_tensordesc_abi_expanded_args(rank, desc_meta) + mapped = [] + dtype = desc_info[0] + expanded_types.append(f"*{dtype}") + mapped.append(len(expanded_types) - 1) + if abi_expanded_args != 1: + for _ in range(rank): + expanded_types.append("i32") + mapped.append(len(expanded_types) - 1) + for _ in range(rank): + expanded_types.append("i64") + mapped.append(len(expanded_types) - 1) + expanded_index[path] = mapped + tensordesc_idx += 1 + return mapped + + for top_idx, ty in enumerate(signature_types): + visit(ty, (top_idx, )) + + return expanded_types, expanded_index + + +def _expand_tensordesc_kernel_arg(arg, rank: int, metadata): + if not (hasattr(arg, "base") and hasattr(arg, "shape") and hasattr(arg, "strides")): + raise TypeError("tensor descriptor argument must provide base/shape/strides") + shape = [int(v) for v in arg.shape] + strides = [int(v) for v in arg.strides] + if len(shape) != rank or len(strides) != rank: + raise ValueError( + f"tensor descriptor rank mismatch: expected {rank}, got shape={len(shape)} strides={len(strides)}") + + if metadata is not None and "block_size" in metadata: + block_shape = [int(v) for v in metadata["block_size"]] + else: + block_shape = [int(v) for v in getattr(arg, "block_shape", ())] + if len(block_shape) != rank: + raise ValueError(f"tensor descriptor block rank mismatch: expected {rank}, got {len(block_shape)}") + + if metadata is not None and "elem_size" in metadata: + elem_size = int(metadata["elem_size"]) + elif hasattr(arg.base, "element_size"): + elem_size = int(arg.base.element_size()) + else: + raise TypeError("cannot infer tensor descriptor element size") + + import torch + import triton + + descriptor = torch.empty((64, ), dtype=torch.uint8, device=arg.base.device) + fill_name = f"fill_{rank}d_tma_descriptor" + fill_fn = getattr(triton.runtime.driver.active.utils, fill_name, None) + if fill_fn is None: + raise RuntimeError(f"musa driver utils missing {fill_name}") + + if rank > 5: + raise RuntimeError(f"MUSA tensor descriptor rank {rank} is unsupported in launcher") + fill_fn(arg.base.data_ptr(), *shape, *block_shape, elem_size, descriptor.data_ptr()) + + if hasattr(torch, "musa"): + torch.musa.synchronize() + + abi_expanded_args = _get_tensordesc_abi_expanded_args(rank, metadata) + if abi_expanded_args == 1: + return [descriptor], descriptor + return [descriptor, *shape, *strides], descriptor + + +def _make_tensordesc_cache_key(arg, rank: int, metadata): + base = getattr(arg, "base", None) + if base is None or not hasattr(base, "data_ptr"): + return None + + device = getattr(base, "device", None) + device_type = getattr(device, "type", None) + device_index = getattr(device, "index", None) + + try: + shape = tuple(int(v) for v in arg.shape) + strides = tuple(int(v) for v in arg.strides) + except Exception: + return None + + if metadata is not None and "block_size" in metadata: + block_shape = tuple(int(v) for v in metadata["block_size"]) + else: + try: + block_shape = tuple(int(v) for v in getattr(arg, "block_shape", ())) + except Exception: + return None + + if metadata is not None and "elem_size" in metadata: + elem_size = int(metadata["elem_size"]) + elif hasattr(base, "element_size"): + elem_size = int(base.element_size()) + else: + return None + abi_expanded_args = _get_tensordesc_abi_expanded_args(rank, metadata) + + return ( + int(base.data_ptr()), + device_type, + device_index, + shape, + strides, + block_shape, + elem_size, + int(rank), + abi_expanded_args, + ) + + +def make_launcher(constants, signature, ids, warp_size): + params = [i for i, ty in signature.items() if ty != "constexpr" and i not in constants] + arg_decls = ', '.join(f"{ty_to_cpp_param(signature[i])} arg{i}" for i in params) + + def _parse_type(ty): + if ty[0] == '*': + return "PyObject*" + if ty == "constexpr": + # 3.5 runtime forwards constexpr Python objects in launch args. + # They are compile-time only and should not be interpreted as C scalars. + return "PyObject*" + if ty in ("fp16", "bf16", "fp32", "f32", "fp64"): + return "double" + return ty_to_cpp_param(ty) + + def format_of(ty): + return { + "PyObject*": "O", + "float": "f", + "double": "d", + "long": "l", + "int8_t": "b", + "int16_t": "h", + "int32_t": "i", + "int64_t": "L", + "uint8_t": "B", + "uint16_t": "H", + "uint32_t": "I", + "uint64_t": "K", + }[ty] + + args_format = ''.join([format_of(_parse_type(ty)) for ty in signature.values()]) + format = "iiiKKOOOO" + args_format + args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else '' + + packed_decls = [] + packed_inits = [] + launch_args = [] + for i in params: + ty = signature[i] + if ty[0] == "*": + launch_args.append(f"ptr_info{i}.dev_ptr") + continue + if ty == "fp16": + packed_decls.append(f" uint16_t arg{i};") + packed_inits.append(f" arg{i} = pack_fp16(_arg{i});") + launch_args.append(f"arg{i}") + elif ty == "bf16": + packed_decls.append(f" uint16_t arg{i};") + packed_inits.append(f" arg{i} = pack_bf16(_arg{i});") + launch_args.append(f"arg{i}") + elif ty in ("fp32", "f32"): + packed_decls.append(f" float arg{i} = (float)_arg{i};") + launch_args.append(f"arg{i}") + else: + launch_args.append(f"_arg{i}") + + packed_decls_src = "\n".join(packed_decls) + packed_inits_src = "\n".join(packed_inits) + + src = f""" +#include \"musa.h\" +#include +#include +#include +#include + +static inline uint16_t pack_fp16(double val) {{ + uint16_t result; +#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && \\ + !defined(PYPY_VERSION) + _PyFloat_Pack2(val, (unsigned char *)&result, 1); +#else + PyFloat_Pack2(val, (char *)&result, 1); +#endif + return result; +}} + +static inline uint16_t pack_bf16(double val) {{ + float f32 = (float)val; + uint32_t u32 = *(uint32_t *)&f32; + return (uint16_t)(u32 >> 16); +}} + +static inline void gpuAssert(MUresult code, const char *file, int line) +{{ + if (code != MUSA_SUCCESS) + {{ + const char* prefix = \"Triton Error [MUSA]: \"; + const char* str; + muGetErrorString(code, &str); + char err[1024] = {{0}}; + strcat(err, prefix); + strcat(err, str); + PyGILState_STATE gil_state; + gil_state = PyGILState_Ensure(); + PyErr_SetString(PyExc_RuntimeError, err); + PyGILState_Release(gil_state); + }} +}} + +#define MUSA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} + +typedef MUresult (*muLaunchKernelEx_t)(const MUlaunchConfig *config, MUfunction f, void **kernelParams, void **extra); + +static muLaunchKernelEx_t getLaunchKernelExHandle() {{ + void* handle = dlopen(\"libmusa.so\", RTLD_LAZY); + if (!handle) {{ + handle = dlopen(\"libmusa.so.1\", RTLD_LAZY); + }} + if (!handle) {{ + PyErr_SetString(PyExc_RuntimeError, \"Failed to open libmusa.so or libmusa.so.1\"); + return NULL; + }} + dlerror(); + muLaunchKernelEx_t muLaunchKernelExHandle = (muLaunchKernelEx_t)dlsym(handle, \"muLaunchKernelEx\"); + const char *dlsym_error = dlerror(); + if (dlsym_error) {{ + PyErr_SetString(PyExc_RuntimeError, \"Failed to retrieve muLaunchKernelEx from libmusa\"); + return NULL; + }} + return muLaunchKernelExHandle; +}} + +static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int shared_memory, MUstream stream, MUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{ + MUdeviceptr global_scratch_ptr = 0; + MUdeviceptr profile_scratch_ptr = 0; + void *params[] = {{ {', '.join([*(f"&arg{i}" for i in params), "&global_scratch_ptr", "&profile_scratch_ptr"]) } }}; + if (gridX*gridY*gridZ > 0) {{ + if (num_ctas == 1) {{ + MUSA_CHECK(muLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0)); + }} else {{ + MUlaunchAttribute launchAttr[2]; + launchAttr[0].id = MU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + launchAttr[0].value.clusterDim.x = num_ctas; + launchAttr[0].value.clusterDim.y = 1; + launchAttr[0].value.clusterDim.z = 1; + launchAttr[1].id = MU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; + launchAttr[1].value.clusterSchedulingPolicyPreference = MU_CLUSTER_SCHEDULING_POLICY_SPREAD; + MUlaunchConfig config; + config.gridDimX = gridX * num_ctas; + config.gridDimY = gridY; + config.gridDimZ = gridZ; + config.blockDimX = {warp_size} * num_warps; + config.blockDimY = 1; + config.blockDimZ = 1; + config.sharedMemBytes = shared_memory; + config.hStream = stream; + config.attrs = launchAttr; + config.numAttrs = 2; + static muLaunchKernelEx_t muLaunchKernelExHandle = NULL; + if (muLaunchKernelExHandle == NULL) {{ + muLaunchKernelExHandle = getLaunchKernelExHandle(); + }} + MUSA_CHECK(muLaunchKernelExHandle(&config, function, params, 0)); + }} + }} +}} + +typedef struct _DevicePtrInfo {{ + MUdeviceptr dev_ptr; + bool valid; +}} DevicePtrInfo; + +static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ + DevicePtrInfo ptr_info; + ptr_info.dev_ptr = 0; + ptr_info.valid = true; + if (PyLong_Check(obj)) {{ + ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj); + return ptr_info; + }} + if (obj == Py_None) {{ + return ptr_info; + }} + PyObject *ptr = PyObject_GetAttrString(obj, \"data_ptr\"); + if(ptr){{ + PyObject *empty_tuple = PyTuple_New(0); + PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); + Py_DECREF(empty_tuple); + Py_DECREF(ptr); + if (!PyLong_Check(ret)) {{ + PyErr_SetString(PyExc_TypeError, \"data_ptr method of Pointer object must return 64-bit int\"); + ptr_info.valid = false; + Py_DECREF(ret); + return ptr_info; + }} + ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret); + if(!ptr_info.dev_ptr) + return ptr_info; + uint64_t dev_ptr; + int status = muPointerGetAttribute(&dev_ptr, MU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); + if (status == MUSA_ERROR_INVALID_VALUE) {{ + PyErr_Format(PyExc_ValueError, + \"Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)\", idx); + ptr_info.valid = false; + }} else if (status != MUSA_SUCCESS) {{ + MUSA_CHECK((MUresult)status); // Catch any other musa API errors + ptr_info.valid = false; + }} + ptr_info.dev_ptr = dev_ptr; + Py_DECREF(ret); + return ptr_info; + }} + PyErr_SetString(PyExc_TypeError, \"Pointer argument must be either uint64 or have data_ptr method\"); + ptr_info.valid = false; + return ptr_info; +}} + +static void ensureMusaContext() {{ + MUcontext pctx; + MUSA_CHECK(muCtxGetCurrent(&pctx)); + if (!pctx) {{ + // Ensure device context. + MUdevice device; + MUSA_CHECK(muDeviceGet(&device, 0)); + MUSA_CHECK(muDevicePrimaryCtxRetain(&pctx, device)); + MUSA_CHECK(muCtxSetCurrent(pctx)); + }} +}} + +static PyObject* launch(PyObject* self, PyObject* args) {{ + // ensure musa context is valid before calling any MUSA APIs, e.g. before getPointer + // calls muPointerGetAttributes + ensureMusaContext(); + + int gridX, gridY, gridZ; + uint64_t _stream; + uint64_t _function; + PyObject *launch_enter_hook = NULL; + PyObject *launch_exit_hook = NULL; + PyObject *kernel_metadata = NULL; + PyObject *launch_metadata = NULL; + {' '.join([f"{_parse_type(ty)} _arg{i}; " for i, ty in signature.items()])} + if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function, + &kernel_metadata, &launch_metadata, + &launch_enter_hook, &launch_exit_hook {args_list})) {{ + return NULL; + }} + + int num_warps, num_ctas, shared_memory; + if (!PyArg_ParseTuple(kernel_metadata, \"iii\", &num_warps, &num_ctas, &shared_memory)) {{ + PyErr_SetString(PyExc_TypeError, \"kernel_metadata must be a tuple\"); + return NULL; + }} + + if (launch_enter_hook != Py_None){{ + PyObject* args = Py_BuildValue(\"(O)\", launch_metadata); + PyObject* ret = PyObject_CallObject(launch_enter_hook, args); + Py_DECREF(args); + if (!ret) + return NULL; + }} + + {'; '.join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" for i in params if signature[i][0] == "*"])}; +{packed_decls_src} +{packed_inits_src} + Py_BEGIN_ALLOW_THREADS; + _launch(gridX, gridY, gridZ, num_warps, num_ctas, shared_memory, (MUstream)_stream, (MUfunction)_function{', ' + ', '.join(launch_args) if len(launch_args) > 0 else ''}); + Py_END_ALLOW_THREADS; + if (PyErr_Occurred()) {{ + return NULL; + }} + + if(launch_exit_hook != Py_None){{ + PyObject* args = Py_BuildValue(\"(O)\", launch_metadata); + PyObject* ret = PyObject_CallObject(launch_exit_hook, args); + Py_DECREF(args); + if (!ret) + return NULL; + }} + + Py_INCREF(Py_None); + return Py_None; +}} + +static PyMethodDef ModuleMethods[] = {{ + {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, + {{NULL, NULL, 0, NULL}} +}}; + +static struct PyModuleDef ModuleDef = {{ + PyModuleDef_HEAD_INIT, + "__triton_launcher", + NULL, + -1, + ModuleMethods +}}; + +PyMODINIT_FUNC PyInit___triton_launcher(void) {{ + PyObject *m = PyModule_Create(&ModuleDef); + if(m == NULL) {{ + return NULL; + }} + PyModule_AddFunctions(m, ModuleMethods); + return m; +}} +""" + return src + + +class MusaLauncher(object): + + def __init__(self, src, metadata): + ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()} + constants = src.constants if hasattr(src, "constants") else dict() + + def cst_key(i): + if isinstance(i, str): + return src.fn.arg_names.index(i) + if isinstance(i, tuple) and len(i) == 1: + return i[0] + return i + + constants = {cst_key(key): value for key, value in constants.items()} + signature = {cst_key(key): value for key, value in src.signature.items()} + + ordered_sig_keys = sorted(signature.keys()) + self._signature_types = [signature[key] for key in ordered_sig_keys] + self._has_structured_args = any(isinstance(ty, tuple) for ty in self._signature_types) + self._has_tensordesc = any( + _parse_tensordesc_type(ty) is not None for ty in self._walk_signature_types(self._signature_types)) + self._needs_runtime_expansion = self._has_structured_args or self._has_tensordesc + self._tensordesc_meta = getattr(metadata, "tensordesc_meta", None) + self._tensordesc_keepalive = [] + self._tensordesc_object_cache = OrderedDict() + self._tensordesc_cache = OrderedDict() + + expanded_signature_types, expanded_index = _expand_signature_tree(self._signature_types, self._tensordesc_meta) + expanded_signature = {idx: ty for idx, ty in enumerate(expanded_signature_types)} + expanded_constants = {} + for key, value in constants.items(): + path = _normalize_arg_path(key) + if path not in expanded_index: + continue + for expanded_pos in expanded_index[path]: + expanded_constants[expanded_pos] = value + + expanded_ids = {"ids_of_const_exprs": tuple()} + if ids["ids_of_const_exprs"]: + expanded_constexpr_ids = [] + for key in ids["ids_of_const_exprs"]: + path = _normalize_arg_path(key) + expanded_constexpr_ids.extend(expanded_index.get(path, ())) + expanded_ids = {"ids_of_const_exprs": tuple(expanded_constexpr_ids)} + + target = getattr(metadata, "target", None) + target_warp_size = getattr(target, "warp_size", None) + warp_size = int(target_warp_size) if target_warp_size else 32 + src = make_launcher(expanded_constants, expanded_signature, expanded_ids, warp_size) + mod = compile_module_from_src( + src=src, + name="__triton_launcher", + include_dirs=_musa_include_dirs(), + library_dirs=_library_dirs(), + libraries=["musa"], + ) + self.launch = mod.launch + + @staticmethod + def _walk_signature_types(signature_types): + for ty in signature_types: + if isinstance(ty, tuple): + yield from MusaLauncher._walk_signature_types(ty) + else: + yield ty + + def _expand_tensordesc_arg(self, arg, ty, tensordesc_idx): + _, rank = _parse_tensordesc_type(ty) + desc_meta = None + if self._tensordesc_meta is not None and tensordesc_idx < len(self._tensordesc_meta): + desc_meta = self._tensordesc_meta[tensordesc_idx] + + cached = None + cache_key = _make_tensordesc_cache_key(arg, rank, desc_meta) + object_cache_key = None + object_ref = None + try: + object_ref = weakref.ref(arg) + object_cache_key = (id(arg), cache_key) + except TypeError: + object_ref = None + + if object_cache_key is not None: + cached = self._tensordesc_object_cache.get(object_cache_key) + if cached is not None: + cached_ref, cached_base_ptr, expanded_arg_values, keepalive = cached + current_base = getattr(getattr(arg, "base", None), "data_ptr", None) + current_base_ptr = int(current_base()) if current_base is not None else None + if cached_ref() is arg and current_base_ptr == cached_base_ptr: + self._tensordesc_object_cache.move_to_end(object_cache_key) + cached = (expanded_arg_values, keepalive) + else: + self._tensordesc_object_cache.pop(object_cache_key, None) + cached = None + + if cached is None: + cached = self._tensordesc_cache.get(cache_key) if cache_key is not None else None + if cached is None: + expanded_arg_values, keepalive = _expand_tensordesc_kernel_arg(arg, rank, desc_meta) + expanded_arg_values = tuple(expanded_arg_values) + if object_cache_key is not None: + self._tensordesc_object_cache[object_cache_key] = ( + object_ref, + int(arg.base.data_ptr()), + expanded_arg_values, + keepalive, + ) + self._tensordesc_object_cache.move_to_end(object_cache_key) + if len(self._tensordesc_object_cache) > _TENSORDESC_CACHE_LIMIT: + self._tensordesc_object_cache.popitem(last=False) + if cache_key is not None: + # Reuse encoded descriptors across launches of the same tensor/view + # so repeated TME kernels do not re-encode and synchronize every + # descriptor argument on the host path. + cached = (expanded_arg_values, keepalive) + self._tensordesc_cache[cache_key] = cached + self._tensordesc_cache.move_to_end(cache_key) + if len(self._tensordesc_cache) > _TENSORDESC_CACHE_LIMIT: + self._tensordesc_cache.popitem(last=False) + else: + expanded_arg_values, keepalive = cached + if cache_key is not None: + self._tensordesc_cache.move_to_end(cache_key) + return expanded_arg_values, keepalive + + def _expand_runtime_arg(self, arg, ty, expanded_kernel_args, launch_keepalive, tensordesc_state): + if isinstance(ty, tuple): + if not isinstance(arg, tuple): + raise RuntimeError("launcher tuple argument does not match structured signature") + if len(arg) != len(ty): + raise RuntimeError("launcher tuple argument arity mismatch") + for child_arg, child_ty in zip(arg, ty): + self._expand_runtime_arg(child_arg, child_ty, expanded_kernel_args, launch_keepalive, tensordesc_state) + return + + desc_info = _parse_tensordesc_type(ty) + if desc_info is None: + expanded_kernel_args.append(arg) + return + + expanded_arg_values, keepalive = self._expand_tensordesc_arg(arg, ty, tensordesc_state[0]) + tensordesc_state[0] += 1 + expanded_kernel_args.extend(expanded_arg_values) + launch_keepalive.append(keepalive) + + def __call__(self, *args, **kwargs): + if not self._needs_runtime_expansion: + self.launch(*args, **kwargs) + return + + # launch(gridX, gridY, gridZ, stream, function, kernel_metadata, + # launch_metadata, launch_enter_hook, launch_exit_hook, *kernel_args) + launch_prefix = args[:9] + kernel_args = args[9:] + if len(kernel_args) != len(self._signature_types): + raise RuntimeError("launcher argument count mismatch while expanding tensor descriptors") + + expanded_kernel_args = [] + launch_keepalive = [] + tensordesc_state = [0] + for arg, ty in zip(kernel_args, self._signature_types): + self._expand_runtime_arg(arg, ty, expanded_kernel_args, launch_keepalive, tensordesc_state) + + self._tensordesc_keepalive.extend(launch_keepalive) + if len(self._tensordesc_keepalive) > 4096: + self._tensordesc_keepalive = self._tensordesc_keepalive[-4096:] + self.launch(*launch_prefix, *expanded_kernel_args, **kwargs) + + +class MusaDriver(DriverBase): + + def __init__(self): + self.utils = MusaUtils() + self.launcher_cls = MusaLauncher + import torch + if not hasattr(torch, "musa"): + raise RuntimeError("torch.musa is not available") + self._torch = torch + + @staticmethod + def is_active(): + try: + import torch + return hasattr(torch, "musa") and torch.musa.is_available() + except Exception: + return False + + def map_python_to_cpp_type(self, ty: str) -> str: + return ty_to_cpp(ty) + + def get_current_target(self): + arch = knobs.runtime.override_arch or os.getenv("TRITON_MUSA_ARCH") or "ph1" + warp_size = 32 + return GPUTarget("musa", arch, warp_size) + + def get_active_torch_device(self): + return self._torch.device("musa", self.get_current_device()) + + def get_current_device(self): + return self._torch.musa.current_device() + + def set_current_device(self, device): + self._torch.musa.set_device(device) + + def get_current_stream(self, device): + stream = self._torch.musa.current_stream(device) + return getattr(stream, "musa_stream", getattr(stream, "cuda_stream", stream)) + + def get_device_interface(self): + return self._torch.musa + + def get_benchmarker(self): + from triton.testing import do_bench + return do_bench + + def get_empty_cache_for_benchmark(self): + cache_size = 256 * 1024 * 1024 + return self._torch.empty(int(cache_size // 4), dtype=self._torch.int, device="musa") + + def clear_cache(self, cache): + cache.zero_() diff --git a/third_party/mthreads/backend/lib/libdevice.31.bc b/third_party/mthreads/backend/lib/libdevice.31.bc new file mode 100644 index 0000000000000000000000000000000000000000..6833ca5d016a12fe5e70447af2de81b37d6d31bb GIT binary patch literal 372556 zcmeFa2UHVV*EfDrNCJi=6fsC~C?cXlC?W_*Am~L!MX%mRy_O(|sGx`qu`mz>L_|bU zz!nf2dKG(@W(CkuTgjSYT$+oLNgJXlDvh zJhNvxVlIgmH+a$}*#3qzG%WL@rv^!A5t+TzEMv9w2Fmpp(?*DWWx+x}56wb9CbNh4 zbxT}J&@49|aUcp`BlM%uHD<;1?hzEP6^_YGM@-I_C2`F-#iD3m_6YB+n5}TIUMt1H zq+bz--bZMnuA4B6*>_}6f>wrkrqu{MCFY1Q6ZI&W6{CiN)@z}Ga_bS&@%ZI5nwX}e z8_1Lv6=AU#U0hhBCu$6p{Y`(0OB5UoZJ3jleX6nzjXPN~3gwH{dk zOuf|N6fJ=-<`akz&Yf#Vr=u%3IGoAJ{Ujh1(V|Atd+3c2T#X))?#Ep}YGR4Ne{ycP zWIrL^fVt2zwf{9NoKda$mmQ;r9_K)+J{(Gprr`Kmty#ygDf^@z^_p}$ubaBA3S7bI z(Q5j7oMM5se=b%8pzN}53bH2Wip3X8bEPzT%WRmjKp-+p?k_GZ74H+%Oao)pdOS_G zHOrkWE)I&)*Bj_N;wJYP%xK8nQci$gxxE4?8gB1N({vc5uBMTeIW>wNa0~R5EA^-> z^_AxvR0INBAHfiNI1_tOKNHdEb7JVa-aE835Ed3F_C>UGd&XKihqF^3>59dTrTJNi zJ_~4i6z|m1)5XGLL2C%5%weK|@w@c(faWuo27v8iZVLwLVVI=76v1H)T;f{1TR$8K z(1u)GP2E94)8oX;LF<#nEKM*HJ|w1GfY88jiN+Wrj?u#SOm&njSxme}NOK`4tAF>) zk>3 zNE5~H)7P_Ph#ha`iW#VIZY{5_v((pN7~nyHHh4K5PujXK!h!)I(8&bxM%5r6?-iLgXnr_?cky`3+db@5&f9&6#FfMd*Edm zem+x9Y*{AsKtX@$-#@M<4!I=?9Q0?zXp37?J8P6OKi1ZwR{Mz`GfK1ghr8%KGRoog6GHy`O&)j5wf+= z8S0B0kE4|v>_y_usaa^!*aG@_pWeDSeeS+d<_WX*;`G%Pb`hcJUOIsjMwk>kOd9P$ z8#P9EBf?9yAnrFZQ$|b}Jz>d?&0;7gZ!#Y}M-15-4FciPco4q`UvCJ}VF^9b@h_TW#Zjn50 zd2*cN*|;SbJ3`DxJXC1(G*hzyn9z)?O^hBtH>{Ey6-F7C)8yAYQH!O)Vu96y7XBy+n@VJ?2 zGAG1|;ILOX#f8b^9N`iUJK_L)g=O3dbYE^%{k&H_#=|vZ1SmVVK==)&7KzO9DzN}z zEQ3T- z^J0r(%Jg0jdKi~w8a8BV!X@A>jUL7& ze#ZFVAa;1`TIYC%lraV)=H-s)nAX2#EW}37dGE?uAHm{?-<-^Q8_EkQ=RFtm8=&C3 z-UW+aaRogm77aZ~b1g1%DVI1WGTbhLfufx5%AC_A&S3gd$Y0YfHV~x=2+^BFCuInfe z(2@&TrdGg8?9CEu7$nZ^_Hfp4l#SvGSXtU=Sfz9_U95o%gGEQ^n3|aDSjqfMh-FkG zZC_Rl%*g4a#2My$Q8M%_IH$}Nfj__X<)Occa0togy$Iz!&*z0m`7cDgkIlT6a@Ln* zULB#de3}>{QW90xCHF=2?H?v{N6vJBbV0iI5DX{h?b0N zG%yMc>r4!*G7Z5qpMtAL8JBn%qvhf3<(6zbC~iq;oTD@@%qcEZ8t0h%GYiR9db7g_ z_pV0R5V03q zLQa#d@Mf>JWFx+^hhL)^H`0DMHV(-!ISz;;f*6Y}A+q9tc~}}-s#!%dUgO8BL6vfy zDq>oEoZ&5oj-Z_Ve~Y~eD(Sc`3T710&}rJM?nHebC8K$anyGzE<;b!k3uEqQp7$0}JD06}IdlDR5 zVvs9w-XU?yl^9h14C(^6Y=$%8t1G1cIhkR*P)1CO@MO+n$h|>vVNfo^!D+$G*eeit z=3~gHSww-Z!i`FvL(Vj+%7>h3*znv4Qtn$5qhg}w#xtDr|8sJ<2aA}MJF1Ayu>AjN zf@utA#91C5=Lp>ehaHX+alJT5whUSVF&lz6(-tYVW`xK@gdhV`_`vL8K(qgl%cym7UekX}QpM#3#H{#=_Al770^`tUNitcU0)auoU`{l8=$jiQ zU~*vbGZOmbIq6b`#1?qR>-|G zL`+Y+C~-_(x|r$N;$xX?xP!psJ4{n7CLkJD*NOG`AYj1jplDl`XN&l6CjEI#fTx(H!-~;)<6MMBPvnCIM>fI^%VBk~Z-W<}$yLZ*r4U8yJLBLK`M=6v0eIx&N#GIhE> zM~CGAS26TKV{}bIKe{(0V>k)wG0CoXnNb1@E~pDj2#B^23~v$ZFbgs;-5bm(OAA4e z7zBBE?^9>==G1v{J+;8hSZ< znV1~`J^IfVa8jUpM)4Mu4XFJ8KrCWIJ%_gzV{D)SKJbD#1AKgc@d-)heZLk$TEJc= zXD^bnpS)09Q;o@7t8JjepK8oE zXk7r=gBXMwZ(K8D6}_f0G2(<643XXJTP(%hq>3vPraWovWqZX~Ot9>pDIK>L2G}Uu zB43|-Tt!MXm-SEu24WR8sc0L?qCRWWl_qkaz;~sJ!<0#rFPRh5{NYh^M8z|-m0S+_ z=U$3YCb$-=${~^VMM*@(N^S^Nr!o=b?|n{%!wq@ZNQEB$3!gEUmGq_k(sr95!Q%^} z6G6~zj@6v2xhoNka8d<_8^dB|RlDK8lo`{<)Apk>h6;YBa1|dzz_r^V@8f*Ad29hr z9K+eB+ zQ8PrYhz>7*4BRBO#3usYB6LX%kS@vDmHrVtS)pOYNfbLVRPqpx8Qqc94a|E^1q2eS zJNk$>gcF{WWkZkA4PJ4jD0{;0gM4_4kPibx0PLRoB| zNn>h%OzMwl-82fq{N$V?2iz5a6xzTQz~+s~gm-C58nb$uF$xG&fW4cP3Xa~bRRuHd zL2vG#Y6xj4YwUgH3jn683@pRGv?mGJsd<%F@QI1CDd#=5{ue@WWsRj?_B(=@IQlgt zR_PaJ0WgNL#^&z7N(MIKT>=2(Dg%49ORopXlv(b<@S%{UvMH}d7fk9w!r!~7`aFf3 zpZp0~Xpn7wAY^70r8NnsokB>gtlOMT+#`P~JQOCPFaFO7dO`WwR3l;Q0?;_(L$dUE2z?jg$V{SbG8;8uaR zGq^lkQL~iNLXabVAMz&5fz<{fX0s4~U=<~9i z$=cIrC%O^AiYv&(1U!u-2B!}W5%6;a03aU)*f1&VVJ;xH6R=}lOEHA4Vl z_j7O$mN|#kfqW+XM+^WIJMojD!z+NNID&HoJgZGS(AeW$e;HF~S=SB{T5fks$ zokEPn>I)BxgqOXA=)uo^c znsR7NbOU&#&^Ti(Y5^#8_>UM0<1XUwa7J`7y#78Qt6yZtL#D@-;aNRnu*?w(QieCF z#iIy(4}<@Rp~%=NZ8#0OsNTxH*S1PbshD-c(g=8j?uh|_VkhqHGZ8)z6DyA390AYj z3EyxN_@1)%dbYE5p`k%E9R5$7m3OpG-7FsJWWq9)>e!l}efik?zEb%cw zp{=aF@bU6#(8#GP!)vw-wgOBl+Dp~7O9c!n@TO>_jfRL+foHRpYeC9;leT;I(?I|Z zBWCekd*eJ#06dk%a_oToA;43~d!-Z0g8`mO-jiD`aRPX10FW5)Qj#;HXYZYeF7G}= zoQ1S}Uc+b{_?a@i#(=eD{SeAk*52^bbS6|1u`;}@KMN?nx9&*!2JlA}cx(PuUIH>z zBDZZo&}_h@qCI`RlgAZM;SIn>z?)>y;Z3^fIRxOT zght97kI_&`td)&>>?Hmdz~d_$w|HuE6~N;HKw`kl*oj$#oJ?sbUpentRqE|J0|sIc zy8I6B-ohStry}^o3@1)?!<%<`?Z80@omGZ+wZ?NBz?-J5y-|y!Bw$=qWq7IQ^Unba z9c6fh(;Mo@#{FhJaS0SU76BFp0FvaS_ZoN}ASlC|RMjJGI{1>Zajib8X+WVXSB5vK zgT5E)aIgxz$jm~}o`W*Hwme_L;0DU@0z)fX0R>N4dkdVr)&M3Icf~J(U*Eob7>wr*$vi45xVn});v_x5Z=Wj=uXd*OJS$i8A@{?Hz z87XUT#uBMF1K}##a~UeihPw)EW#if#?Sq<&DNOy{_fmK9Zb5gVQga(8#!-se@Iw-N zwitTApF@;g(}*6SKbW9<&JExZ&fPr`(3m8r z)mTn8?xcABZHjTPvFAcV)l6Is14GbI^{uD+Uc?8g?|o_$P+Ed%tK~#P^~-3GmTQJN zbOLYXw4AfNPwzDl5-K$}kDmmQn@_|&3^Ad-Q7_nW+8oBPS2Nd@Mm9+P)4<$5ucJKxBNRgY+#hycMNB2g@P>fsXQAjavx`{1Cdo9US z-`nTyO*XF8H1mvmpdw}MO>)-OCby%F#_OnwC51`!dn#*<=#g3PAvA_5NA4%{zbTP> zHwCBoUc^XmvT?2MoA0_0)>DReZ-F)s+R1(B6dLn3uCp*At<=_*#B7FeQE3T|&#s}s zOSSi_1b7@}c=yw#b&yq5+R-mQJ{UxPcwJ;a;S}xFkD;t{bJ{F2DB6qHNpuA#P==RqRh_E6S}%(B ztQONp13Z<6>XS1S-XA-q6nJOGI+2Z=6V@+{3~$m!oj=INZLCut0XH$E%2i|V2dZ&3 zRz_2dd&=!^iuOM4!YS}_#<^19J?@vbl$2N|wNbTa^-aAW+4nwmF)7BKT0jkr=h7%> z+*Ly(KUWN$F?bJC;Xv?q?8KnSx-Utgk>fZ}?=iHa%6YG+vEC@M_D1Qhwj}Qq&+LWL6p##l%!V<%N~_(*=o%CMp*W! z+yZ*B|LPWUVoA7HNb$X|^QgX8c?xczk&N3m)!{OYDOOXMTk)5~YV#mtvi2t3(%VSZ zUPAEt*dfH!#ER=Mh!XFfXVf;6JC?>ro|MFLX~EwO7zG&J_Dr{ukRch*w!-qhFBs_|;6)+T@_RcV9oS)0NN5ipD4Z$uP* zsrS`K!@YVd0zs+#Rb7aOH68Rt5c)81_+ye9*-L9K9I%M%U?^^mMZ7mP1w_d83yw9IjRY+0tgJ16N;B0l9o;09>7~0#f7i`hgHlLKTpEBNs(MWCtjNv=TUo z-h)z9K(3Xsh*XiN0T*Rqt&uMPC(=?ZX)o09m2}@=K7k5w6x()?{N} zP2xtiNCo81y;11^a*i^{oID4+kI-eRg4{l0AAn3#0aZ;RSat3DK~iemC{uV4z6^2K6YYou+vA<(#*=uQ3!Wyag{38-+%+~RLVq2 z>IQRikzIN-)(*rn5vG`iBAhkmNGN6EuM@{8E61jX-jp)Y6>Ldfn&oV9+(J%t_k0X) z8j|7d*Bnmnpw`~MRtWK+vNX$>Z$^Rl{!}Ifp3fs+3cTEqffRU898<{fdaf`SNzTJL zmo#US7q&~Eji9tBri+Rw_k3=gjiU4bOCEdIgV9ui$bW%3rA#;u+2TwJjnth6lnzSc zt|p~Lx!TC0e27-FR7Po0GB41G46G6w%}*4I3zZ^kzEDgqvQ`;GC>>PKN`qu_51{e( zAK-nV!ke|43h()4D!lS%RCpi%0iL!i6&}}^3XjXDYA=>e)m{-5US*hre^Ni7;XH#U%$1F5%v`$(>m6JD9MLqD&g zmNy@@zCt}$QA=feJY}u7dbJm2wen!tK1i-mNEG!HG8=sC72f9#WG|I6lyI5v4PAk+ zO6#(z)s|w81)XUWbIe=gLsm=9Fn5aFo`zdDUq`Rm-7GrtT54;b3bNG_zVTm?tu{N_JB9325guNY8>cU@Xi9x46pWM0jGnkYLQnTXs^g;0L6XjlRf$k>akscMfa}!+KVfCSsTQtULQ5dex z@s5GjnMy7#nZ)#jHJ3{8?j4|=$sz^s$DUCTX{vIq)_YNI@9!K=neU`WroWTDRAY%& zJm_9EcnhpeKarEhm0szT`F>hY>AtK4BeWr{=d0LP!;EU*Sgp07Qz?-VVPHh(f3GN> z(i{%;_oB2nS8b0sliQnC;dn}J*_Sww>;^fLN9e95x!*l&v-M>6E7`H>1=;<2-sZ^2 zR%@JNPKl0mhb&5TNPAO**Led*?!IzzM`*7h!AqTLMm!^wPuLuWm{^!t9o<9;-o91w zl;E8e5TU5 zb*5_Xso{7^Zds8;^}Pwd8McvpFZHll0(n`N@L&^VS-13UF=bhovfO+Vzz%3Of0l@fT7Gy7$B^84RAQvK}A0+KZA3UJNs(q=Nf>Hc?W+ z*NxtkRM5-k$V`&lRDBELsX~=HUc%qZ_9CZ(Q?pv+u$~N7>04&_SbhXA)l?2%wT; zTBP}ISU{ccydeb?xo@_MLC-KxrTKmvLnJ3m#c)0PrNonbrZG+@p5il?7THne>*btI zk$Y-~&pAax`z6&J(?e_~}f7`0btBv~z2@2sP!CG52qMJ-Xw3Mu7d zvP%@@fsKnh#hqY&l?3>CEJn`HJtO-G$oVKB?##^kz$6~n zPR=0|~dc9-HINfmfuO#zeiJjnw0=TbW9w%do@;UXF{A3jm}F`#U*dh#uBV(LGJ)caFx$!t@? z;+W!`LM?W~U!;JlR36U!logf<_p&vWd$5M_=O)6Bi>Tbw7;tdfK=3C+6>k}sI(8WG z47FGx10;dmZKKR|NKYTl8nbtkItEYB{~W>zFZNa3028U)$7}k1ZUD?z>9dpAiDo%) zn~{lw=pRF3C7hHmwjexLtT=}ea@F>02#NcaDxdsHC(#(`K%;E9o)@j87~E-8(M;ICD7G$K27s|Q|6a{m5f7p+xZ&Gd|Rhe;kg`j5W$m-Dn6s}re}{4q=(pFd>N%?20xDjHX(+>S2I?> zh=#?6}buWKW2= zIy(pCR;X0WnzI2z=uDgd_kBomR!=)^NTZ8?oB%&DNgZSTXA+`ezs^ByTK75#^MwzH zy2Dd3kqz~xZ{WpTl~PzWE!&ueFqKlcbk}G-K%K8rTg6K<3*jrXLw|{ZuQP10mJ%^> zSB0`+$-`nY<-J4J{-HowsX|$|^nDqbvS+FFlnm0dxK<&(tj6HPbCW-ZaKfiIv;G1f zS*|jjhttmvrqSWa+n)oH!{zlaK=fl3(WAm&RfA~gK7I_)q|j#T4sz&i+?F&JC!W#$ zIfN7Lwaj`5qE({){)$)4WQILQ*=*fKVpy_&Thcf(!#AH&+JK?T^UOgj&vlX+-ixr= z27Q^7hAYsB!AQxYu525{C*|R#wd?mno`X-%z~C?-PjC-bt;(c)dn@yD=zKcLH&V_v zIS;nkPu9WU8M|L2cA|OS5;=sGu`;DqQ!fY9lT3LqZwZBGN#`~PDo>l@a0<^HP5r@C zo@a^`JXNG87^nXOPq!NcPqE5V!IuK=qKRakr}is4NAVxQ?27y`WXgm6@2rE5d7#f# z%6PxezAL8iygJ|xg{RfF+4reDUks|H@_aPAlFD;~NWpWZvUDp^;6L#E%TB>lU)kst z>3)pyG^a_|=)3drcyfmD$9zS$d-L0Yj59H}7pR)#Jkfy@aFuF9u? zJQr{$oec8+@kNZ^$skj=Ywjm&GUs-^=0~z7d!{=Y<&ZRKHPOIHOwr`~swIa=Ag#8A zR7fY0-SSCD?I|+Id1uDU{vd<=l(*rG70KL<&jU68P$MP4o>oI<9wLEk++pAtL;-34 zB%BN~=j?*}RFLnNeV~B+e8G+ zy!U0y9H4}cWR=q3lDxT$KgpC!qm$w&lqKbh7}LmxyT!ZU_vgwM zlPOcrCvBrpmOWS`TSBIsR5bPhMS5N3T-h=*<)@oTUnrFA4;RUnlPOQ#Ponai@Mr~< z=h|mwRGteSM^Sk$Z5-RrhCKJ;CvzDq$kKa$NcxRJ`Sj@`Mg*DiUPls@=jUfDs66+^ z-lp=L@IH#lGj-!e>f9H!{KMR5XWXXFz2w6`%ze_HjZ~cVAO8Vo-y{Ek)ARE`;Ou#3 zV;n_GGrr7atRidaYToVr6v}0-ix{iPl&K}RsXWWuR#15^y~7ta^gsyK>Pq$Z!kTLF zOma%x`K7iEC?i!UwIi$YDU>(bK16ZJl#?SLqX06`uN@yKJgs&|KBe+J^^L*87%0Dw za*verwT~0hC-+EhoYnAsCR~LwfuA`fzZaSEH76?SGA!TN%IlP=BRRZ}5#yz59vmS)xYAk^hvi;uMW6U3<(iF zVpOuxnzh0d8j|vmSkXyh(7Af29`|Ly@<-)6bjPP0h=Nv3I-KL4nP>!JOe>HDj-O7dn_T08Sk@X8qDK7Yt<8ImABm}k=( zStWbdwXh|+Uc2_ty92o)W^dLsUYp<5lw8v?Ag#U0df3hmiNWj0%r6Hcw&Z=YTm5$Z z?XTyPeC{|L?;851)3T$NWyEXF4q5QAjgh1C^CAqSt=|%jJ})kx9gaRwFy&vYikPP7%_7$;is z$8?J{julj#KbijhL`xUyekb~ydtzTi5LHHpZ|EwTOqWJ~yd4+vY`49;Kz58NV0v^i zJOp)l@Wo%|^TFS)X``pRug@@J`Q%#Mw1 zZy$UUWI4Ch*N$z>b(fnR+%xFic9Ylqz%iH2244F%xn0t}|M~V0CNnmF6TNc?I1A$f8n2W`ve{^Sqqy3CaHRq3y&bhyLRnjbBRu6S;4x7bNgTeeh7^Fe}ZwBq(9A@yW?b@5R z{g`H`udkfOlsEXDGU5j@SUYv_sh=YckC1B$SB*s_57P8oaYw0ODD6OfY11FfD!ib9 zDGxX}_$ooXR|ikoz2LMv5bqd^Gmpi$O`RR|Byei03tZGbf0b z(S7$l*j`N#KVHW${wvS8qv_PMT`ntxho;Ow^+(Y@?$}qkH#`2ga&AJ?xYRqpg^xKm6P&D<_pYrS8^npjf5c_iFK6pP#1Q;x z>7suR%uaXTDBI{wTjo6F-I`fCd{!d*?O$6aK63Z@8YwhG9uS!D=Ubm^oN(cFKjN3; zzHPIOHth4KaLv7`W4inj<#YZVx;IjIa(<+~nQ%{aPo@AN&<$?+SUf<(psmk*|J1!{(iGIJqr>YUvu!Ri&NgQ>XHlVJ}~;jy=8X|tJ{eT$L6)DC&g=YDG_w0M`Gyiv;J`GnmC`!0KeChI&!*|Vo z)X^t&$?%bXJ_V4Z-3iv^V;S3 z{?dINIp2y5-kzLWWI26Lmv+&z{?}hW+*;?J9Hq$7gg0F2qJ6j0`$+xK;2}%8dM>T) z>?0O1pF|FS;?%Y67<1*S7k|F&aJ~Pfz<6fc{kCBnKdsKx^N-^Es0N+xXFCk`4BGwn zTsnVgS(SUp?$3E$5}T)sUY|ET+tR9?RK4Y7%( zy*}c&LqvOrd!6g*vXIh^ebf17x6cHh+r0+JE_7JDtv^||t*tq$rnBn#)Gr+16wxMkn0d7-hpE6v7j-phsDoWCx2Np2SX zWwo`zWqq4;yZ_vjw{shT%s>A8SD^PE8F^z}|@ z`Ji}vx=W}xF?^zRqMvtxHQe? zedN}*KWjf<>S7G7aGd+}lGWz6m4Jva?NA_XoA^K9FuxA9OT#=Yyt8U9#Xa zLG7~DO)afqT`wCOo7Ox9)1STZ;hb|lki9s#Yf<>N+Dly$m$H+t>tEfDZNp**>1Q~< zdDP*4;4p9f>f#cobQ@PreUA5@^Ow4cjwCGf{``JQRcqF@LmfpyK|t1&>wIUi%ZcY3 zZH63-9lWyR7;xhr>D%MM*Rw^f<@;)a3jG?YKp}I=3f=TG7N_&ia2k)T&dLF@%d)Jh zU9R)rwW94ui(gxX09n56t0H&fF86f)<#Bm~E^plfGktvY)05ZEK3`fxuUWYqdj0J| zhx=o*UH3n&{^FF*9~AM<{ZQ2^nCZGwgZ9TN>?ZKTumOV6L-`gD>Ytn48 zc60{ zY?%!+ttz|x>`tp;Qrm>%gWYP{M+4c}zt-H$ZY`UYW+UajJe;)0AIPk$f^R&y@OpFG zglh+;#@6%)vgAWIcFU(0g>T@0$f?hcIOGRpz4xs8+VS=A=C+FVBavloTp)`-yuEDC z^qY_zB14|#?n;otOgnNe-01jvuC}%1+P-=12fp5c{FWs=KW}x5MVgKIl%&CtdG4UM z`FTs)t~c@9TRq#sAXKt(pUbVA7j|^Ch8>r`d3j6-*bbX@X3YC~Voo|g zOe4@+d#W}Ez)W|Z`JDJ} z+4;?Fc14TFo(P-=9Pebl?-g(EVqjnx}oBsMa|J#cvNo~2?g9gP8z71rZk(ZAjDXt1hvKgp<=ygKI zLLfW%tp4`t+^-$2(wCQdpNKF5M@nqEdHsa@J_wn8T^m>2II;w$(0ljU=?TlLp0-Z7 z{JyN_@(LiU%Q4uurfuW%B=EPQRnB`N09#f2D}xgamYdt?H|?M0Sv>}_!!4CnuOH_u zNVAFPQ!#KxyBw^bf7^dXQP-oERTiZ$H9IYAgS|WcrD3iI8dnA!c1ozU2$P+ zN5lQr<9U0>wOF}9;=;q8H4ZaQ>6>m7G4IgjgXh}_n{K{PQgk__u{EsZ%-8x09YA*X znDD{Uk_$HJHaUg~R=lddkfL8djhxcb9{ITyyU;u)tG$>=s(YvG``YnkP`XY3Rp(c4 zud#z-G%e|JLhY#wA6oP8oJ_dzZU?03@}{o+3+5$#N#nPBcXd6wFNIA0sLC$!n{9Y( zTW-$BKc8ktLS5K#;>KpznP-M2@t5vCBcFF>A&{NV2$*;M#rc=5$w_tZ&YP`(DpR~* z`-e=Y9CJ_z|J|voSSUtCE*)K6cjiCZ*j9M7rpMN$E1+g&=bYc=I^ztMZqqX0Eq0^m zD3CpjYj=9oJo$5LXv(Xv=WFAk0$vnpq=<%HcSy5w;+Hj*v{Zx8KR25lP0F#}*ydYt z_(`18Zm6e+&s#^IKG*1%#;>kS3SE=(0l4w=8iu6=gvi=jUe~XFd9f6#bN1VY3bz%9 z4yM`6O*nEiVUGp0E5Qk$IM?OVKetM|j&8WUQvCx&z5UUbL65f$OXHh4Usx5HGc>P@ z(U~$QGNmL zeRtpsDCFF**UtwY1#Cm!JpEj@buTm+9kKo!HxG7R--dE*7oIuSqoc$9r2CqUEfLS& zr`b4k)}IUMx&U3x?K1xbr{|P>X~nk(?3P)qfsQM?&ag_lZky8v{`;F9;Tvi`fKX^? zz3)x^i}=$z^BSkLg6|Kj5?xp}M^{}PzDwKSu4O@HpFSUBJhE9$2t^ZbZ~LsiDL-^<5+ zO>ZI=4zuq(c`)b8@HSXDI8GOSy*0!A-fDefS#o8C!L8@o=GQns+G(bGgzX=-n_m^j z?ajH|cjV)m!`~trTHHrz4}E;!=|G0xx|A5+{#=7Qj?+@iqb@h(ytMds`>S`SbL8iC zr;7P*+Vj@jT-hgX{)Hsw*s_(k-dTV9W?GQ2IzDT5pS)wvSSJ|Y*o>M{t7Zq;mgQ41-EESbGM0t z4CACYbdv#}F21h+;T}I%PQTV{(2;kq-_*aqaPXjiKQle~JK1vs^Y3Isx!=jo&XIg4 zd;9)7nM3V_ZnBtXGxWO2=IgS5Q;kGWEpN#j`bzTroDTj^Kv9X z_F(0gD-TnEY{8pLc}16j%;9#}+pp&bs|(Eib^0yT`Z9(d6SApux{K@c&8Rl~$*V73 zwSvBtn|9B4n)GhP2z9zIeXXzJiH&1Igm%JBQ+p0>c;&fapf1fVUg8u}l<2A0j$(1z zB5pn6cW**t&-FAeBzCYc3>2v;$YchvAr0(I16!Mvh9|;+ehp2BW1CPsyR#(|=B#O4a{)Aq0f2kegmyW^BL z>n@YBR|c_%5IYvIhv3BK7NJu1GE4TdRyAO|9QN;lJu`%`#YNnTw5Wl^E+f^vPx-ti z5o1;{v0F^{o-u}x{fHmgzY=9s;>Wt5OzgSRz4MCm*&-*{`6iwL`|;R-(9UR=y9}4y zBJTCMaiNpr!a`+BU~3I&+!A5j5_z0{+3) z=&Yd;djOQ07&ZH`Ca#q^l@__=O58vporSJ-5|?;5EK0np0W*zT4tQWs0!wySTGX^! z_}zclQpK0|L1b1?;Q<@x^s0Pr^b`Pi>$~L2+)m02V6UJ=*xIMa^=yGNY(jIF;aci_ zauMvhBxNsBY&xR<=o+s{%70VMYF6xn0()J+Mgj|9pA``gwnh0+&4P_v>U?>vBD0z% zKVpA~%ICeBOpIVNo+~CsV8I5HK7rUymx~g&vkcgO$hma#MX)|h7PeN3@Fq5V(l3kS zy%6x46=>gyc(6wc?A=1#r(Xg0>E-Zmn6Wn|iRbF8huL{!W>JZoHvR1-7gz?k+S++0 zC7bN=7KGA1%O~sk2m39UgDsmKws`Ks1(>aawa8k8SPoi2>k07M=bN(WH!p0X;0Ux$6`o7OrE*?efA4X)V z@HmJvB2FiV?kIACeZR70uvr&?=tW>LN}w%{X^DoZ=Rs@?0sLl7lEY zyPL&-i%HI&`|rz|r(p3vkQMk-)(<_wH&9_-MJz82p1yUx67NC7G2utrLp6BJ*bn=q z&wr+is{BFsLr+Zlfti-2UIQV*3v+2ET$-33Et=&U10U3QSjMR62~<~L89yDq-vEP< zMdHy+lds7lbS**2I0lN%1jYJlxc36HN-*TmEa*0CsD_j}e+3T0{sN?nak>e8HC%tr zpCs#_@it9Uw(I{P-l6{nUZNZcrakpfGrAZ`C-5O!fOC({7vCM0;D#kxLeFw&s#tI;y(j- zx0n6*xT8yd;QkpO15x7;#r(KP&G%aO{I1adhjh_5fAE86Un%z6qOjjVbYM5wuiIog zKiCx@4vRwcYOKaQhP)?#|U+N zPlGytYvAphAk4>Obn$R)+iou@7<7dQ8({%Trp*Q%qgXvcD{*l&P0lP4N;p|uz3NjxJ&rE<-_h&M|7g9Q{NLB> zjURf2(yyqaOamgm?X|2J2m15}M=XRN#Lvb0TR!n)md)PyC7RDiYGVUvgSZZAXYwT1p>fwyke-Pt=0@ z1u647;tqE4{h5oBHV2MAzGUP(1-blT%E0Lr9ikDUjThCQ2JNVAa8r;go*;+-^2`CU z9je{#UyM=cnp>=1!|K=cA94r?jNV=cd%;fY~BU|wZ8nZxd_|dwR;ocxCb!dvG z*=b-L){ETb9(|gl`aoPgG#2FN(xP6`XmIc^#ev^*#lbxz;vJ8Nii6)32PS_j4*oVI z-l5%~(R}}fLlMY^{wS2UlIS;vi_k{TAcGZ1)-Pz>x(rRiamT`*9BF&$WHoDEUx z6+JiwMnGC1MiW^U-ZmIB9GsiEk3jC#ySf0CYA4g&(6vwr&JGgY6p3ga#6d1QP;^s_ z(T;i0JZJmjI|Ns9X<95AEm=*<9l~5AV4dKu)hZY2Vp~{h#p)jAhH2O(T(ASGgcg@6 z4)Vp5jTC1N(ACt6wLFpyb+LX-k5%xL4&AevD6}F${=fo{r9cR8_2SqDi_1O4$cwGU zdG0|A^Avb#D8Qa2z%ppQ#=6*&?lTv}@V=KoSA&r2J3t`?FH*=AjShvG9>e(z(P%M? z*+Y%9*AsXVGwlx@P6TYvHH2A1U|*+Q-d`8HOYoqT4^r?r+|5IL7$8|I&YWd*V;@9d z7gsB=i}*jVd!7SAc&ir&AXa;dQ4Gj?uaFnRQ(*7>j@?EVJK24vs~h`79wFD)RUsvr z&@7iJ8l4N+KMAxI*x_rrU<85oT+z*PDea+z_Psar7+NY709}JXfHEaC&lX?Y*G(5Q zW3j<0a(RwIu4gh0^>)NcOoYgajm%_p$6(mAQ;a_H)HvthiZ)+-3;IDnOvt2z=1bvy zb3UM#fVQwA7IPSA>zzVd_EDfMp02@ zE#;AxVsCSeWSwX%UE~o#GZC$a(jnwXx5`b|Adxj%Z|I15nhKHKI(p26EaYs$DN)zS z?j?vmEp)`9qfoIq;?^k6(9g={NZ;1d8=|S&uwBHlp1l5|bdc^wj2WvLzm3DfXVCM7 z3piDUAlcMPo?$75^V>Nbd|nnZ*=;Ev%L%B&E7#%3yPt-jW*xmQ28-3?>;i&iSt#?E zrMLmkw=2#E4J05RKqI(3#4L~+No~bR(n{QEJ&q0x)DU>CCobmL_$1E|b0+&&;v5!T zejJH{y3fpzMbiVW$T2kmQ?|JJDt1;pxP^Yt*EL!Ib9xMu4eNpY&zeluv`?Fzybo{4 zLi&zalPQL>*3)A?6G}+-V|d!o>GF$~;vgFW3S9UZ2r7U;ya$=&S&EAl=dr(ud1cXs zwoVA7Ee*epoTL31g${H&Rsl0|ATE|aQmB6N#_y_^X7rT@s#}fPK#y5c#C)X|lka9} zDNf=Fon-_b_3s}0FHm`20I*h$}t>f={_|;ae4cmUyc!K#~Hy*oPaz;LF^se8Z+zz zxS8*!EBRiYV8KujXz7Ny5qwlj@8xU=Cb1QbBJ#7*D!crRKoQS3(8s4lZ>Ui~`h%$% zsPANheYF(ZA+f%$=r|vlG4v|>@Y~;;6+navm||=K4!=1d1pogQ-n`LP9dvnw6Fn`v z+m7_%bb4Cf21_r*<$hxluBxTZl5Uln_3SThuPyMpAM+cWgHFPaG0Q8Qn9s1@T8JSV zuO7vn>>cTPpm;k|7czyJT5+gMz&yc~3Rtv*96YLEI}Z83z?4YUtG>QRW=kjZbFi@B zT;<>%HgxGnBuTFuxhqj?^k$%fJB4NCh2M01wgf=&SAWh#l z4QdVR$}VPiy6Ok*6b%{_2J|R@vQ}?8IjE?_8}2O-!w*9UeiG#8x`?2;l7*KL4}TG_ z?X}d1j!UP2rKs{uqU3`hnFn$pGMid|SPOO1Z$Qt^gtd^?z+yG;8F&y25pfF34Mu)t zw~QZaIYpZQ45Gvc^x1t+Aer}GFBoL`VZ=91e_?_43a}8H&AR_rI3y}K3X7F+#0A0+ zK5BBzAgiH7_zjwF{-zZEba?#G1NeV$ZIGh1`O$I*6a5arz2X6N>WY+$v@AGKcj7%f zh~6Pj8V24s9`dnTk1&s{?)9Jv8o?SEEzql@`^9N;`eJCpPHCD`o>_?A6x**pNL&zMyHL1hQ(IR?V_~jrpQOf-F#NLWczn&J{)yG@*hQXRO z9;bN)GI58>5kZ)qnjKw?O61ZJXdbT7-K!GqZ;1BKPT0F1THD*7G4GOoZD%@dNBn>6 zy$f_x)!H_?RkS4IL|wi0`AYfzwi8G{Qnu}jKg4z zX3e?g{hf2Z@AE!$F6Td2wXeO|{Q5UCrxnGPn`KZ?@vrQBa`gwU_Awi7?uCAHGxn9& z!-q!h2a1~H-^d`_lf9BRiiT^AI0 z&Zp~stSx(gx@PtfbK9X#3PW!1LSy}OO^)+5@uYNroYQo~oK&!&$g2KWdr^-Uy5NO@ zcws(X_y{jl;)T2d{Vh55>&t&oo_#aPALD!YGiCM