From 990ff5f3950a933c10dc1f7a3f481b388bc9a47d Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 4 Mar 2025 16:36:59 +0000 Subject: [PATCH 01/32] Applying first 4 diffs using diff.diff --- lib/include/rocRoller/AssemblyKernel.hpp | 1 + lib/include/rocRoller/AssemblyKernel_impl.hpp | 9 + lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp | 38 ++- .../WidenTo64bit.cpp | 255 ++++++++++++++++++ 4 files changed, 293 insertions(+), 10 deletions(-) create mode 100644 lib/source/ExpressionTransformations/WidenTo64bit.cpp diff --git a/lib/include/rocRoller/AssemblyKernel.hpp b/lib/include/rocRoller/AssemblyKernel.hpp index ae5db25e..a945b1b9 100644 --- a/lib/include/rocRoller/AssemblyKernel.hpp +++ b/lib/include/rocRoller/AssemblyKernel.hpp @@ -128,6 +128,7 @@ namespace rocRoller * @param args Vector of CommandArgument pointers that should be added as arguments. */ void addCommandArguments(std::vector args); + void addNewCommandArguments(std::vector args); Expression::ExpressionPtr addCommandArgument(CommandArgumentPtr arg); std::string args_string(); diff --git a/lib/include/rocRoller/AssemblyKernel_impl.hpp b/lib/include/rocRoller/AssemblyKernel_impl.hpp index 5f721014..ec21f1f9 100644 --- a/lib/include/rocRoller/AssemblyKernel_impl.hpp +++ b/lib/include/rocRoller/AssemblyKernel_impl.hpp @@ -202,6 +202,15 @@ namespace rocRoller } } + inline void AssemblyKernel::addNewCommandArguments(std::vector args) + { + for(auto arg : args) + { + if(m_argumentNames.find(arg->name()) == m_argumentNames.end()) + addCommandArgument(arg); + } + } + inline Expression::ExpressionPtr AssemblyKernel::addCommandArgument(CommandArgumentPtr arg) { return addArgument({arg->name(), diff --git a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp index d4df2566..775498f3 100644 --- a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp +++ b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp @@ -1,5 +1,6 @@ #include #include +#include #include namespace rocRoller @@ -55,16 +56,33 @@ namespace rocRoller } else if(elementBits == 64u) { - co_yield_(Instruction("v_and_b32", - {dest->subset({0})}, - {lhs->subset({0}), rhs->subset({0})}, - {}, - "")); - co_yield_(Instruction("v_and_b32", - {dest->subset({1})}, - {lhs->subset({1}), rhs->subset({1})}, - {}, - "")); + if(lhs->regType() == Register::Type::Literal) + { + + Register::ValuePtr lsb; + Register::ValuePtr msb; + Arithmetic::get2LiteralDwords(lsb, msb, lhs); + + // subset() is not applicable to NoAllocation Literal type. + co_yield_(Instruction( + "v_and_b32", {dest->subset({0})}, {lsb, rhs->subset({0})}, {}, "")); + co_yield_(Instruction( + "v_and_b32", {dest->subset({1})}, {msb, rhs->subset({1})}, {}, "")); + } + else + { + + co_yield_(Instruction("v_and_b32", + {dest->subset({0})}, + {lhs->subset({0}), rhs->subset({0})}, + {}, + "")); + co_yield_(Instruction("v_and_b32", + {dest->subset({1})}, + {lhs->subset({1}), rhs->subset({1})}, + {}, + "")); + } } else { diff --git a/lib/source/ExpressionTransformations/WidenTo64bit.cpp b/lib/source/ExpressionTransformations/WidenTo64bit.cpp new file mode 100644 index 00000000..c9408769 --- /dev/null +++ b/lib/source/ExpressionTransformations/WidenTo64bit.cpp @@ -0,0 +1,255 @@ +#include +#include +#include + +template +constexpr auto cast_to_unsigned(T val) +{ + return static_cast::type>(val); +} + +namespace rocRoller +{ + namespace Expression + { + struct WidenTo64BitVisitor + { + + ExpressionPtr operator()(Convert const& expr) const + { + Convert cpy = expr; + if(expr.arg) + { + // Here is an assumption that call(cpy.arg) never goes above 64-bit and + // input convert's destination types are either int32, uint32, int64 or uint64. + // resultVaribleType(expr.arg) is not called intentionally as it will visit the + // subtree of expr.arg again. + // We can also import similar logic from ExpressionResultTypeVisitor and + // make the operator()(...) return a pair of ExpressionPtr and VariableType + // in order to avoid repeated visit of the subtree of cpy.arg. + cpy.arg = call(expr.arg); + if(expr.destinationType == DataType::UInt32) + return convert(DataType::UInt64, cpy.arg); + else if(expr.destinationType == DataType::Int32) + return convert(DataType::Int64, cpy.arg); + else if(expr.destinationType == DataType::UInt64 + || expr.destinationType == DataType::Int64) + return convert(expr.destinationType, cpy.arg); + else + { + AssertFatal( + false, + "Expected Destination type for a convert is either (u)int{32,64}"); + return nullptr; + } + } + + return std::make_shared(cpy); + } + + ExpressionPtr operator()(Negate const& expr) const + { + Negate cpy = expr; + if(expr.arg) + { + cpy.arg = call(expr.arg); + } + + return std::make_shared(cpy); + } + + template + ExpressionPtr operator()(Expr const& expr) const + { + // Only Convert and perhaps negates are expected. + AssertFatal(false, "Unexpected Unary expression", ShowValue(expr)); + return nullptr; + } + + ExpressionPtr operator()(Divide const& expr) const + { + // TODO: Add check that divisor is a CValue within (u)int32-bit + Log::debug("Divisor: {} ", toString(expr.rhs)); + + Divide cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + + return std::make_shared(cpy); + } + + ExpressionPtr operator()(Modulo const& expr) const + { + // TODO: Add check that divisor is a CValue within (u)int32-bit + Log::debug("Modulo: {} ", toString(expr.rhs)); + + Modulo cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + + return std::make_shared(cpy); + } + + // TODO: for some shifts, logical, subtract operations these may not be correct. + // Not sure yet if address calculation expressions with identity_transducer + // those types of Expression or not. + template + requires(CArithmetic) ExpressionPtr operator()(Expr const& expr) const + { + if constexpr(std::same_as) + { + AssertFatal(false, "Subtracts are not expected"); + return nullptr; + } + + if constexpr(CLogical) + { + AssertFatal(false, "logicals are not expected"); + return nullptr; + } + + Expr cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + if(expr.rhs) + cpy.rhs = call(expr.rhs); + + return std::make_shared(cpy); + } + + // Even with catch-all operator()(ExpressionPtr) without following, + // compilation fails. + template + requires(CBinary) ExpressionPtr operator()(Expr const& expr) const + { + AssertFatal(false, "Not expected expr : ", ShowValue(expr)); + return nullptr; + } + + template + requires(CArithmetic) ExpressionPtr operator()(Expr const& expr) const + { + Expr cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + if(expr.r1hs) + cpy.r1hs = call(expr.r1hs); + if(expr.r2hs) + cpy.r2hs = call(expr.r2hs); + + return std::make_shared(cpy); + } + + template + requires(CTernary) ExpressionPtr operator()(Expr const& expr) const + { + AssertFatal(false, "Not expected expr : ", ShowValue(expr)); + return nullptr; + } + + // leaves + ExpressionPtr operator()(CommandArgumentPtr const& expr) const + { + Log::debug("CommandArgumentPtr {}", toString(expr)); + + auto varType = expr->variableType(); + + assertIfNotExpectedType(varType.dataType, toString(expr)); + + CommandArgumentPtr cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + ExpressionPtr operator()(CommandArgumentValue const& expr) const + { + Log::debug("CommandArgumentValue {} Type {} ", + toString(expr), + toString(variableType(expr))); + + auto varType = variableType(expr); + + assertIfNotExpectedType(varType.dataType, toString(expr)); + + CommandArgumentValue cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + ExpressionPtr operator()(Register::ValuePtr const& expr) const + { + Log::debug("Register::ValuePtr {}", toString(expr)); + + auto varType = expr->variableType(); + + assertIfNotExpectedType(varType.dataType, toString(expr)); + + Register::ValuePtr cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const + { + Log::debug("AssemblyKernelArgumentPtr {} its expression is {}", + toString(expr), + toString(expr->expression)); + + auto varType = expr->variableType; + + assertIfNotExpectedType(varType.dataType, toString(expr)); + + AssemblyKernelArgumentPtr cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + // catch the rest CValue + template + ExpressionPtr operator()(Value const& expr) const + { + AssertFatal( + false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr)); + return nullptr; + } + + template + ExpressionPtr widenTo64(DataType srcType, T const& expr) const + { + if(srcType == DataType::UInt32) + return convert(DataType::UInt64, std::make_shared(expr)); + else if(srcType == DataType::Int32) + return convert(DataType::Int64, std::make_shared(expr)); + + return std::make_shared(expr); + } + + ExpressionPtr call(ExpressionPtr const& expr) const + { + return std::visit(*this, *expr); + } + + void assertIfNotExpectedType(DataType dt, std::string const& showValue) const + { + AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32 + || dt == DataType::UInt64 || dt == DataType::Int64, + "Unexpected DataType for Command/Kernel arguments or " + "workgroup/item indices ", + showValue); + } + }; + + ExpressionPtr widenTo64bit(ExpressionPtr expr) + { + auto origVarType = resultVariableType(expr); + + auto visitor = WidenTo64BitVisitor(); + auto widened = visitor.call(expr); + + auto finalVarType = resultVariableType(widened); + + AssertFatal(origVarType.dataType == finalVarType.dataType, + "Original and final data types should be the same", + ShowValue(origVarType.dataType), + ShowValue(finalVarType.dataType)); + + return widened; + } + } +} From 6f391efdbb4e53216fee43dee019ad9458e40c45 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 4 Mar 2025 18:11:36 +0000 Subject: [PATCH 02/32] Buildable minimal --- lib/CMakeLists.txt | 1 + lib/include/rocRoller/ExpressionTransformations.hpp | 10 ++++++++++ lib/source/ExpressionTransformations/WidenTo64bit.cpp | 7 +++++++ 3 files changed, 18 insertions(+) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 805b7c86..77e2dbe9 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -75,6 +75,7 @@ set(rocroller_src source/ExpressionTransformations/LowerPRNG.cpp source/ExpressionTransformations/RestoreCommandArguments.cpp source/ExpressionTransformations/Simplify.cpp + source/ExpressionTransformations/WidenTo64bit.cpp source/CodeGen/AddInstruction.cpp source/CodeGen/ArgumentLoader.cpp diff --git a/lib/include/rocRoller/ExpressionTransformations.hpp b/lib/include/rocRoller/ExpressionTransformations.hpp index e4a730ec..55a3a07f 100644 --- a/lib/include/rocRoller/ExpressionTransformations.hpp +++ b/lib/include/rocRoller/ExpressionTransformations.hpp @@ -135,5 +135,15 @@ namespace rocRoller * @return ExpressionPtr Transformed expression */ ExpressionPtr lowerBitfieldValues(ExpressionPtr expr); + + /** + * @brief Widen (u)int32 to (u)int64. + * + * Has many assumptions in input expr. See the implementation for details. + * + * @param expr Input expression + * @return ExpressionPtr Transformed expression + */ + ExpressionPtr widenTo64bit(ExpressionPtr expr); } } diff --git a/lib/source/ExpressionTransformations/WidenTo64bit.cpp b/lib/source/ExpressionTransformations/WidenTo64bit.cpp index c9408769..f43b07d4 100644 --- a/lib/source/ExpressionTransformations/WidenTo64bit.cpp +++ b/lib/source/ExpressionTransformations/WidenTo64bit.cpp @@ -209,6 +209,13 @@ namespace rocRoller return nullptr; } + ExpressionPtr operator()(Expression const& expr) const + { + AssertFatal( + false, "No expectation to meet this type of Expression: ", ShowValue(expr)); + return nullptr; + } + template ExpressionPtr widenTo64(DataType srcType, T const& expr) const { From 573b3b04045d772ec842a4b1bcbd9009ba365036 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 4 Mar 2025 18:49:00 +0000 Subject: [PATCH 03/32] Adding changes on CommandSolution - buildable --- lib/include/rocRoller/CommandSolution.hpp | 11 +++++++ lib/source/CommandSolution.cpp | 40 +++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/lib/include/rocRoller/CommandSolution.hpp b/lib/include/rocRoller/CommandSolution.hpp index 9321bc74..d5ecd1dc 100644 --- a/lib/include/rocRoller/CommandSolution.hpp +++ b/lib/include/rocRoller/CommandSolution.hpp @@ -167,6 +167,17 @@ namespace rocRoller */ void generateKernel(); + /** + * @brief Generates the kernel Graph by graph lowering and + * doesn't do code-generation. + */ + void generateKernelGraphOnlyAfterTransforms(); + + /** + * @brief Lower command arguments to kernel arguments. + */ + void lowerToKernelArguments(); + /** * @brief Assembles a generated kernel. Does not try to load * it. diff --git a/lib/source/CommandSolution.cpp b/lib/source/CommandSolution.cpp index de5aae79..679002f9 100644 --- a/lib/source/CommandSolution.cpp +++ b/lib/source/CommandSolution.cpp @@ -104,6 +104,8 @@ namespace rocRoller rv.reserve(m_context->kernel()->argumentSize(), argStructs.size()); + Log::debug("== getKernelArguments =="); + for(auto& arg : argStructs) { auto value = Expression::evaluate(arg.expression, args); @@ -120,6 +122,8 @@ namespace rocRoller arg.name)); } + Log::debug(" arg.name {} value {}", arg.name, toString(value)); + rv.append(arg.name, value); } @@ -136,11 +140,21 @@ namespace rocRoller auto const& workitems = m_context->kernel()->workitemCount(); if(workitems[0]) + { rv.workitemCount[0] = getUnsignedInt(evaluate(workitems[0], args)); + Log::debug("== getKernelInvocation =="); + Log::debug(" workitemCount[0] {}", rv.workitemCount[0]); + } if(workitems[1]) + { rv.workitemCount[1] = getUnsignedInt(evaluate(workitems[1], args)); + Log::debug(" workitemCount[1] {}", rv.workitemCount[1]); + } if(workitems[2]) + { rv.workitemCount[2] = getUnsignedInt(evaluate(workitems[2], args)); + Log::debug(" workitemCount[2] {}", rv.workitemCount[2]); + } auto const& sharedMem = m_context->kernel()->dynamicSharedMemBytes(); if(sharedMem) @@ -186,6 +200,15 @@ namespace rocRoller co_yield Instruction::Comment(m_command->argInfo()); } + void CommandKernel::lowerToKernelArguments() + { + for(auto arg : m_command->getArguments()) + { + Log::debug("command argument: {}, {}", arg->toString(), toString(arg->expression())); + } + m_context->kernel()->addNewCommandArguments(m_command->getArguments()); + } + void CommandKernel::generateKernelGraph(std::string name) { TIMER(t, "CommandKernel::generateKernelGraph"); @@ -362,6 +385,23 @@ namespace rocRoller generateKernelSource(); } else + { + Log::debug("generateKernel() is doing nothing"); + // Probably from a unit test. The context should contain + // scheduled instructions already. + } + } + + void CommandKernel::generateKernelGraphOnlyAfterTransforms() + { + TIMER(t, "CommandKernel::generateKernelGraphOnlyAfterTransforms()"); + + if(m_command) + { + // Only lower the KernelGraph and don't generate codes. + generateKernelGraph(m_name); + } + else { // Probably from a unit test. The context should contain // scheduled instructions already. From f18cfbff91389ee3cdc95cf8beb7153c011ff5bf Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 4 Mar 2025 18:56:19 +0000 Subject: [PATCH 04/32] Adding default <=> operator to GEMMProblem - buildable --- test/common/common/GEMMProblem.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/common/common/GEMMProblem.hpp b/test/common/common/GEMMProblem.hpp index 5d6fe552..764b9056 100644 --- a/test/common/common/GEMMProblem.hpp +++ b/test/common/common/GEMMProblem.hpp @@ -66,4 +66,6 @@ struct GEMMProblem rocRoller::Operations::ScaleMode scaleAMode = rocRoller::Operations::ScaleMode::None; rocRoller::Operations::ScaleMode scaleBMode = rocRoller::Operations::ScaleMode::None; + + auto operator<=>(GEMMProblem const& rhs) const = default; }; From b7baa4a6c7ed38bd4df1923d6c4676eed1b8d546 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 4 Mar 2025 22:02:06 +0000 Subject: [PATCH 05/32] Before adding AddressCalculationTest.cpp - buildable --- test/catch/CommandTest.cpp | 8 +- test/catch/IdentifyParallelDimensionsTest.cpp | 4 +- test/common/CommonGraphs.cpp | 239 ++++++++++++++++++ test/common/common/CommonGraphs.hpp | 24 +- test/common/common/CommonGraphs_impl.hpp | 2 + test/unit/KernelGraphTest.cpp | 6 +- .../unit/KernelGraphTest/UpdateParameters.cpp | 2 +- 7 files changed, 272 insertions(+), 13 deletions(-) diff --git a/test/catch/CommandTest.cpp b/test/catch/CommandTest.cpp index b2e721a0..b0bb4ae4 100644 --- a/test/catch/CommandTest.cpp +++ b/test/catch/CommandTest.cpp @@ -13,9 +13,9 @@ namespace CommandTest { SECTION("GEMM/TileAdd") { - auto example1 = *rocRollerTest::Graphs::GEMM().getCommand(); - auto example2 = *rocRollerTest::Graphs::GEMM().getCommand(); - auto example3 = *rocRollerTest::Graphs::GEMM().getCommand(); + auto example1 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand(); + auto example2 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand(); + auto example3 = *rocRollerTest::Graphs::GEMM(DataType::Half).getCommand(); auto example4 = *rocRollerTest::Graphs::TileDoubleAdd().getCommand(); CHECK(example1 == example2); @@ -33,7 +33,7 @@ namespace CommandTest { SECTION("GEMM") { - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); auto command0 = example.getCommand(); auto yaml = Command::toYAML(*command0); diff --git a/test/catch/IdentifyParallelDimensionsTest.cpp b/test/catch/IdentifyParallelDimensionsTest.cpp index 5c673572..25c1d378 100644 --- a/test/catch/IdentifyParallelDimensionsTest.cpp +++ b/test/catch/IdentifyParallelDimensionsTest.cpp @@ -48,7 +48,7 @@ TEST_CASE("identifyParallelDimensionSets works for GEMM", "[kernel-graph]") using namespace rocRoller; auto ctx = TestContext::ForDefaultTarget(); - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); auto kgraph = KernelGraph::translate(example.getCommand()); @@ -149,7 +149,7 @@ SCENARIO("IdentifyParallelDimensions transformation works for GEMM", "[kernel-gr using namespace rocRoller; auto ctx = TestContext::ForDefaultTarget(); - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); GIVEN("The initial kernel graph for a GEMM") { diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp index 338d1a10..294b37f4 100644 --- a/test/common/CommonGraphs.cpp +++ b/test/common/CommonGraphs.cpp @@ -169,4 +169,243 @@ namespace rocRollerTest::Graphs return params; } + GEMM::GEMM(DataType ta) + : GEMM(ta, ta) + { + } + GEMM::GEMM(DataType ta, DataType tb) + : GEMM(ta, tb, tb) + { + } + GEMM::GEMM(DataType ta, DataType tb, DataType tc) + : GEMM(ta, tb, tc, tc) + { + } + GEMM::GEMM(DataType ta, DataType tb, DataType tc, DataType td) + : m_ta(ta) + , m_tb(tb) + , m_tc(tc) + , m_td(td) + { + } + + void GEMM::createCommand() + { + m_command = std::make_shared(); + + std::vector oneStridesN + = m_problem.literalStrides ? std::vector({(size_t)1}) : std::vector({}); + + std::vector oneStridesT = m_problem.literalStrides + ? std::vector({(size_t)0, (size_t)1}) + : std::vector({}); + + m_tagTensorA = m_command->addOperation(rocRoller::Operations::Tensor( + 2, m_ta, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A + + m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorA)); + + m_tagTensorB = m_command->addOperation(rocRoller::Operations::Tensor( + 2, m_tb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B + m_tagB = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorB)); + + m_tagTensorC + = m_command->addOperation(rocRoller::Operations::Tensor(2, m_tc, oneStridesN)); // C + m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorC)); + + m_tagScalarAlpha + = m_command->addOperation(rocRoller::Operations::Scalar(DataType::Float)); // alpha + auto tagLoadAlpha + = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(m_tagScalarAlpha)); + + m_tagScalarBeta = m_command->addOperation(rocRoller::Operations::Scalar(m_tc)); // beta + auto tagLoadBeta = m_command->addOperation( + rocRoller::Operations::T_Load_Scalar(m_tagScalarBeta)); // beta + + auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B + + rocRoller::Operations::T_Execute execute(m_command->getNextTag()); + + auto tagBetaC + = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadBeta, m_tagC)); // beta * C + + auto tagAlphaAB + = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadAlpha, tagAB)); // alpha * (A * B) + + if(m_problem.betaInFma) + { + m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagBetaC, tagAlphaAB)); + // alpha * (A * B) + beta * C + } + else + { + m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagAlphaAB, tagBetaC)); + // alpha * (A * B) + beta * C + } + m_command->addOperation(std::move(execute)); + + m_tagTensorD + = m_command->addOperation(rocRoller::Operations::Tensor(2, m_td, oneStridesN)); // D + m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, m_tagTensorD)); // D + + if(m_problem.streamK) + { + m_tagNumWGs = m_command->allocateTag(); + auto numWGsArg = m_command->allocateArgument(DataType::UInt32, + m_tagNumWGs, + ArgumentType::Value, + DataDirection::ReadOnly, + rocRoller::NUMWGS); + } + + m_tagScratch = m_command->allocateTag(); + m_command->allocateArgument(VariableType(DataType::UInt32, PointerType::PointerGlobal), + m_tagScratch, + ArgumentType::Value, + DataDirection::ReadWrite, + rocRoller::SCRATCH); + } + + CommandPtr GEMM::getCommand() + { + if(!m_command) + createCommand(); + + return m_command; + } + + KernelGraph GEMM::getKernelGraph() + { + return rocRoller::KernelGraph::translate(getCommand()); + } + + void GEMM::setTileSize(int m, int n, int k) + { + m_problem.macM = m; + m_problem.macN = n; + m_problem.macK = k; + } + + void GEMM::setMFMA(int m, int n, int k, int b) + { + m_problem.waveM = m; + m_problem.waveN = n; + m_problem.waveK = k; + m_problem.waveB = b; + } + + void GEMM::setUseLDS(bool a, bool b, bool d) + { + m_problem.loadLDSA = a; + m_problem.loadLDSB = b; + m_problem.storeLDSD = d; + } + + void GEMM::setPrefetch(bool prefetch, + int prefetchInFlight, + int prefetchLDSFactor, + bool prefetchMixMemOps) + { + m_problem.prefetch = prefetch; + m_problem.prefetchInFlight = prefetchInFlight; + m_problem.prefetchLDSFactor = prefetchLDSFactor; + m_problem.prefetchMixMemOps = prefetchMixMemOps; + + m_problem.unrollK = prefetchInFlight; + } + + void GEMM::setProblem(GEMMProblem const& problem) + { + m_problem = problem; + } + + GEMMProblem const& GEMM::getProblem() const + { + return m_problem; + } + + CommandParametersPtr GEMM::getCommandParameters() const + { + using namespace rocRoller::KernelGraph::CoordinateGraph; + + auto params = std::make_shared(); + + params->setManualKernelDimension(2); + + AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0, + "Workgroup Size X must be multiply of wave front size"); + + uint wavetilePerWavefrontM + = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX; + uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY; + + AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0, + "WaveTile size mismatch (M)"); + AssertFatal(m_problem.macN % (m_problem.waveN * wavetilePerWavefrontN) == 0, + "WaveTile size mismatch (N)"); + + uint workgroupSizeX = m_problem.workgroupSizeX * m_problem.workgroupSizeY; + uint workgroupSizeY = 1; + params->setManualWorkgroupSize({workgroupSizeX, workgroupSizeY, 1}); + + auto macTileA + = MacroTile({m_problem.macM, m_problem.macK}, + LayoutType::MATRIX_A, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}, + m_problem.loadLDSA ? MemoryType::LDS : MemoryType::WAVE); + auto macTileB + = MacroTile({m_problem.macK, m_problem.macN}, + LayoutType::MATRIX_B, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}, + m_problem.loadLDSB ? MemoryType::LDS : MemoryType::WAVE); + auto macTileC + = MacroTile({m_problem.macM, m_problem.macN}, + LayoutType::MATRIX_ACCUMULATOR, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}); + auto macTileD + = MacroTile({m_problem.macM, m_problem.macN}, + LayoutType::MATRIX_ACCUMULATOR, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}, + m_problem.storeLDSD ? MemoryType::LDS : MemoryType::WAVE); + + params->setDimensionInfo(m_tagA, macTileA); + params->setDimensionInfo(m_tagB, macTileB); + params->setDimensionInfo(m_tagC, macTileC); + params->setDimensionInfo(m_tagD, macTileD); + + // uint jammedM + // = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX; + // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY; + + Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1); + // Log::debug("GEMM jamming {} {}", jammedM, jammedN); + // params->setWaveTilesPerWavefront(jammedM, jammedN); + + params->setManualWavefrontCount( + {static_cast(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM), + static_cast(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)}); + + params->fuseLoops = m_problem.fuseLoops; + params->tailLoops = m_problem.tailLoops; + params->allowAmbiguousMemoryNodes = m_problem.allowAmbiguousMemoryNodes; + params->unrollK = m_problem.unrollK; + params->packMultipleElementsInto1VGPR = m_problem.packMultipleElementsInto1VGPR; + params->prefetch = m_problem.prefetch; + params->prefetchInFlight = m_problem.prefetchInFlight; + params->prefetchLDSFactor = m_problem.prefetchLDSFactor; + params->prefetchMixMemOps = m_problem.prefetchMixMemOps; + params->transposeMemoryAccess[LayoutType::MATRIX_A] = m_problem.transA == "T"; + params->transposeMemoryAccess[LayoutType::MATRIX_B] = m_problem.transB == "T"; + params->transposeMemoryAccess[LayoutType::None] = true; + + if(m_problem.streamK) + { + params->loopOverOutputTilesDimensions = {0, 1}; + params->streamK = true; + params->streamKTwoTile = m_problem.streamKTwoTile; + } + + return params; + } + } diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp index 5dd94935..7485c5cc 100644 --- a/test/common/common/CommonGraphs.hpp +++ b/test/common/common/CommonGraphs.hpp @@ -14,6 +14,8 @@ #include #include +#include + namespace rocRollerTest { namespace Graphs @@ -26,6 +28,7 @@ namespace rocRollerTest using ContextPtr = rocRoller::ContextPtr; using KernelArguments = rocRoller::KernelArguments; using KernelGraph = rocRoller::KernelGraph::KernelGraph; + using DataType = rocRoller::DataType; /** * @brief Graph for linear: alpha x + beta y. @@ -152,11 +155,13 @@ namespace rocRollerTest * - Assign(D = alpha * AB + beta * C) * - StoreTiled(D) */ - template class GEMM { public: - GEMM(); + GEMM(DataType ta); + GEMM(DataType ta, DataType tb); + GEMM(DataType ta, DataType tb, DataType tc); + GEMM(DataType ta, DataType tb, DataType tc, DataType td); CommandPtr getCommand(); KernelGraph getKernelGraph(); @@ -164,9 +169,21 @@ namespace rocRollerTest void setTileSize(int m, int n, int k); void setMFMA(int m, int n, int k, int b); void setUseLDS(bool a, bool b, bool d); + void setPrefetch(bool prefetch, + int prefetchInFlight, + int prefetchLDSFactor, + bool prefetchMixMemOps); + void setProblem(GEMMProblem const& problem); + GEMMProblem const& getProblem() const; CommandParametersPtr getCommandParameters() const; + rocRoller::Operations::OperationTag m_tagTensorA, m_tagTensorB, m_tagTensorC, + m_tagTensorD, m_tagScalarAlpha, m_tagScalarBeta, m_tagScalarSeed, m_tagScratch; + rocRoller::Operations::OperationTag m_tagNumWGs; + + DataType m_ta, m_tb, m_tc, m_td; + private: void createCommand(); @@ -176,7 +193,8 @@ namespace rocRollerTest rocRoller::Operations::OperationTag m_tagA, m_tagB, m_tagC, m_tagD; - CommandPtr m_command; + CommandPtr m_command; + GEMMProblem m_problem; }; /** diff --git a/test/common/common/CommonGraphs_impl.hpp b/test/common/common/CommonGraphs_impl.hpp index 22776c4f..a6232cc8 100644 --- a/test/common/common/CommonGraphs_impl.hpp +++ b/test/common/common/CommonGraphs_impl.hpp @@ -231,6 +231,7 @@ namespace rocRollerTest::Graphs return rocRoller::KernelGraph::translate(m_command); } +#if 0 /* * GEMM */ @@ -356,6 +357,7 @@ namespace rocRollerTest::Graphs return params; } +#endif /* * TileDoubleAdd diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp index 30e60e3b..e3c28ee4 100644 --- a/test/unit/KernelGraphTest.cpp +++ b/test/unit/KernelGraphTest.cpp @@ -1060,7 +1060,7 @@ namespace KernelGraphTest TEST_F(KernelGraphTest, LowerTensor) { - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); int macK = 16; int waveK = 8; @@ -1152,7 +1152,7 @@ namespace KernelGraphTest TEST_F(KernelGraphTest, InlineIncrement) { - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); example.setTileSize(128, 256, 8); example.setMFMA(32, 32, 2, 1); @@ -2704,7 +2704,7 @@ namespace KernelGraphTest { using GD = Graph::Direction; - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); example.setTileSize(128, 256, 8); example.setMFMA(32, 32, 2, 1); diff --git a/test/unit/KernelGraphTest/UpdateParameters.cpp b/test/unit/KernelGraphTest/UpdateParameters.cpp index 80e78b8d..cc7e6a98 100644 --- a/test/unit/KernelGraphTest/UpdateParameters.cpp +++ b/test/unit/KernelGraphTest/UpdateParameters.cpp @@ -62,7 +62,7 @@ namespace KernelGraphTest { using namespace rocRoller::KernelGraph; - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); int macK = 16; int waveK = 8; From 409496e7991060cbfad818cc301628c310511e7d Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 4 Mar 2025 22:27:37 +0000 Subject: [PATCH 06/32] Adding AddressCalculationTest.cpp - buildable/testable --- test/CMakeLists.txt | 1 + test/catch/AddressCalculationTest.cpp | 900 ++++++++++++++++++++++++++ 2 files changed, 901 insertions(+) create mode 100644 test/catch/AddressCalculationTest.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0000cecd..2caf0905 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -168,6 +168,7 @@ set( catch/CustomAssertions.cpp catch/TestKernels.cpp + catch/AddressCalculationTest.cpp catch/BinaryExpressionTest.cpp catch/CommandArgumentValueTest.cpp catch/CommandTest.cpp diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp new file mode 100644 index 00000000..c60e6494 --- /dev/null +++ b/test/catch/AddressCalculationTest.cpp @@ -0,0 +1,900 @@ + +#include "CustomMatchers.hpp" +#include "TestContext.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace rocRoller; + +namespace AddressCalculationTest +{ + namespace KernelG = rocRoller::KernelGraph; + namespace ControlG = rocRoller::KernelGraph::ControlGraph; + namespace CoordG = rocRoller::KernelGraph::CoordinateGraph; + using KernelGraphType = typename rocRoller::KernelGraph::KernelGraph; + + class AddressTrace + { + public: + AddressTrace(KernelGraphType const& graph, ContextPtr ctx) + : m_kGraph(graph) + , m_context(ctx){}; + std::vector traceComputeIndexWithBuffer(); + + private: + KernelGraphType m_kGraph; + ContextPtr m_context; + }; + + std::vector AddressTrace::traceComputeIndexWithBuffer() + { + auto isComputeIndex = [&](int tag) { + return isOperation(this->m_kGraph.control.getElement(tag)); + }; + + std::vector rv; + auto root = m_kGraph.control.roots().only(); + int count = 0; + for(auto ciTag : filter(isComputeIndex, m_kGraph.control.depthFirstVisit(root.value()))) + { + + auto maybeCi = m_kGraph.control.get(ciTag); + AssertFatal(maybeCi.has_value()); + auto ci = maybeCi.value(); + + auto buffer + = m_kGraph.mapper.get(ciTag, + KernelG::Connections::ComputeIndex{ + KernelG::Connections::ComputeIndexArgument::BUFFER}); + if(buffer == -1) + continue; + + auto base = m_kGraph.mapper.get(ciTag, + KernelG::Connections::ComputeIndex{ + KernelG::Connections::ComputeIndexArgument::BASE}); + + // Currently, only base < 0 case is being covered. + if(base >= 0) + continue; + + { + // Debugging log + if(m_context->kernel()) + Log::debug("kernel is non-null \n"); + + auto const& kernelWorkgroupIndices = m_context->kernel()->workgroupIndex(); + Log::debug("Size of kernelWorkGroupIndices: {}", kernelWorkgroupIndices.size()); + } + + auto offset + = m_kGraph.mapper.get(ciTag, + KernelG::Connections::ComputeIndex{ + KernelG::Connections::ComputeIndexArgument::OFFSET}); + auto stride + = m_kGraph.mapper.get(ciTag, + KernelG::Connections::ComputeIndex{ + KernelG::Connections::ComputeIndexArgument::STRIDE}); + auto target + = m_kGraph.mapper.get(ciTag, + KernelG::Connections::ComputeIndex{ + KernelG::Connections::ComputeIndexArgument::TARGET}); + auto increment + = m_kGraph.mapper.get(ciTag, + KernelG::Connections::ComputeIndex{ + KernelG::Connections::ComputeIndexArgument::INCREMENT}); + + // Note that identity_transduer is intentionally being used here in place of FastArithmetic. + // It is alright to use FastArithmetic here but + // fastArithmetic is being used eventually when the expressions are generated. + auto identity_transducer = [&](auto expr) { return expr; }; + auto coords = CoordG::Transformer( + std::make_shared( + m_kGraph.coordinates), + m_context, + identity_transducer); + + auto fullStop = [&](int tag) { return tag == increment; }; + auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream; + auto [required, path] = findRequiredCoordinates(target, direction, fullStop, m_kGraph); + + for(auto tag : required) + if((tag != increment) && (!coords.hasCoordinate(tag))) + coords.setCoordinate(tag, Expression::literal(0u)); + + // Set the increment coordinate to zero if it doesn't + // already have a value + bool initializeIncrement = !coords.hasPath({target}, ci.forward); + if(initializeIncrement) + { + coords.setCoordinate(increment, Expression::literal(0u)); + } + + // Compute an offset address if we don't have an + // associated base address to inherit from + { + // base < 0 by the time control reacheds here. + auto indexExpr + = ci.forward ? coords.forward({target})[0] : coords.reverse({target})[0]; + + rv.push_back(indexExpr); + + // Rests are for logging for debugging. + Log::debug("ci.forward for tag {} dir {}, stride {}, buffer {}", + ciTag, + ci.forward, + stride, + buffer); + Log::debug("IndexExpr base < 0 for tag {} in the original graph {}", + ciTag, + toString(indexExpr)); + + // Buffer Descriptor's base + auto user = m_kGraph.coordinates.get(target); + if(user) + { + Log::debug("User for tag {} {}, offset {}", + ciTag, + ShowValue(target), + (user->offset ? "yes" : "no")); + Log::debug("argument name {}, real name {}", + user->argumentName, + m_context->kernel()->findArgument(user->argumentName).name); + + // 1. user->argumentName + user->offset if any --> set to base of BufferDesc + // (Expression) + // 2. user->size --> set to size field (32-bit) of BuferDesc + } + } + // book-keeping for debugging purpose + count++; + } + + Log::debug("Count of computeIndex investigated: {}", count); + + return rv; + } + + struct AddressCalculationTest + { + /** + * gemmGraph should be a graph initialized by prob + */ + AddressCalculationTest(rocRoller::ContextPtr context, + GEMMProblem const& prob, + rocRollerTest::Graphs::GEMM gemmGraph) + : m_context(context) + , m_problem(prob) + , m_gemmGraph(gemmGraph) + { + } + + bool check_uint32_overflow(uint a, uint b) + { + uint64_t prod = static_cast(a) * b; + return prod > static_cast(std::numeric_limits::max()); + } + + std::pair, uint> + getWorkItemCount(CommandParametersPtr params, GEMMProblem const& problem) + { + int M = problem.m; + int N = problem.n; + int K = problem.k; + + AssertFatal(M > 0 && N > 0 && K > 0); + + auto workGroupSizes = params->getManualWorkgroupSize(); + + AssertFatal(workGroupSizes.has_value()); + auto workgroupSizeX = workGroupSizes.value()[0]; + auto workgroupSizeY = workGroupSizes.value()[1]; + + // compute NumWorkGroups + uint numWorkgroupX; + uint numWorkgroupY; + + if(problem.loopOverTiles > 0) + { + // multiple output macro tiles per workgroup + numWorkgroupX = M * N / problem.macM / problem.macN / 2; + numWorkgroupY = 1; + } + else if(problem.streamK) + { + numWorkgroupX = problem.numWGs; + numWorkgroupY = 1; + } + else + { + // one output macro tile per workgroup + numWorkgroupX = M / problem.macM; + numWorkgroupY = N / problem.macN; + } + + AssertFatal(!check_uint32_overflow(numWorkgroupX, workgroupSizeX)); + AssertFatal(!check_uint32_overflow(numWorkgroupY, workgroupSizeY)); + + auto NX_literal = numWorkgroupX * workgroupSizeX; + auto NY_literal = numWorkgroupY * workgroupSizeY; + + auto NX = std::make_shared(NX_literal); + auto NY = std::make_shared(NY_literal); + auto NZ = std::make_shared(1u); + + auto totalWorkitemCounts = NX_literal * NY_literal; + { + Log::debug("Calculated workitemcount[0]: {}", toString(NX)); + Log::debug("Calculated workitemcount[1]: {}", toString(NY)); + Log::debug("Calculated workitemcount[2]: {}", toString(NZ)); + Log::debug("totalWorkitemCounts: {}", totalWorkitemCounts); + } + + return {{NX, NY, NZ}, totalWorkitemCounts}; + } + + static Expression::ExpressionPtr + get64BitVectorOffset(ContextPtr context, + std::array const& workitemCount, + std::array const& workgroupSize) + { + std::array thread_index; + for(int i = 0; i < 3; i++) + thread_index[i] = std::make_shared( + context->kernel()->workitemIndex()[i]); + + std::array workgroup_index; + for(int i = 0; i < 3; i++) + workgroup_index[i] = std::make_shared( + context->kernel()->workgroupIndex()[i]); + + std::array workgroup_size; + for(int i = 0; i < 3; i++) + workgroup_size[i] = std::make_shared( + Register::Value::Literal(workgroupSize[i])); + + auto idx_x = thread_index[0] + workgroup_index[0] * workgroup_size[0]; + auto idx_y = thread_index[1] + workgroup_index[1] * workgroup_size[1]; + + auto compare_res_pointer = idx_x + idx_y * workitemCount[0]; + auto elementSize = std::make_shared( + Register::Value::Literal(sizeof(uint64_t))); + compare_res_pointer = compare_res_pointer * elementSize; + + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + return compare_res_pointer; + } + + // This is for printing out workgroup index and thread index. + auto kb_sanity_indices(ContextPtr context, + std::array const& workitemCount, + std::array const& workgroupSize) + { + return [context, workitemCount, workgroupSize]() -> Generator { + // store base addr + Register::ValuePtr s_ptr; + co_yield context->argLoader()->getValue("rv_ptr", s_ptr); + + Register::ValuePtr s_ptr2; + co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); + + auto compare_res_pointer + = get64BitVectorOffset(context, workitemCount, workgroupSize); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset_1 = nullptr; + co_yield Expression::generate( + v_offset_1, compare_res_pointer + s_ptr->expression(), context); + + Register::ValuePtr v_offset_2 = nullptr; + co_yield Expression::generate( + v_offset_2, compare_res_pointer + s_ptr2->expression(), context); + + // workgroupIndex x + auto v_wg_x = Register::Value::Placeholder( + context, Register::Type::Vector, DataType::UInt32, 1); + co_yield context->copier()->copy( + v_wg_x, context->kernel()->workgroupIndex()[0], "copy wgi.x to v"); + co_yield context->mem()->storeGlobal(v_offset_1, v_wg_x, 0, 4); + + co_yield context->mem()->storeGlobal( + v_offset_2, (context->kernel()->workitemIndex())[0], 0, 4); + }; + } + + // Just print out passed expressions to device memory. + // If passed expressions are workitemCounts, the expectation is that + // the generated expressions' values are the same with the host-side computed values. + auto kb_implicit_workitemcount(ContextPtr context, + Expression::ExpressionPtr const& workitemcount_X, + Expression::ExpressionPtr const& workitemcount_Y, + std::array const& workgroupSize) + { + return [context, + workitemcount_X, + workitemcount_Y, + workgroupSize]() -> Generator { + // store base addrs + Register::ValuePtr s_ptr; + co_yield context->argLoader()->getValue("rv_ptr", s_ptr); + Register::ValuePtr s_ptr2; + co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); + auto compare_res_pointer = get64BitVectorOffset( + context, {workitemcount_X, workitemcount_Y}, workgroupSize); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset_1 = nullptr; + co_yield Expression::generate( + v_offset_1, compare_res_pointer + s_ptr->expression(), context); + + Register::ValuePtr v_offset_2 = nullptr; + co_yield Expression::generate( + v_offset_2, compare_res_pointer + s_ptr2->expression(), context); + + // Compute the value 1 + Register::ValuePtr s_value_1 = nullptr; + co_yield Expression::generate(s_value_1, workitemcount_X, context); + auto v_value_11 = Register::Value::Placeholder( + context, Register::Type::Vector, DataType::Int64, 1); + co_yield context->copier()->copy(v_value_11, s_value_1, "copy to v1"); + + // Compute the value 2 + Register::ValuePtr s_value_2 = nullptr; + co_yield Expression::generate(s_value_2, workitemcount_Y, context); + auto v_value_22 = Register::Value::Placeholder( + context, Register::Type::Vector, DataType::Int64, 1); + co_yield context->copier()->copy(v_value_22, s_value_2, "copy to v2"); + + co_yield context->mem()->storeGlobal(v_offset_1, v_value_11, 0, 8); + co_yield context->mem()->storeGlobal(v_offset_2, v_value_22, 0, 8); + }; + } + + auto kb_equal(ContextPtr context, + const std::vector& input, + const std::vector& widenedInput, + std::array const& workitemCount, + std::array const& workgroupSize) + { + auto allone_uint64 + = std::make_shared(static_cast(0xFFFFFFFF)); + + return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]() + -> Generator { + // store base addr + Register::ValuePtr s_ptr; + + co_yield context->argLoader()->getValue("rv_ptr", s_ptr); + + Register::ValuePtr s_ptr2; + + co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); + + // 2-D + auto compare_res_pointer + = get64BitVectorOffset(context, workitemCount, workgroupSize); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset = nullptr; + co_yield Expression::generate( + v_offset, compare_res_pointer + s_ptr->expression(), context); + + // may not be needed. + //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug"))); + + // boolean_diff was allocated to s[0:1] + // diff should be computed per lane, + // but is_zero_diff is actually one-bit + auto boolean_true = Register::Value::WavefrontPlaceholder(context); + Register::ValuePtr v_allone; + co_yield Expression::generate(v_allone, allone_uint64, context); + + co_yield context->copier()->copy( + boolean_true, v_allone, "set to true for all lanes"); + + Register::ValuePtr temp_res; + for(int i = 0, size = input.size(); i < size; i++) + { + // Compute the value 1 + Register::ValuePtr v_value_1 = nullptr; + co_yield Expression::generate(v_value_1, input[i], context); + + // Compute the value 2 + Register::ValuePtr v_value_2 = nullptr; + co_yield Expression::generate(v_value_2, widenedInput[i], context); + + // Compute diff (value_1 - value_2) + auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); + { + auto boolType = resultVariableType(is_zero_diff).dataType; + AssertFatal( + boolType == DataType::Bool64, "is_zero type {}", toString(boolType)); + } + + // boolean_true = boolean_true | is_zero_diff + auto accumRes = std::make_shared( + Expression::BitwiseAnd{boolean_true->expression(), is_zero_diff, "accum"}); + { + auto accumType = resultVariableType(accumRes).dataType; + AssertFatal( + accumType == DataType::Bool64, "accum type {}", toString(accumType)); + } + co_yield Expression::generate(boolean_true, accumRes, context); + } + + auto v_value = Register::Value::Placeholder( + context, Register::Type::Vector, DataType::UInt64, 1); + co_yield context->copier()->copy(v_value, boolean_true, "Move value"); + co_yield context->mem()->storeGlobal(v_offset, v_value, 0, 8); + }; + } + + // For now, mainly for debugging, directly copy the two results values. + // workitemCount is passed as argument. Since it is 64-bit + // different global_store_dwordx2 format should be used. + auto kb_equal_one(ContextPtr context, + Expression::ExpressionPtr const& input, + Expression::ExpressionPtr const& widenedInput, + std::array const& workitemCount, + std::array const& workgroupSize) + { + return [context, input, widenedInput, workitemCount, workgroupSize]() + -> Generator { + // store base addr + Register::ValuePtr s_ptr; + co_yield context->argLoader()->getValue("rv_ptr", s_ptr); + + Register::ValuePtr s_ptr2; + co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); + + auto compare_res_pointer + = get64BitVectorOffset(context, workitemCount, workgroupSize); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset_1 = nullptr; + co_yield Expression::generate( + v_offset_1, compare_res_pointer + s_ptr->expression(), context); + + Register::ValuePtr v_offset_2 = nullptr; + co_yield Expression::generate( + v_offset_2, compare_res_pointer + s_ptr2->expression(), context); + + // Compute the value 1 + Register::ValuePtr v_value_1 = nullptr; + co_yield Expression::generate(v_value_1, input, context); + + // Compute the value 2 + Register::ValuePtr v_value_2 = nullptr; + co_yield Expression::generate(v_value_2, widenedInput, context); + + co_yield context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8); + co_yield context->mem()->storeGlobal(v_offset_2, v_value_2, 0, 8); + }; + } + + void setTensorArguments(CommandArguments& commandArgs, + GEMMProblem const& problem, + rocRollerTest::Graphs::GEMM const& gemm, + CommandKernelPtr const& commandKernelPtr) const + { + // calling setArgument - needed + TensorDescriptor descA( + gemm.m_ta, {size_t(problem.m), size_t(problem.k)}, problem.transA); + TensorDescriptor descB( + gemm.m_tb, {size_t(problem.k), size_t(problem.n)}, problem.transB); + TensorDescriptor descC(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N"); + TensorDescriptor descD(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N"); + + // Note that actually large matrix hipMalloc is not needed. + // But at the same time, larger malloc may incurr larger base address, which can lead to overflow. + // ?? how to get float from m_dt? gemm.m_ta? + auto deviceA = make_shared_device(); + auto deviceB = make_shared_device(); + auto deviceC = make_shared_device(); + auto deviceD = make_shared_device(); + + // Note that gemm built the CommandGraph, and store OperationTags. We are reusing + // that original Command, but getting away building KernelGraph and its lowering. + setCommandTensorArg(commandArgs, gemm.m_tagTensorA, descA, deviceA.get()); + setCommandTensorArg(commandArgs, gemm.m_tagTensorB, descB, deviceB.get()); + setCommandTensorArg(commandArgs, gemm.m_tagTensorC, descC, deviceC.get()); + setCommandTensorArg(commandArgs, gemm.m_tagTensorD, descD, deviceD.get()); + + commandArgs.setArgument(gemm.m_tagScalarAlpha, ArgumentType::Value, problem.alpha); + commandArgs.setArgument(gemm.m_tagScalarBeta, ArgumentType::Value, problem.beta); + // seed doesn't seem to be relevant currently. + //if(seed.has_value()) + // commandArgs.setArgument(gemm.m_tagScalarSeed, ArgumentType::Value, seed.value()); + + // Create scratch space + if(problem.streamK) + { + commandArgs.setArgument(gemm.m_tagNumWGs, ArgumentType::Value, problem.numWGs); + } + + // ?? Is it correct to use original commandKernel or addrTestCommandKernel + auto scratchSpaceRequired + = commandKernelPtr->scratchSpaceRequired(commandArgs.runtimeArguments()); + auto deviceScratch = make_shared_device(scratchSpaceRequired, 0); + commandArgs.setArgument(gemm.m_tagScratch, ArgumentType::Value, deviceScratch.get()); + } + + void generateOrigKernelByProlog() + { + m_commandKernel = std::make_shared(m_gemmGraph.getCommand(), ""); + m_commandKernel->setContext(m_context); + m_commandKernel->setCommandParameters(m_gemmGraph.getCommandParameters()); + + Log::debug("lazyAddArguments: {}", m_context->kernelOptions().lazyAddArguments); + + // Add extra kargs for testing + m_rvTag = m_commandKernel->getCommand()->allocateTag(); + m_commandKernel->getCommand()->allocateArgument( + VariableType(DataType::UInt64, PointerType::PointerGlobal), + m_rvTag, + ArgumentType::Value, + DataDirection::WriteOnly, + "rv_ptr"); + + m_rvTag2 = m_commandKernel->getCommand()->allocateTag(); + m_commandKernel->getCommand()->allocateArgument( + VariableType(DataType::UInt64, PointerType::PointerGlobal), + m_rvTag2, + ArgumentType::Value, + DataDirection::WriteOnly, + "rv_ptr2"); + + m_commandKernel->generateKernelGraphOnlyAfterTransforms(); + + auto k = m_context->kernel(); + m_context->schedule(k->preamble()); + { + { // Original Command Arguments + auto commandArguments = m_commandKernel->getCommand()->getArguments(); + for(auto arg : commandArguments) + Log::debug("Original Arg: {}", arg->toString()); + } + + { // workitemcount + for(auto wit : k->workitemCount()) + Log::debug("Original Workitem expr: {}", toString(wit)); + } + } + + m_commandKernel->lowerToKernelArguments(); + auto commandParameters = m_commandKernel->getCommandParameters(); + auto workgroupSize = commandParameters->getManualWorkgroupSize(); + CHECK(workgroupSize.has_value()); + + { // workitemcount + for(auto wit : k->workitemCount()) + Log::debug("== Original Workitem expr: {}", toString(wit)); + } + + m_context->schedule(k->prolog()); + } + + void launchKernelAndCopyBackToHost() + { + // Setting workitem counts before launching + // Notice that addrTestCommandKernel's generateKernel() won't be called. + auto launch = std::make_shared(); + + // See the comments down there. Computation of workitemCount is redundant. + // "totalWorkitemCounts" is for debugging or logging. + std::tie(m_workitemCount, m_totalWorkitemCount) + = getWorkItemCount(m_commandKernel->getCommandParameters(), m_problem); + + // Notice that without following line of "setManualWorkitemCount". + // launch->setManualWorkitemCount(workitemCount); + // - workitemcounts were computed correctly + // within the workitemcount_kernelbody's execution. That, I think, is because + // The original graph already had workitemcount expressions as a function of + // input tensor sizes, one of the commandArguments. (e.g. Tensor_0_size_0_8) + // Thus, only the command arguments are set, workitemcounts can be computed. + // In ohter words, this manual computation of workitemcounts and setting by + // setManualWorkitemCount is not needed. Still, "getWorkItemCount" is kept + // for debugging purposes. In order to allocate device and host memory for storing + // results of kernel's execution, we still need a concrete number of allocation sizes. + + m_commandKernel->setLaunchParameters(launch); + + // What is the right check here for GPU?? + //if (isLocalDevice()) { + // addrTestCommandKernel reused original Command, for which OperationTags are stored + // in gemm, a GEMM object. + CommandArguments commandArgs = m_commandKernel->getCommand()->createArguments(); + setTensorArguments(commandArgs, m_problem, m_gemmGraph, m_commandKernel); + + auto outputSize = m_totalWorkitemCount; + auto rvPointer = make_shared_device(outputSize, 0); + commandArgs.setArgument(m_rvTag, ArgumentType::Value, rvPointer.get()); + + auto rvPointer2 = make_shared_device(outputSize, 0); + commandArgs.setArgument(m_rvTag2, ArgumentType::Value, rvPointer2.get()); + + m_commandKernel->launchKernel(commandArgs.runtimeArguments()); + + m_hostBuffer.resize(outputSize, 10); + CHECK_THAT(hipMemcpy(m_hostBuffer.data(), + rvPointer.get(), + sizeof(uint64_t) * outputSize, + hipMemcpyDefault), + HasHipSuccess(0)); + + m_hostBuffer2.resize(outputSize, 20); + CHECK_THAT(hipMemcpy(m_hostBuffer2.data(), + rvPointer2.get(), + sizeof(uint64_t) * outputSize, + hipMemcpyDefault), + HasHipSuccess(0)); + + // check hostBuffer's value + Log::debug("outputSize: {}", outputSize); + } + + void test_implicit_workitemcount() + { + generateOrigKernelByProlog(); + + auto k = m_context->kernel(); + m_context->schedule(kb_implicit_workitemcount( + m_context, + k->workitemCount()[0], + k->workitemCount()[1], + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + + m_context->schedule(k->postamble()); + m_context->schedule(k->amdgpu_metadata()); + + launchKernelAndCopyBackToHost(); + + // Remove ":" and subsequent parts to extract only leading literals. + auto const& host_x_string = toString(m_workitemCount[0]); + size_t del1 = host_x_string.find_first_of(":"); + auto const& host_x = host_x_string.substr(0, del1); + + auto const& host_y_string = toString(m_workitemCount[1]); + del1 = host_y_string.find_first_of(":"); + auto const& host_y = host_y_string.substr(0, del1); + + for(int i = 0, size = m_hostBuffer.size(); i < m_totalWorkitemCount; i++) + { + // For 128 by 128 output matrix, workgroupCount computed is {512, 2, 1} + if(toString(m_hostBuffer[i]) != host_x || toString(m_hostBuffer2[i]) != host_y) + { + std::cout + << "workitemCount.x and workitemCount.y in kernel for global workitem " << i + << ": " << m_hostBuffer[i] << ", " << m_hostBuffer2[i] << "\n"; + std::cout << "workitemCount.x and workitemCount.y in host for global workitem " + << i << ": " << host_x << ", " << host_y << "\n"; + } + CHECK(toString(m_hostBuffer[i]) == host_x); + CHECK(toString(m_hostBuffer2[i]) == host_y); + } + } + + void test_sanity_indices() + { + generateOrigKernelByProlog(); + + auto k = m_context->kernel(); + + m_context->schedule(kb_sanity_indices( + m_context, + k->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + + m_context->schedule(k->postamble()); + m_context->schedule(k->amdgpu_metadata()); + + launchKernelAndCopyBackToHost(); + + for(int i = 0; i < m_totalWorkitemCount; i++) + Log::debug("wgx {} thx {}", m_hostBuffer[i], m_hostBuffer2[i]); + + // Remove ":" and subsequent parts to extract only leading literals. + auto const& host_x_string = toString(m_workitemCount[0]); + size_t del1 = host_x_string.find_first_of(":"); + auto const& host_x = host_x_string.substr(0, del1); + auto workitemcount_x = std::stoi(host_x); + + auto const& host_y_string = toString(m_workitemCount[1]); + del1 = host_y_string.find_first_of(":"); + auto const& host_y = host_y_string.substr(0, del1); + auto workitemcount_y = std::stoi(host_y); + + Log::debug("workitemcount_x {} workitemcount_y {}", workitemcount_x, workitemcount_y); + + auto workgroupsize_x + = ((m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())[0]; + for(int rows = 0, numRows = workitemcount_y; rows < numRows; rows++) + { + auto blockIdx_y = rows % workgroupsize_x; + auto base = rows * workitemcount_x; + for(int cols = 0, numCols = workitemcount_x; cols < numCols; cols++) + { + auto blockIdx_x = cols / workgroupsize_x; + auto threadIdx_x = cols % workgroupsize_x; + + auto linearIdx = cols + base; + + CHECK(m_hostBuffer[linearIdx] == blockIdx_x); + CHECK(m_hostBuffer2[linearIdx] == threadIdx_x); + } + } + } + + void test_equal_one() + { + generateOrigKernelByProlog(); + + // Get the expression to compute + std::vector indexExprPtrs; + indexExprPtrs = AddressTrace(m_commandKernel->getKernelGraph(), m_context) + .traceComputeIndexWithBuffer(); + std::vector widenedExprPtrs; + for(int i = 0, size = indexExprPtrs.size(); i < size; i++) + { + auto eptr = indexExprPtrs[i]; + Log::debug("== Expr : {} ", toString(eptr)); + + widenedExprPtrs.push_back(Expression::widenTo64bit(eptr)); + Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); + } + + auto k = m_context->kernel(); + m_context->schedule(kb_equal_one( + m_context, + indexExprPtrs[0], + widenedExprPtrs[0], + k->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + + m_context->schedule(k->postamble()); + m_context->schedule(k->amdgpu_metadata()); + + launchKernelAndCopyBackToHost(); + + // for test_kernelbody + for(int i = 0, size = m_hostBuffer.size(); i < size; i++) + { + if(m_hostBuffer[i] != m_hostBuffer2[i]) + { + Log::debug("diff at {}: {} {}", i, m_hostBuffer[i], m_hostBuffer2[i]); + } + CHECK(m_hostBuffer[i] == m_hostBuffer2[i]); + } + } + + void test_equal() + { + generateOrigKernelByProlog(); + + // Get the expression to compute + std::vector indexExprPtrs; + indexExprPtrs = AddressTrace(m_commandKernel->getKernelGraph(), m_context) + .traceComputeIndexWithBuffer(); + std::vector widenedExprPtrs; + for(int i = 0, size = indexExprPtrs.size(); i < size; i++) + { + auto eptr = indexExprPtrs[i]; + Log::debug("== Expr : {} ", toString(eptr)); + + widenedExprPtrs.push_back(Expression::widenTo64bit(eptr)); + Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); + } + + m_context->schedule(kb_equal( + m_context, + indexExprPtrs, + widenedExprPtrs, + m_context->kernel()->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + + auto k = m_context->kernel(); + m_context->schedule(k->postamble()); + m_context->schedule(k->amdgpu_metadata()); + + launchKernelAndCopyBackToHost(); + + for(int i = 0, size = m_hostBuffer.size(); i < size; i++) + { + if(m_hostBuffer[i] != 0xFFFFFFFF) + { + std::cout << "The addresses are not same at " << i << " " << m_hostBuffer[i] + << "\n"; + } + CHECK(m_hostBuffer[i] == 0xFFFFFFFF); + } + + // hipFree will be taken care of by make_shared_device + } + + private: + ContextPtr m_context; + CommandKernelPtr m_commandKernel; + GEMMProblem const& m_problem; + rocRollerTest::Graphs::GEMM m_gemmGraph; + + rocRoller::Operations::OperationTag m_rvTag; + rocRoller::Operations::OperationTag m_rvTag2; + + // For checking results on host-side + std::array m_workitemCount; + uint m_totalWorkitemCount; + std::vector m_hostBuffer; + std::vector m_hostBuffer2; + }; + + TEST_CASE("address calculation test", "[expression][gpu][equal][float][128_128]") + { + auto context = TestContext::ForTestDevice(); + + GEMMProblem problem{.m = 128, .n = 128}; + rocRollerTest::Graphs::GEMM gemm(DataType::Float); + gemm.setProblem(problem); + + AddressCalculationTest kernel(context.get(), problem, gemm); + kernel.test_equal(); + } + + TEST_CASE("address calculation test", "[expression][gpu][equal][float][512_512]") + { + auto context = TestContext::ForTestDevice(); + + GEMMProblem problem{.m = 512, .n = 512}; + rocRollerTest::Graphs::GEMM gemm(DataType::Float); + gemm.setProblem(problem); + + AddressCalculationTest kernel(context.get(), problem, gemm); + kernel.test_equal(); + } + + TEST_CASE("address calculation test", "[expression][gpu][equal_one][float][128_128]") + { + auto context = TestContext::ForTestDevice({}, "equal_one_128x128"); + + GEMMProblem problem{.m = 128, .n = 128}; + rocRollerTest::Graphs::GEMM gemm(DataType::Float); + gemm.setProblem(problem); + + AddressCalculationTest kernel(context.get(), problem, gemm); + kernel.test_equal_one(); + } + + TEST_CASE("address calculation test", "[expression][gpu][implicit_workitemcount]") + { + auto context = TestContext::ForTestDevice({}, "impl_workitemcnt"); + + GEMMProblem problem{.m = 128, .n = 128}; + rocRollerTest::Graphs::GEMM gemm(DataType::Float); + gemm.setProblem(problem); + + AddressCalculationTest kernel(context.get(), problem, gemm); + kernel.test_implicit_workitemcount(); + } + + TEST_CASE("address calculation test", "[expression][gpu][sanity_indices]") + { + auto context = TestContext::ForTestDevice({}, "sanity_indices"); + + GEMMProblem problem{.m = 128, .n = 128}; + rocRollerTest::Graphs::GEMM gemm(DataType::Float); + gemm.setProblem(problem); + + AddressCalculationTest kernel(context.get(), problem, gemm); + kernel.test_sanity_indices(); + } +} From 33c9d1341f0c35c944083eaee48b65976f04aebf Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 5 Mar 2025 01:49:42 +0000 Subject: [PATCH 07/32] Move the location of test utility from ExpressionTransform --- lib/CMakeLists.txt | 1 - .../rocRoller/ExpressionTransformations.hpp | 10 - .../WidenTo64bit.cpp | 262 ----------------- test/CMakeLists.txt | 6 +- test/catch/AddressCalculationTest.cpp | 10 +- test/common/WidenTo64bit.cpp | 264 ++++++++++++++++++ 6 files changed, 272 insertions(+), 281 deletions(-) delete mode 100644 lib/source/ExpressionTransformations/WidenTo64bit.cpp create mode 100644 test/common/WidenTo64bit.cpp diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 77e2dbe9..805b7c86 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -75,7 +75,6 @@ set(rocroller_src source/ExpressionTransformations/LowerPRNG.cpp source/ExpressionTransformations/RestoreCommandArguments.cpp source/ExpressionTransformations/Simplify.cpp - source/ExpressionTransformations/WidenTo64bit.cpp source/CodeGen/AddInstruction.cpp source/CodeGen/ArgumentLoader.cpp diff --git a/lib/include/rocRoller/ExpressionTransformations.hpp b/lib/include/rocRoller/ExpressionTransformations.hpp index 55a3a07f..e4a730ec 100644 --- a/lib/include/rocRoller/ExpressionTransformations.hpp +++ b/lib/include/rocRoller/ExpressionTransformations.hpp @@ -135,15 +135,5 @@ namespace rocRoller * @return ExpressionPtr Transformed expression */ ExpressionPtr lowerBitfieldValues(ExpressionPtr expr); - - /** - * @brief Widen (u)int32 to (u)int64. - * - * Has many assumptions in input expr. See the implementation for details. - * - * @param expr Input expression - * @return ExpressionPtr Transformed expression - */ - ExpressionPtr widenTo64bit(ExpressionPtr expr); } } diff --git a/lib/source/ExpressionTransformations/WidenTo64bit.cpp b/lib/source/ExpressionTransformations/WidenTo64bit.cpp deleted file mode 100644 index f43b07d4..00000000 --- a/lib/source/ExpressionTransformations/WidenTo64bit.cpp +++ /dev/null @@ -1,262 +0,0 @@ -#include -#include -#include - -template -constexpr auto cast_to_unsigned(T val) -{ - return static_cast::type>(val); -} - -namespace rocRoller -{ - namespace Expression - { - struct WidenTo64BitVisitor - { - - ExpressionPtr operator()(Convert const& expr) const - { - Convert cpy = expr; - if(expr.arg) - { - // Here is an assumption that call(cpy.arg) never goes above 64-bit and - // input convert's destination types are either int32, uint32, int64 or uint64. - // resultVaribleType(expr.arg) is not called intentionally as it will visit the - // subtree of expr.arg again. - // We can also import similar logic from ExpressionResultTypeVisitor and - // make the operator()(...) return a pair of ExpressionPtr and VariableType - // in order to avoid repeated visit of the subtree of cpy.arg. - cpy.arg = call(expr.arg); - if(expr.destinationType == DataType::UInt32) - return convert(DataType::UInt64, cpy.arg); - else if(expr.destinationType == DataType::Int32) - return convert(DataType::Int64, cpy.arg); - else if(expr.destinationType == DataType::UInt64 - || expr.destinationType == DataType::Int64) - return convert(expr.destinationType, cpy.arg); - else - { - AssertFatal( - false, - "Expected Destination type for a convert is either (u)int{32,64}"); - return nullptr; - } - } - - return std::make_shared(cpy); - } - - ExpressionPtr operator()(Negate const& expr) const - { - Negate cpy = expr; - if(expr.arg) - { - cpy.arg = call(expr.arg); - } - - return std::make_shared(cpy); - } - - template - ExpressionPtr operator()(Expr const& expr) const - { - // Only Convert and perhaps negates are expected. - AssertFatal(false, "Unexpected Unary expression", ShowValue(expr)); - return nullptr; - } - - ExpressionPtr operator()(Divide const& expr) const - { - // TODO: Add check that divisor is a CValue within (u)int32-bit - Log::debug("Divisor: {} ", toString(expr.rhs)); - - Divide cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - - return std::make_shared(cpy); - } - - ExpressionPtr operator()(Modulo const& expr) const - { - // TODO: Add check that divisor is a CValue within (u)int32-bit - Log::debug("Modulo: {} ", toString(expr.rhs)); - - Modulo cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - - return std::make_shared(cpy); - } - - // TODO: for some shifts, logical, subtract operations these may not be correct. - // Not sure yet if address calculation expressions with identity_transducer - // those types of Expression or not. - template - requires(CArithmetic) ExpressionPtr operator()(Expr const& expr) const - { - if constexpr(std::same_as) - { - AssertFatal(false, "Subtracts are not expected"); - return nullptr; - } - - if constexpr(CLogical) - { - AssertFatal(false, "logicals are not expected"); - return nullptr; - } - - Expr cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - if(expr.rhs) - cpy.rhs = call(expr.rhs); - - return std::make_shared(cpy); - } - - // Even with catch-all operator()(ExpressionPtr) without following, - // compilation fails. - template - requires(CBinary) ExpressionPtr operator()(Expr const& expr) const - { - AssertFatal(false, "Not expected expr : ", ShowValue(expr)); - return nullptr; - } - - template - requires(CArithmetic) ExpressionPtr operator()(Expr const& expr) const - { - Expr cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - if(expr.r1hs) - cpy.r1hs = call(expr.r1hs); - if(expr.r2hs) - cpy.r2hs = call(expr.r2hs); - - return std::make_shared(cpy); - } - - template - requires(CTernary) ExpressionPtr operator()(Expr const& expr) const - { - AssertFatal(false, "Not expected expr : ", ShowValue(expr)); - return nullptr; - } - - // leaves - ExpressionPtr operator()(CommandArgumentPtr const& expr) const - { - Log::debug("CommandArgumentPtr {}", toString(expr)); - - auto varType = expr->variableType(); - - assertIfNotExpectedType(varType.dataType, toString(expr)); - - CommandArgumentPtr cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - ExpressionPtr operator()(CommandArgumentValue const& expr) const - { - Log::debug("CommandArgumentValue {} Type {} ", - toString(expr), - toString(variableType(expr))); - - auto varType = variableType(expr); - - assertIfNotExpectedType(varType.dataType, toString(expr)); - - CommandArgumentValue cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - ExpressionPtr operator()(Register::ValuePtr const& expr) const - { - Log::debug("Register::ValuePtr {}", toString(expr)); - - auto varType = expr->variableType(); - - assertIfNotExpectedType(varType.dataType, toString(expr)); - - Register::ValuePtr cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const - { - Log::debug("AssemblyKernelArgumentPtr {} its expression is {}", - toString(expr), - toString(expr->expression)); - - auto varType = expr->variableType; - - assertIfNotExpectedType(varType.dataType, toString(expr)); - - AssemblyKernelArgumentPtr cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - // catch the rest CValue - template - ExpressionPtr operator()(Value const& expr) const - { - AssertFatal( - false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr)); - return nullptr; - } - - ExpressionPtr operator()(Expression const& expr) const - { - AssertFatal( - false, "No expectation to meet this type of Expression: ", ShowValue(expr)); - return nullptr; - } - - template - ExpressionPtr widenTo64(DataType srcType, T const& expr) const - { - if(srcType == DataType::UInt32) - return convert(DataType::UInt64, std::make_shared(expr)); - else if(srcType == DataType::Int32) - return convert(DataType::Int64, std::make_shared(expr)); - - return std::make_shared(expr); - } - - ExpressionPtr call(ExpressionPtr const& expr) const - { - return std::visit(*this, *expr); - } - - void assertIfNotExpectedType(DataType dt, std::string const& showValue) const - { - AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32 - || dt == DataType::UInt64 || dt == DataType::Int64, - "Unexpected DataType for Command/Kernel arguments or " - "workgroup/item indices ", - showValue); - } - }; - - ExpressionPtr widenTo64bit(ExpressionPtr expr) - { - auto origVarType = resultVariableType(expr); - - auto visitor = WidenTo64BitVisitor(); - auto widened = visitor.call(expr); - - auto finalVarType = resultVariableType(widened); - - AssertFatal(origVarType.dataType == finalVarType.dataType, - "Original and final data types should be the same", - ShowValue(origVarType.dataType), - ShowValue(finalVarType.dataType)); - - return widened; - } - } -} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2caf0905..e9901587 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -28,7 +28,11 @@ target_include_directories( ) add_library( - common_test_utilities OBJECT common/CommonGraphs.cpp common/Utilities.cpp common/mxDataGen.cpp + common_test_utilities OBJECT + common/CommonGraphs.cpp + common/Utilities.cpp + common/mxDataGen.cpp + common/WidenTo64bit.cpp ) target_compile_options( diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index c60e6494..f72c4766 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -4,10 +4,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -610,10 +610,6 @@ namespace AddressCalculationTest m_commandKernel->setLaunchParameters(launch); - // What is the right check here for GPU?? - //if (isLocalDevice()) { - // addrTestCommandKernel reused original Command, for which OperationTags are stored - // in gemm, a GEMM object. CommandArguments commandArgs = m_commandKernel->getCommand()->createArguments(); setTensorArguments(commandArgs, m_problem, m_gemmGraph, m_commandKernel); @@ -750,7 +746,7 @@ namespace AddressCalculationTest auto eptr = indexExprPtrs[i]; Log::debug("== Expr : {} ", toString(eptr)); - widenedExprPtrs.push_back(Expression::widenTo64bit(eptr)); + widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr)); Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); } @@ -792,7 +788,7 @@ namespace AddressCalculationTest auto eptr = indexExprPtrs[i]; Log::debug("== Expr : {} ", toString(eptr)); - widenedExprPtrs.push_back(Expression::widenTo64bit(eptr)); + widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr)); Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); } diff --git a/test/common/WidenTo64bit.cpp b/test/common/WidenTo64bit.cpp new file mode 100644 index 00000000..7758628c --- /dev/null +++ b/test/common/WidenTo64bit.cpp @@ -0,0 +1,264 @@ +#include +#include +#include + +template +constexpr auto cast_to_unsigned(T val) +{ + return static_cast::type>(val); +} + +namespace rocRollerTest +{ + + using namespace rocRoller; + + struct WidenTo64BitVisitor + { + + Expression::ExpressionPtr operator()(Expression::Convert const& expr) const + { + Expression::Convert cpy = expr; + if(expr.arg) + { + // Here is an assumption that call(cpy.arg) never goes above 64-bit and + // input convert's destination types are either int32, uint32, int64 or uint64. + // resultVaribleType(expr.arg) is not called intentionally as it will visit the + // subtree of expr.arg again. + // We can also import similar logic from ExpressionResultTypeVisitor and + // make the operator()(...) return a pair of Expression::ExpressionPtr and VariableType + // in order to avoid repeated visit of the subtree of cpy.arg. + cpy.arg = call(expr.arg); + if(expr.destinationType == DataType::UInt32) + return convert(DataType::UInt64, cpy.arg); + else if(expr.destinationType == DataType::Int32) + return convert(DataType::Int64, cpy.arg); + else if(expr.destinationType == DataType::UInt64 + || expr.destinationType == DataType::Int64) + return convert(expr.destinationType, cpy.arg); + else + { + AssertFatal(false, + "Expected Destination type for a convert is either (u)int{32,64}"); + return nullptr; + } + } + + return std::make_shared(cpy); + } + + Expression::ExpressionPtr operator()(Expression::Negate const& expr) const + { + Expression::Negate cpy = expr; + if(expr.arg) + { + cpy.arg = call(expr.arg); + } + + return std::make_shared(cpy); + } + + template + Expression::ExpressionPtr operator()(Expr const& expr) const + { + // Only Expression::Convert and perhaps negates are expected. + AssertFatal(false, "Unexpected Unary expression", ShowValue(expr)); + return nullptr; + } + + Expression::ExpressionPtr operator()(Expression::Divide const& expr) const + { + // TODO: Add check that divisor is a CValue within (u)int32-bit + Log::debug("Divisor: {} ", toString(expr.rhs)); + + Expression::Divide cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + + return std::make_shared(cpy); + } + + Expression::ExpressionPtr operator()(Expression::Modulo const& expr) const + { + // TODO: Add check that divisor is a CValue within (u)int32-bit + Log::debug("Modulo: {} ", toString(expr.rhs)); + + Expression::Modulo cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + + return std::make_shared(cpy); + } + + // TODO: for some shifts, logical, subtract operations these may not be correct. + // Not sure yet if address calculation expressions with identity_transducer + // those types of Expression or not. + template + requires(Expression::CArithmetic) Expression::ExpressionPtr + operator()(Expr const& expr) const + { + if constexpr(std::same_as) + { + AssertFatal(false, "Subtracts are not expected"); + return nullptr; + } + + if constexpr(Expression::CLogical) + { + AssertFatal(false, "logicals are not expected"); + return nullptr; + } + + Expr cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + if(expr.rhs) + cpy.rhs = call(expr.rhs); + + return std::make_shared(cpy); + } + + // Even with catch-all operator()(Expression::ExpressionPtr) without following, + // compilation fails. + template + requires(Expression::CBinary) Expression::ExpressionPtr + operator()(Expr const& expr) const + { + AssertFatal(false, "Not expected expr : ", ShowValue(expr)); + return nullptr; + } + + template + requires(Expression::CArithmetic) Expression::ExpressionPtr + operator()(Expr const& expr) const + { + Expr cpy = expr; + if(expr.lhs) + cpy.lhs = call(expr.lhs); + if(expr.r1hs) + cpy.r1hs = call(expr.r1hs); + if(expr.r2hs) + cpy.r2hs = call(expr.r2hs); + + return std::make_shared(cpy); + } + + template + requires(Expression::CTernary) Expression::ExpressionPtr + operator()(Expr const& expr) const + { + AssertFatal(false, "Not expected expr : ", ShowValue(expr)); + return nullptr; + } + + // leaves + Expression::ExpressionPtr operator()(CommandArgumentPtr const& expr) const + { + Log::debug("CommandArgumentPtr {}", Expression::toString(expr)); + + auto varType = expr->variableType(); + + assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); + + CommandArgumentPtr cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + Expression::ExpressionPtr operator()(CommandArgumentValue const& expr) const + { + Log::debug("CommandArgumentValue {} Type {} ", + Expression::toString(expr), + toString(variableType(expr))); + + auto varType = variableType(expr); + + assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); + + CommandArgumentValue cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + Expression::ExpressionPtr operator()(Register::ValuePtr const& expr) const + { + Log::debug("Register::ValuePtr {}", Expression::toString(expr)); + + auto varType = expr->variableType(); + + assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); + + Register::ValuePtr cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + Expression::ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const + { + Log::debug("AssemblyKernelArgumentPtr {} its expression is {}", + Expression::toString(expr), + Expression::toString(expr->expression)); + + auto varType = expr->variableType; + + assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); + + AssemblyKernelArgumentPtr cpy = expr; + return widenTo64(varType.dataType, cpy); + } + + // catch the rest CValue + template + Expression::ExpressionPtr operator()(Value const& expr) const + { + AssertFatal( + false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr)); + return nullptr; + } + + Expression::ExpressionPtr operator()(Expression::Expression const& expr) const + { + AssertFatal(false, "No expectation to meet this type of Expression: ", ShowValue(expr)); + return nullptr; + } + + template + Expression::ExpressionPtr widenTo64(DataType srcType, T const& expr) const + { + if(srcType == DataType::UInt32) + return convert(DataType::UInt64, std::make_shared(expr)); + else if(srcType == DataType::Int32) + return convert(DataType::Int64, std::make_shared(expr)); + + return std::make_shared(expr); + } + + Expression::ExpressionPtr call(Expression::ExpressionPtr const& expr) const + { + return std::visit(*this, *expr); + } + + void assertIfNotExpectedType(DataType dt, std::string const& showValue) const + { + AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32 || dt == DataType::UInt64 + || dt == DataType::Int64, + "Unexpected DataType for Command/Kernel arguments or " + "workgroup/item indices ", + showValue); + } + }; + + Expression::ExpressionPtr widenTo64bit(Expression::ExpressionPtr expr) + { + auto origVarType = resultVariableType(expr); + + auto visitor = WidenTo64BitVisitor(); + auto widened = visitor.call(expr); + + auto finalVarType = resultVariableType(widened); + + AssertFatal(origVarType.dataType == finalVarType.dataType, + "Original and final data types should be the same", + ShowValue(origVarType.dataType), + ShowValue(finalVarType.dataType)); + + return widened; + } +} From 05f9cc5a929e0b7ce8676c6cbee0ceb3a2b69a21 Mon Sep 17 00:00:00 2001 From: Choi Date: Wed, 5 Mar 2025 13:55:43 -0500 Subject: [PATCH 08/32] Adding missing header file --- test/common/common/WidenTo64bit.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 test/common/common/WidenTo64bit.hpp diff --git a/test/common/common/WidenTo64bit.hpp b/test/common/common/WidenTo64bit.hpp new file mode 100644 index 00000000..33a84085 --- /dev/null +++ b/test/common/common/WidenTo64bit.hpp @@ -0,0 +1,12 @@ +#include +namespace rocRollerTest { +/** + * @brief Widen (u)int32 to (u)int64. + * + * Has many assumptions in input expr. See the implementation for details. + * + * @param expr Input expression + * @return ExpressionPtr Transformed expression + */ + rocRoller::Expression::ExpressionPtr widenTo64bit(rocRoller::Expression::ExpressionPtr expr); +} From 002aa302a787aaf8af381bca5b4f779fbfae0084 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Thu, 6 Mar 2025 23:19:44 +0000 Subject: [PATCH 09/32] Renaming of member variables to remove CodeQL errors --- test/catch/AddressCalculationTest.cpp | 24 ++++++------- test/common/CommonGraphs.cpp | 50 +++++++++++++-------------- test/common/common/CommonGraphs.hpp | 8 ++--- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index f72c4766..355260e6 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -490,11 +490,11 @@ namespace AddressCalculationTest { // calling setArgument - needed TensorDescriptor descA( - gemm.m_ta, {size_t(problem.m), size_t(problem.k)}, problem.transA); + gemm.mTa, {size_t(problem.m), size_t(problem.k)}, problem.transA); TensorDescriptor descB( - gemm.m_tb, {size_t(problem.k), size_t(problem.n)}, problem.transB); - TensorDescriptor descC(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N"); - TensorDescriptor descD(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N"); + gemm.mTb, {size_t(problem.k), size_t(problem.n)}, problem.transB); + TensorDescriptor descC(gemm.mTd, {size_t(problem.m), size_t(problem.n)}, "N"); + TensorDescriptor descD(gemm.mTd, {size_t(problem.m), size_t(problem.n)}, "N"); // Note that actually large matrix hipMalloc is not needed. // But at the same time, larger malloc may incurr larger base address, which can lead to overflow. @@ -506,13 +506,13 @@ namespace AddressCalculationTest // Note that gemm built the CommandGraph, and store OperationTags. We are reusing // that original Command, but getting away building KernelGraph and its lowering. - setCommandTensorArg(commandArgs, gemm.m_tagTensorA, descA, deviceA.get()); - setCommandTensorArg(commandArgs, gemm.m_tagTensorB, descB, deviceB.get()); - setCommandTensorArg(commandArgs, gemm.m_tagTensorC, descC, deviceC.get()); - setCommandTensorArg(commandArgs, gemm.m_tagTensorD, descD, deviceD.get()); + setCommandTensorArg(commandArgs, gemm.mTagTensorA, descA, deviceA.get()); + setCommandTensorArg(commandArgs, gemm.mTagTensorB, descB, deviceB.get()); + setCommandTensorArg(commandArgs, gemm.mTagTensorC, descC, deviceC.get()); + setCommandTensorArg(commandArgs, gemm.mTagTensorD, descD, deviceD.get()); - commandArgs.setArgument(gemm.m_tagScalarAlpha, ArgumentType::Value, problem.alpha); - commandArgs.setArgument(gemm.m_tagScalarBeta, ArgumentType::Value, problem.beta); + commandArgs.setArgument(gemm.mTagScalarAlpha, ArgumentType::Value, problem.alpha); + commandArgs.setArgument(gemm.mTagScalarBeta, ArgumentType::Value, problem.beta); // seed doesn't seem to be relevant currently. //if(seed.has_value()) // commandArgs.setArgument(gemm.m_tagScalarSeed, ArgumentType::Value, seed.value()); @@ -520,14 +520,14 @@ namespace AddressCalculationTest // Create scratch space if(problem.streamK) { - commandArgs.setArgument(gemm.m_tagNumWGs, ArgumentType::Value, problem.numWGs); + commandArgs.setArgument(gemm.mTagNumWGs, ArgumentType::Value, problem.numWGs); } // ?? Is it correct to use original commandKernel or addrTestCommandKernel auto scratchSpaceRequired = commandKernelPtr->scratchSpaceRequired(commandArgs.runtimeArguments()); auto deviceScratch = make_shared_device(scratchSpaceRequired, 0); - commandArgs.setArgument(gemm.m_tagScratch, ArgumentType::Value, deviceScratch.get()); + commandArgs.setArgument(gemm.mTagScratch, ArgumentType::Value, deviceScratch.get()); } void generateOrigKernelByProlog() diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp index 294b37f4..1449d5c5 100644 --- a/test/common/CommonGraphs.cpp +++ b/test/common/CommonGraphs.cpp @@ -182,10 +182,10 @@ namespace rocRollerTest::Graphs { } GEMM::GEMM(DataType ta, DataType tb, DataType tc, DataType td) - : m_ta(ta) - , m_tb(tb) - , m_tc(tc) - , m_td(td) + : mTa(ta) + , mTb(tb) + , mTc(tc) + , mTd(td) { } @@ -200,27 +200,27 @@ namespace rocRollerTest::Graphs ? std::vector({(size_t)0, (size_t)1}) : std::vector({}); - m_tagTensorA = m_command->addOperation(rocRoller::Operations::Tensor( - 2, m_ta, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A + mTagTensorA = m_command->addOperation(rocRoller::Operations::Tensor( + 2, mTa, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A - m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorA)); + m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorA)); - m_tagTensorB = m_command->addOperation(rocRoller::Operations::Tensor( - 2, m_tb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B - m_tagB = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorB)); + mTagTensorB = m_command->addOperation(rocRoller::Operations::Tensor( + 2, mTb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B + m_tagB = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorB)); - m_tagTensorC - = m_command->addOperation(rocRoller::Operations::Tensor(2, m_tc, oneStridesN)); // C - m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorC)); + mTagTensorC + = m_command->addOperation(rocRoller::Operations::Tensor(2, mTc, oneStridesN)); // C + m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorC)); - m_tagScalarAlpha + mTagScalarAlpha = m_command->addOperation(rocRoller::Operations::Scalar(DataType::Float)); // alpha auto tagLoadAlpha - = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(m_tagScalarAlpha)); + = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarAlpha)); - m_tagScalarBeta = m_command->addOperation(rocRoller::Operations::Scalar(m_tc)); // beta - auto tagLoadBeta = m_command->addOperation( - rocRoller::Operations::T_Load_Scalar(m_tagScalarBeta)); // beta + mTagScalarBeta = m_command->addOperation(rocRoller::Operations::Scalar(mTc)); // beta + auto tagLoadBeta + = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarBeta)); // beta auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B @@ -244,23 +244,23 @@ namespace rocRollerTest::Graphs } m_command->addOperation(std::move(execute)); - m_tagTensorD - = m_command->addOperation(rocRoller::Operations::Tensor(2, m_td, oneStridesN)); // D - m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, m_tagTensorD)); // D + mTagTensorD + = m_command->addOperation(rocRoller::Operations::Tensor(2, mTd, oneStridesN)); // D + m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, mTagTensorD)); // D if(m_problem.streamK) { - m_tagNumWGs = m_command->allocateTag(); + mTagNumWGs = m_command->allocateTag(); auto numWGsArg = m_command->allocateArgument(DataType::UInt32, - m_tagNumWGs, + mTagNumWGs, ArgumentType::Value, DataDirection::ReadOnly, rocRoller::NUMWGS); } - m_tagScratch = m_command->allocateTag(); + mTagScratch = m_command->allocateTag(); m_command->allocateArgument(VariableType(DataType::UInt32, PointerType::PointerGlobal), - m_tagScratch, + mTagScratch, ArgumentType::Value, DataDirection::ReadWrite, rocRoller::SCRATCH); diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp index 7485c5cc..95bd6ee5 100644 --- a/test/common/common/CommonGraphs.hpp +++ b/test/common/common/CommonGraphs.hpp @@ -178,11 +178,11 @@ namespace rocRollerTest GEMMProblem const& getProblem() const; CommandParametersPtr getCommandParameters() const; - rocRoller::Operations::OperationTag m_tagTensorA, m_tagTensorB, m_tagTensorC, - m_tagTensorD, m_tagScalarAlpha, m_tagScalarBeta, m_tagScalarSeed, m_tagScratch; - rocRoller::Operations::OperationTag m_tagNumWGs; + rocRoller::Operations::OperationTag mTagTensorA, mTagTensorB, mTagTensorC, mTagTensorD, + mTagScalarAlpha, mTagScalarBeta, mTagScalarSeed, mTagScratch; + rocRoller::Operations::OperationTag mTagNumWGs; - DataType m_ta, m_tb, m_tc, m_td; + DataType mTa, mTb, mTc, mTd; private: void createCommand(); From df6336882776fe3aa3a72049ecf3f5b6555bda23 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Fri, 7 Mar 2025 03:40:35 +0000 Subject: [PATCH 10/32] Reorganize the tests to cover multiple sizes and data types. --- test/catch/AddressCalculationTest.cpp | 220 +++++++++++++------------- test/common/common/TestValues.hpp | 11 ++ 2 files changed, 121 insertions(+), 110 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 355260e6..49aa0fc1 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -3,6 +3,7 @@ #include "TestContext.hpp" #include #include +#include #include #include #include @@ -361,85 +362,6 @@ namespace AddressCalculationTest }; } - auto kb_equal(ContextPtr context, - const std::vector& input, - const std::vector& widenedInput, - std::array const& workitemCount, - std::array const& workgroupSize) - { - auto allone_uint64 - = std::make_shared(static_cast(0xFFFFFFFF)); - - return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]() - -> Generator { - // store base addr - Register::ValuePtr s_ptr; - - co_yield context->argLoader()->getValue("rv_ptr", s_ptr); - - Register::ValuePtr s_ptr2; - - co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); - - // 2-D - auto compare_res_pointer - = get64BitVectorOffset(context, workitemCount, workgroupSize); - Log::debug("Offset in kb: {}", toString(compare_res_pointer)); - - Register::ValuePtr v_offset = nullptr; - co_yield Expression::generate( - v_offset, compare_res_pointer + s_ptr->expression(), context); - - // may not be needed. - //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug"))); - - // boolean_diff was allocated to s[0:1] - // diff should be computed per lane, - // but is_zero_diff is actually one-bit - auto boolean_true = Register::Value::WavefrontPlaceholder(context); - Register::ValuePtr v_allone; - co_yield Expression::generate(v_allone, allone_uint64, context); - - co_yield context->copier()->copy( - boolean_true, v_allone, "set to true for all lanes"); - - Register::ValuePtr temp_res; - for(int i = 0, size = input.size(); i < size; i++) - { - // Compute the value 1 - Register::ValuePtr v_value_1 = nullptr; - co_yield Expression::generate(v_value_1, input[i], context); - - // Compute the value 2 - Register::ValuePtr v_value_2 = nullptr; - co_yield Expression::generate(v_value_2, widenedInput[i], context); - - // Compute diff (value_1 - value_2) - auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); - { - auto boolType = resultVariableType(is_zero_diff).dataType; - AssertFatal( - boolType == DataType::Bool64, "is_zero type {}", toString(boolType)); - } - - // boolean_true = boolean_true | is_zero_diff - auto accumRes = std::make_shared( - Expression::BitwiseAnd{boolean_true->expression(), is_zero_diff, "accum"}); - { - auto accumType = resultVariableType(accumRes).dataType; - AssertFatal( - accumType == DataType::Bool64, "accum type {}", toString(accumType)); - } - co_yield Expression::generate(boolean_true, accumRes, context); - } - - auto v_value = Register::Value::Placeholder( - context, Register::Type::Vector, DataType::UInt64, 1); - co_yield context->copier()->copy(v_value, boolean_true, "Move value"); - co_yield context->mem()->storeGlobal(v_offset, v_value, 0, 8); - }; - } - // For now, mainly for debugging, directly copy the two results values. // workitemCount is passed as argument. Since it is 64-bit // different global_store_dwordx2 format should be used. @@ -792,12 +714,86 @@ namespace AddressCalculationTest Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); } - m_context->schedule(kb_equal( - m_context, - indexExprPtrs, - widenedExprPtrs, - m_context->kernel()->workitemCount(), - (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + auto allone_uint64 + = std::make_shared(static_cast(0xFFFFFFFF)); + + auto kb = [&]() + // return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]() + -> Generator { + // store base addr + Register::ValuePtr s_ptr; + + co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr); + + Register::ValuePtr s_ptr2; + + co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2); + + // 2-D + auto compare_res_pointer = get64BitVectorOffset( + m_context, + m_context->kernel()->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()) + .value()); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset = nullptr; + co_yield Expression::generate( + v_offset, compare_res_pointer + s_ptr->expression(), m_context); + + // may not be needed. + //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug"))); + + // boolean_diff was allocated to s[0:1] + // diff should be computed per lane, + // but is_zero_diff is actually one-bit + auto boolean_true = Register::Value::WavefrontPlaceholder(m_context); + Register::ValuePtr v_allone; + co_yield Expression::generate(v_allone, allone_uint64, m_context); + + co_yield m_context->copier()->copy( + boolean_true, v_allone, "set to true for all lanes"); + + Register::ValuePtr temp_res; + for(int i = 0, size = indexExprPtrs.size(); i < size; i++) + { + // Compute the value 1 + Register::ValuePtr v_value_1 = nullptr; + co_yield Expression::generate(v_value_1, indexExprPtrs[i], m_context); + + // Compute the value 2 + Register::ValuePtr v_value_2 = nullptr; + co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context); + + // Compute diff (value_1 - value_2) + auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); + { + auto boolType = resultVariableType(is_zero_diff).dataType; + AssertFatal(boolType == DataType::Bool64, + "is_zero type {}", + toString(boolType)); + } + + // boolean_true = boolean_true | is_zero_diff + auto accumRes + = std::make_shared(Expression::BitwiseAnd{ + boolean_true->expression(), is_zero_diff, "accum"}); + { + auto accumType = resultVariableType(accumRes).dataType; + AssertFatal(accumType == DataType::Bool64, + "accum type {}", + toString(accumType)); + } + co_yield Expression::generate(boolean_true, accumRes, m_context); + } + + auto v_value = Register::Value::Placeholder( + m_context, Register::Type::Vector, DataType::UInt64, 1); + co_yield m_context->copier()->copy(v_value, boolean_true, "Move value"); + co_yield m_context->mem()->storeGlobal(v_offset, v_value, 0, 8); + }; + + m_context->schedule(kb()); auto k = m_context->kernel(); m_context->schedule(k->postamble()); @@ -834,33 +830,37 @@ namespace AddressCalculationTest std::vector m_hostBuffer2; }; - TEST_CASE("address calculation test", "[expression][gpu][equal][float][128_128]") - { - auto context = TestContext::ForTestDevice(); - - GEMMProblem problem{.m = 128, .n = 128}; - rocRollerTest::Graphs::GEMM gemm(DataType::Float); - gemm.setProblem(problem); - - AddressCalculationTest kernel(context.get(), problem, gemm); - kernel.test_equal(); - } - - TEST_CASE("address calculation test", "[expression][gpu][equal][float][512_512]") + TEST_CASE("address calculation test generate and run", "[expression][gpu]") { - auto context = TestContext::ForTestDevice(); + // Single here means applied to all three A, B, C matrices. + // TODO: Add more dataTypes + std::vector singleDataTypes = {DataType::Float}; - GEMMProblem problem{.m = 512, .n = 512}; - rocRollerTest::Graphs::GEMM gemm(DataType::Float); - gemm.setProblem(problem); - - AddressCalculationTest kernel(context.get(), problem, gemm); - kernel.test_equal(); + for(auto dataType : singleDataTypes) + { + for(auto [m, n, macM, macN] : TestValues::gemmProblemSizes) + { + // Come up with a string from problem_size and data type, to be given to ForTestDevice(); + auto suffixForKernelName = std::to_string(m) + "x" + std::to_string(n) + "_" + + std::to_string(macM) + "_" + std::to_string(macN); + auto context = TestContext::ForTestDevice({}, suffixForKernelName); + + GEMMProblem problem{.m = m, .n = n, .macM = macM, .macN = macN}; + rocRollerTest::Graphs::GEMM gemm(dataType); + gemm.setProblem(problem); + CAPTURE(dataType, m, n, macM, macN); + + AddressCalculationTest kernel(context.get(), problem, gemm); + // Generate a kernel for testing address calculation and run. + // Verification of the result is done. + kernel.test_equal(); + } + } } - TEST_CASE("address calculation test", "[expression][gpu][equal_one][float][128_128]") + TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]") { - auto context = TestContext::ForTestDevice({}, "equal_one_128x128"); + auto context = TestContext::ForTestDevice({}, "128x128_one_pair"); GEMMProblem problem{.m = 128, .n = 128}; rocRollerTest::Graphs::GEMM gemm(DataType::Float); @@ -870,7 +870,7 @@ namespace AddressCalculationTest kernel.test_equal_one(); } - TEST_CASE("address calculation test", "[expression][gpu][implicit_workitemcount]") + TEST_CASE("address calculation test implicit workitemcount", "[expression][gpu]") { auto context = TestContext::ForTestDevice({}, "impl_workitemcnt"); @@ -882,9 +882,9 @@ namespace AddressCalculationTest kernel.test_implicit_workitemcount(); } - TEST_CASE("address calculation test", "[expression][gpu][sanity_indices]") + TEST_CASE("address calculation test sanity check", "[expression][gpu]") { - auto context = TestContext::ForTestDevice({}, "sanity_indices"); + auto context = TestContext::ForTestDevice({}, "128x128_sanity_indices"); GEMMProblem problem{.m = 128, .n = 128}; rocRollerTest::Graphs::GEMM gemm(DataType::Float); diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp index 2b123fca..1d0bf507 100644 --- a/test/common/common/TestValues.hpp +++ b/test/common/common/TestValues.hpp @@ -112,6 +112,17 @@ namespace TestValues 12981.0, 42e5}; + // Portions of GEMMProblem + struct gemmProblemSize + { + int m; + int n; + int macM; + int macN; + }; + + inline std::vector gemmProblemSizes = {{128, 128, 64, 64}, {512, 512, 64, 64}}; + template struct ByType { From d3cec12605c1774774b2ca3a683886a7c79429be Mon Sep 17 00:00:00 2001 From: "yoonchoi@amd.com" Date: Mon, 10 Mar 2025 19:52:39 +0000 Subject: [PATCH 11/32] Using GEMM(DataType ..) in place of GEMM --- test/catch/KernelGraphRemoveDuplicatesTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp index 9bb470c5..df871c0b 100644 --- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp +++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp @@ -42,7 +42,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]") using namespace rocRoller::KernelGraph::ControlGraph; auto ctx = TestContext::ForDefaultTarget().get(); - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); example.setTileSize(128, 128, 32); example.setMFMA(32, 32, 16, 1); From efa60dbceb89bce17a741624035c00ca0d8697a2 Mon Sep 17 00:00:00 2001 From: Lauren Wrubleski Date: Mon, 10 Mar 2025 20:43:49 +0000 Subject: [PATCH 12/32] Allow configuration of mxDataGenerator source --- CMakeLists.txt | 3 +++ cmake/Dependencies.cmake | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c539139..066cff06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,9 @@ option(BUILD_TESTING "Build rocRoller test clients" ON) option(BUILD_DOCS "Build rocRoller documentation" ON) option(BUILD_VERBOSE "Output additional build information" OFF) +option(MXDATAGENERATOR_SSH "Fetch mxDataGenerator via SSH" OFF) +set(MXDATAGENERATOR_GIT_URL "github.com" CACHE STRING "Base Git URL to fetch mxDataGenerator from.") + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index c47ee114..66d1c8dc 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -441,6 +441,11 @@ endfunction() function(_fetch_mxDataGenerator VERSION HASH) _determine_git_tag(v main) + if(MXDATAGENERATOR_SSH) + set(mxDataGenerator_url "git@${MXDATAGENERATOR_GIT_URL}:ROCm/mxDataGenerator.git") + else() + set(mxDataGenerator_url "https://${MXDATAGENERATOR_GIT_URL}/ROCm/mxDataGenerator.git") + endif() FetchContent_Declare( mxDataGenerator GIT_REPOSITORY git@github.com:ROCm/mxDataGenerator.git From 729a26622c610730e867d0a87893e0f2475855d2 Mon Sep 17 00:00:00 2001 From: Lauren Wrubleski Date: Tue, 11 Mar 2025 19:15:08 +0000 Subject: [PATCH 13/32] actually use computed mxDataGenerator URL --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 66d1c8dc..1b161f86 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -448,7 +448,7 @@ function(_fetch_mxDataGenerator VERSION HASH) endif() FetchContent_Declare( mxDataGenerator - GIT_REPOSITORY git@github.com:ROCm/mxDataGenerator.git + GIT_REPOSITORY ${mxDataGenerator_url} GIT_TAG ${GIT_TAG} ) FetchContent_MakeAvailable(mxDataGenerator) From efba1dcd0a33555959f9ffabfa970d5667a893da Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 11 Mar 2025 21:45:39 -0400 Subject: [PATCH 14/32] LWP100-1465 Common GEMM graph: replacing templatized dataType with member variable --- test/catch/CommandTest.cpp | 8 +- test/catch/IdentifyParallelDimensionsTest.cpp | 4 +- .../catch/KernelGraphRemoveDuplicatesTest.cpp | 12 +- test/common/CommonGraphs.cpp | 239 ++++++++++++++++++ test/common/common/CommonGraphs.hpp | 24 +- test/common/common/CommonGraphs_impl.hpp | 126 --------- test/unit/KernelGraphTest.cpp | 16 +- .../unit/KernelGraphTest/UpdateParameters.cpp | 12 +- 8 files changed, 286 insertions(+), 155 deletions(-) diff --git a/test/catch/CommandTest.cpp b/test/catch/CommandTest.cpp index 85bce25a..2bfd8918 100644 --- a/test/catch/CommandTest.cpp +++ b/test/catch/CommandTest.cpp @@ -38,9 +38,9 @@ namespace CommandTest { SECTION("GEMM/TileAdd") { - auto example1 = *rocRollerTest::Graphs::GEMM().getCommand(); - auto example2 = *rocRollerTest::Graphs::GEMM().getCommand(); - auto example3 = *rocRollerTest::Graphs::GEMM().getCommand(); + auto example1 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand(); + auto example2 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand(); + auto example3 = *rocRollerTest::Graphs::GEMM(DataType::Half).getCommand(); auto example4 = *rocRollerTest::Graphs::TileDoubleAdd().getCommand(); CHECK(example1 == example2); @@ -58,7 +58,7 @@ namespace CommandTest { SECTION("GEMM") { - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); auto command0 = example.getCommand(); auto yaml = Command::toYAML(*command0); diff --git a/test/catch/IdentifyParallelDimensionsTest.cpp b/test/catch/IdentifyParallelDimensionsTest.cpp index 7f46a0f1..62973404 100644 --- a/test/catch/IdentifyParallelDimensionsTest.cpp +++ b/test/catch/IdentifyParallelDimensionsTest.cpp @@ -74,7 +74,7 @@ TEST_CASE("identifyParallelDimensionSets works for GEMM", "[kernel-graph]") using namespace rocRoller; auto ctx = TestContext::ForDefaultTarget(); - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); auto kgraph = KernelGraph::translate(example.getCommand()); @@ -175,7 +175,7 @@ SCENARIO("IdentifyParallelDimensions transformation works for GEMM", "[kernel-gr using namespace rocRoller; auto ctx = TestContext::ForDefaultTarget(); - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); GIVEN("The initial kernel graph for a GEMM") { diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp index 9bb470c5..01812d2a 100644 --- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp +++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp @@ -42,7 +42,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]") using namespace rocRoller::KernelGraph::ControlGraph; auto ctx = TestContext::ForDefaultTarget().get(); - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); example.setTileSize(128, 128, 32); example.setMFMA(32, 32, 16, 1); @@ -72,16 +72,16 @@ TEST_CASE("Remove duplicates", "[kernel-graph]") // LoadTiled: A A, B B, C C // After removing 2x1 jamming: A, B, C C - CHECK(graph0.control.getElements().to().size() == 6); - CHECK(graph1.control.getElements().to().size() == 4); + CHECK(graph0.control.getElements().to().size() == 3); + CHECK(graph1.control.getElements().to().size() == 3); // StoreLDSTile: A A, B B // After removing 2x1 jamming: A, B - CHECK(graph0.control.getElements().to().size() == 4); + CHECK(graph0.control.getElements().to().size() == 2); CHECK(graph1.control.getElements().to().size() == 2); // LoadLDSTile: A A A A, B B B B // After removing 2x1 jamming: A A A A, B B - CHECK(graph0.control.getElements().to().size() == 8); - CHECK(graph1.control.getElements().to().size() == 6); + CHECK(graph0.control.getElements().to().size() == 4); + CHECK(graph1.control.getElements().to().size() == 4); } diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp index c0271b56..5e1b4adf 100644 --- a/test/common/CommonGraphs.cpp +++ b/test/common/CommonGraphs.cpp @@ -193,4 +193,243 @@ namespace rocRollerTest::Graphs return params; } + GEMM::GEMM(DataType ta) + : GEMM(ta, ta) + { + } + GEMM::GEMM(DataType ta, DataType tb) + : GEMM(ta, tb, tb) + { + } + GEMM::GEMM(DataType ta, DataType tb, DataType tc) + : GEMM(ta, tb, tc, tc) + { + } + GEMM::GEMM(DataType ta, DataType tb, DataType tc, DataType td) + : mTa(ta) + , mTb(tb) + , mTc(tc) + , mTd(td) + { + } + + void GEMM::createCommand() + { + m_command = std::make_shared(); + + std::vector oneStridesN + = m_problem.literalStrides ? std::vector({(size_t)1}) : std::vector({}); + + std::vector oneStridesT = m_problem.literalStrides + ? std::vector({(size_t)0, (size_t)1}) + : std::vector({}); + + mTagTensorA = m_command->addOperation(rocRoller::Operations::Tensor( + 2, mTa, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A + + m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorA)); + + mTagTensorB = m_command->addOperation(rocRoller::Operations::Tensor( + 2, mTb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B + m_tagB = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorB)); + + mTagTensorC + = m_command->addOperation(rocRoller::Operations::Tensor(2, mTc, oneStridesN)); // C + m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorC)); + + mTagScalarAlpha + = m_command->addOperation(rocRoller::Operations::Scalar(DataType::Float)); // alpha + auto tagLoadAlpha + = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarAlpha)); + + mTagScalarBeta = m_command->addOperation(rocRoller::Operations::Scalar(mTc)); // beta + auto tagLoadBeta + = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarBeta)); // beta + + auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B + + rocRoller::Operations::T_Execute execute(m_command->getNextTag()); + + auto tagBetaC + = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadBeta, m_tagC)); // beta * C + + auto tagAlphaAB + = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadAlpha, tagAB)); // alpha * (A * B) + + if(m_problem.betaInFma) + { + m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagBetaC, tagAlphaAB)); + // alpha * (A * B) + beta * C + } + else + { + m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagAlphaAB, tagBetaC)); + // alpha * (A * B) + beta * C + } + m_command->addOperation(std::move(execute)); + + mTagTensorD + = m_command->addOperation(rocRoller::Operations::Tensor(2, mTd, oneStridesN)); // D + m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, mTagTensorD)); // D + + if(m_problem.streamK) + { + mTagNumWGs = m_command->allocateTag(); + auto numWGsArg = m_command->allocateArgument(DataType::UInt32, + mTagNumWGs, + ArgumentType::Value, + DataDirection::ReadOnly, + rocRoller::NUMWGS); + } + + mTagScratch = m_command->allocateTag(); + m_command->allocateArgument(VariableType(DataType::UInt32, PointerType::PointerGlobal), + mTagScratch, + ArgumentType::Value, + DataDirection::ReadWrite, + rocRoller::SCRATCH); + } + + CommandPtr GEMM::getCommand() + { + if(!m_command) + createCommand(); + + return m_command; + } + + KernelGraph GEMM::getKernelGraph() + { + return rocRoller::KernelGraph::translate(getCommand()); + } + + void GEMM::setTileSize(int m, int n, int k) + { + m_problem.macM = m; + m_problem.macN = n; + m_problem.macK = k; + } + + void GEMM::setMFMA(int m, int n, int k, int b) + { + m_problem.waveM = m; + m_problem.waveN = n; + m_problem.waveK = k; + m_problem.waveB = b; + } + + void GEMM::setUseLDS(bool a, bool b, bool d) + { + m_problem.loadLDSA = a; + m_problem.loadLDSB = b; + m_problem.storeLDSD = d; + } + + void GEMM::setPrefetch(bool prefetch, + int prefetchInFlight, + int prefetchLDSFactor, + bool prefetchMixMemOps) + { + m_problem.prefetch = prefetch; + m_problem.prefetchInFlight = prefetchInFlight; + m_problem.prefetchLDSFactor = prefetchLDSFactor; + m_problem.prefetchMixMemOps = prefetchMixMemOps; + + m_problem.unrollK = prefetchInFlight; + } + + void GEMM::setProblem(GEMMProblem const& problem) + { + m_problem = problem; + } + + GEMMProblem const& GEMM::getProblem() const + { + return m_problem; + } + + CommandParametersPtr GEMM::getCommandParameters() const + { + using namespace rocRoller::KernelGraph::CoordinateGraph; + + auto params = std::make_shared(); + + params->setManualKernelDimension(2); + + AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0, + "Workgroup Size X must be multiply of wave front size"); + + uint wavetilePerWavefrontM + = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX; + uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY; + + AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0, + "WaveTile size mismatch (M)"); + AssertFatal(m_problem.macN % (m_problem.waveN * wavetilePerWavefrontN) == 0, + "WaveTile size mismatch (N)"); + + uint workgroupSizeX = m_problem.workgroupSizeX * m_problem.workgroupSizeY; + uint workgroupSizeY = 1; + params->setManualWorkgroupSize({workgroupSizeX, workgroupSizeY, 1}); + + auto macTileA + = MacroTile({m_problem.macM, m_problem.macK}, + LayoutType::MATRIX_A, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}, + m_problem.loadLDSA ? MemoryType::LDS : MemoryType::WAVE); + auto macTileB + = MacroTile({m_problem.macK, m_problem.macN}, + LayoutType::MATRIX_B, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}, + m_problem.loadLDSB ? MemoryType::LDS : MemoryType::WAVE); + auto macTileC + = MacroTile({m_problem.macM, m_problem.macN}, + LayoutType::MATRIX_ACCUMULATOR, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}); + auto macTileD + = MacroTile({m_problem.macM, m_problem.macN}, + LayoutType::MATRIX_ACCUMULATOR, + {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB}, + m_problem.storeLDSD ? MemoryType::LDS : MemoryType::WAVE); + + params->setDimensionInfo(m_tagA, macTileA); + params->setDimensionInfo(m_tagB, macTileB); + params->setDimensionInfo(m_tagC, macTileC); + params->setDimensionInfo(m_tagD, macTileD); + + // uint jammedM + // = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX; + // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY; + + Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1); + // Log::debug("GEMM jamming {} {}", jammedM, jammedN); + // params->setWaveTilesPerWavefront(jammedM, jammedN); + + params->setManualWavefrontCount( + {static_cast(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM), + static_cast(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)}); + + params->fuseLoops = m_problem.fuseLoops; + params->tailLoops = m_problem.tailLoops; + params->allowAmbiguousMemoryNodes = m_problem.allowAmbiguousMemoryNodes; + params->unrollK = m_problem.unrollK; + params->packMultipleElementsInto1VGPR = m_problem.packMultipleElementsInto1VGPR; + params->prefetch = m_problem.prefetch; + params->prefetchInFlight = m_problem.prefetchInFlight; + params->prefetchLDSFactor = m_problem.prefetchLDSFactor; + params->prefetchMixMemOps = m_problem.prefetchMixMemOps; + params->transposeMemoryAccess[LayoutType::MATRIX_A] = m_problem.transA == "T"; + params->transposeMemoryAccess[LayoutType::MATRIX_B] = m_problem.transB == "T"; + params->transposeMemoryAccess[LayoutType::None] = true; + + if(m_problem.streamK) + { + params->loopOverOutputTilesDimensions = {0, 1}; + params->streamK = true; + params->streamKTwoTile = m_problem.streamKTwoTile; + } + + return params; + } + } diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp index 4ee7a3a4..03e902c8 100644 --- a/test/common/common/CommonGraphs.hpp +++ b/test/common/common/CommonGraphs.hpp @@ -40,6 +40,8 @@ #include #include +#include + namespace rocRollerTest { namespace Graphs @@ -52,6 +54,7 @@ namespace rocRollerTest using ContextPtr = rocRoller::ContextPtr; using KernelArguments = rocRoller::KernelArguments; using KernelGraph = rocRoller::KernelGraph::KernelGraph; + using DataType = rocRoller::DataType; /** * @brief Graph for linear: alpha x + beta y. @@ -178,11 +181,13 @@ namespace rocRollerTest * - Assign(D = alpha * AB + beta * C) * - StoreTiled(D) */ - template class GEMM { public: - GEMM(); + GEMM(DataType ta); + GEMM(DataType ta, DataType tb); + GEMM(DataType ta, DataType tb, DataType tc); + GEMM(DataType ta, DataType tb, DataType tc, DataType td); CommandPtr getCommand(); KernelGraph getKernelGraph(); @@ -190,9 +195,21 @@ namespace rocRollerTest void setTileSize(int m, int n, int k); void setMFMA(int m, int n, int k, int b); void setUseLDS(bool a, bool b, bool d); + void setPrefetch(bool prefetch, + int prefetchInFlight, + int prefetchLDSFactor, + bool prefetchMixMemOps); + void setProblem(GEMMProblem const& problem); + GEMMProblem const& getProblem() const; CommandParametersPtr getCommandParameters() const; + rocRoller::Operations::OperationTag mTagTensorA, mTagTensorB, mTagTensorC, mTagTensorD, + mTagScalarAlpha, mTagScalarBeta, mTagScalarSeed, mTagScratch; + rocRoller::Operations::OperationTag mTagNumWGs; + + DataType mTa, mTb, mTc, mTd; + private: void createCommand(); @@ -202,7 +219,8 @@ namespace rocRollerTest rocRoller::Operations::OperationTag m_tagA, m_tagB, m_tagC, m_tagD; - CommandPtr m_command; + CommandPtr m_command; + GEMMProblem m_problem; }; /** diff --git a/test/common/common/CommonGraphs_impl.hpp b/test/common/common/CommonGraphs_impl.hpp index ec097436..9a8d6201 100644 --- a/test/common/common/CommonGraphs_impl.hpp +++ b/test/common/common/CommonGraphs_impl.hpp @@ -256,132 +256,6 @@ namespace rocRollerTest::Graphs return rocRoller::KernelGraph::translate(m_command); } - /* - * GEMM - */ - - template - GEMM::GEMM() - { - createCommand(); - } - - template - void GEMM::createCommand() - { - m_command = std::make_shared(); - - auto dataType = TypeInfo::Var.dataType; - - auto tagTensorA = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // A - m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(tagTensorA)); - - auto tagTensorB = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // B - m_tagB = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(tagTensorB)); - - auto tagTensorC = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // C - m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(tagTensorC)); - - auto tagScalarAlpha - = m_command->addOperation(rocRoller::Operations::Scalar(dataType)); // alpha - auto tagLoadAlpha - = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(tagScalarAlpha)); - - auto tagScalarBeta - = m_command->addOperation(rocRoller::Operations::Scalar(dataType)); // beta - auto tagLoadBeta - = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(tagScalarBeta)); // beta - - auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B - - rocRoller::Operations::T_Execute execute(m_command->getNextTag()); - auto tagAlphaAB - = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadAlpha, tagAB)); // alpha * (A * B) - auto tagBetaC - = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadBeta, m_tagC)); // beta * C - m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagAlphaAB, tagBetaC)); - // alpha * (A * B) + beta * C - m_command->addOperation(std::move(execute)); - - auto tagTensorD = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // D - m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, tagTensorD)); // D - } - - template - CommandPtr GEMM::getCommand() - { - return m_command; - } - - template - KernelGraph GEMM::getKernelGraph() - { - return rocRoller::KernelGraph::translate(m_command); - } - - template - void GEMM::setTileSize(int m, int n, int k) - { - m_macM = m; - m_macN = n; - m_macK = k; - } - - template - void GEMM::setMFMA(int m, int n, int k, int b) - { - m_waveM = m; - m_waveN = n; - m_waveK = k; - m_waveB = b; - } - - template - void GEMM::setUseLDS(bool a, bool b, bool d) - { - m_useLDSA = a; - m_useLDSB = b; - m_useLDSD = d; - } - - template - CommandParametersPtr GEMM::getCommandParameters() const - { - using namespace rocRoller::KernelGraph::CoordinateGraph; - - auto params = std::make_shared(); - - auto macTileA = MacroTile({m_macM, m_macK}, - LayoutType::MATRIX_A, - {m_waveM, m_waveN, m_waveK, m_waveB}, - m_useLDSA ? MemoryType::LDS : MemoryType::WAVE); - auto macTileB = MacroTile({m_macK, m_macN}, - LayoutType::MATRIX_B, - {m_waveM, m_waveN, m_waveK, m_waveB}, - m_useLDSB ? MemoryType::LDS : MemoryType::WAVE); - auto macTileC = MacroTile( - {m_macM, m_macN}, LayoutType::MATRIX_ACCUMULATOR, {m_waveM, m_waveN, m_waveK, m_waveB}); - - params->setDimensionInfo(m_tagA, macTileA); - params->setDimensionInfo(m_tagB, macTileB); - params->setDimensionInfo(m_tagC, macTileC); - - // Workgroup size - uint wavefrontSize = 64; - uint workgroupSizeX = 2 * wavefrontSize; - uint workgroupSizeY = 4; - - uint jammedM = wavefrontSize * m_macM / m_waveM / workgroupSizeX; - uint jammedN = m_macN / m_waveN / workgroupSizeY; - - Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1); - Log::debug("GEMM jamming {} {}", jammedM, jammedN); - - params->setWaveTilesPerWavefront(jammedM, jammedN); - - return params; - } - /* * TileDoubleAdd */ diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp index 8533b907..edfd3e01 100644 --- a/test/unit/KernelGraphTest.cpp +++ b/test/unit/KernelGraphTest.cpp @@ -1085,7 +1085,7 @@ namespace KernelGraphTest TEST_F(KernelGraphTest, LowerTensor) { - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); int macK = 16; int waveK = 8; @@ -1135,7 +1135,7 @@ namespace KernelGraphTest // Verify that loops have been unrolled auto unrolledForLoops = kgraphUnrolled.control.getNodes().to(); - EXPECT_EQ(unrolledForLoops.size(), 10); // main: X (Y (K K)) (Y (K K)); epilogue: X (Y Y) + EXPECT_EQ(unrolledForLoops.size(), 5); // main: X (Y (K K)) (Y (K K)); epilogue: X (Y Y) auto kgraphFused = kgraphUnrolled.transform(fuseLoopsTransform); kgraphFused = kgraphFused.transform(removeDuplicatesTransform); @@ -1145,7 +1145,7 @@ namespace KernelGraphTest EXPECT_EQ(fusedForLoops.size(), 5); auto fusedLoads = kgraphFused.control.getNodes().to(); - EXPECT_EQ(fusedLoads.size(), 9); // 1 for A, 4 for B, 4 for C + EXPECT_EQ(fusedLoads.size(), 4); // 1 for A, 4 for B, 4 for C // Verify that single iteration loops have been removed. auto kgraphClean = kgraphFused.transform(cleanLoopsTransform); @@ -1154,13 +1154,13 @@ namespace KernelGraphTest // Verify that there is only a single StoreLDSTile node per K loop auto unrolledStoreLDS = kgraphUnrolled.control.getNodes().to(); - EXPECT_EQ(unrolledStoreLDS.size(), 4); + EXPECT_EQ(unrolledStoreLDS.size(), 1); // Verify number of ComputeIndexes: A loads; A LDS loads; B loads; C load; D // store: 3 + (2+2) + 3 + 3 + 3 = 12 kgraph1 = kgraph1.transform(addComputeIndexTransform); auto computeIndexes = kgraph1.control.getNodes().to(); - EXPECT_EQ(computeIndexes.size(), 16); + EXPECT_EQ(computeIndexes.size(), 15); // Verify number of Deallocates auto addDeallocate = std::make_shared(); @@ -1169,7 +1169,7 @@ namespace KernelGraphTest EXPECT_EQ(addDeallocates.size(), 16); auto storeLDS = kgraphUnrolled.control.getNodes().to(); - EXPECT_EQ(storeLDS.size(), 4); + EXPECT_EQ(storeLDS.size(), 1); auto fusedStoreLDS = kgraphFused.control.getNodes().to(); EXPECT_EQ(fusedStoreLDS.size(), 1); @@ -1177,7 +1177,7 @@ namespace KernelGraphTest TEST_F(KernelGraphTest, InlineIncrement) { - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); example.setTileSize(128, 256, 8); example.setMFMA(32, 32, 2, 1); @@ -2717,7 +2717,7 @@ namespace KernelGraphTest { using GD = Graph::Direction; - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); example.setTileSize(128, 256, 8); example.setMFMA(32, 32, 2, 1); diff --git a/test/unit/KernelGraphTest/UpdateParameters.cpp b/test/unit/KernelGraphTest/UpdateParameters.cpp index dd101c5c..e72f9960 100644 --- a/test/unit/KernelGraphTest/UpdateParameters.cpp +++ b/test/unit/KernelGraphTest/UpdateParameters.cpp @@ -88,7 +88,7 @@ namespace KernelGraphTest { using namespace rocRoller::KernelGraph; - auto example = rocRollerTest::Graphs::GEMM(); + auto example = rocRollerTest::Graphs::GEMM(DataType::Float); int macK = 16; int waveK = 8; @@ -118,16 +118,16 @@ namespace KernelGraphTest // Now apply SetWorkitemCount and try again kgraph = kgraph.transform(std::make_shared(m_context)); - CommandArgumentPtr tensorDsizeX; + CommandArgumentPtr tensorAsizeX; { auto arguments = command->getArguments(); for(auto argument : arguments) { - if(argument->name() == "Tensor_4_size_0") - tensorDsizeX = argument; + if(argument->name() == "Tensor_0_size_0") + tensorAsizeX = argument; } } - ASSERT_NE(tensorDsizeX, nullptr) << "D size not found"; + ASSERT_NE(tensorAsizeX, nullptr) << "A size not found"; workitemCount = m_context->kernel()->workitemCount(); @@ -135,7 +135,7 @@ namespace KernelGraphTest auto workgroupSizeX = Expression::literal(128u); auto expected - = (((tensorDsizeX->expression() + workgroupSizeX) - one) / workgroupSizeX) * one; + = (((tensorAsizeX->expression() + workgroupSizeX) - one) / workgroupSizeX) * one; EXPECT_TRUE(Expression::identical(expected, workitemCount[0])); } From fada98a070ec03799500b26c2c95d0c59daee827 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 12 Mar 2025 21:50:38 +0000 Subject: [PATCH 15/32] Adding jamm factors to CommonGraph --- .../catch/KernelGraphRemoveDuplicatesTest.cpp | 2 +- test/common/CommonGraphs.cpp | 22 ++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp index df871c0b..595793f0 100644 --- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp +++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp @@ -44,7 +44,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]") auto ctx = TestContext::ForDefaultTarget().get(); auto example = rocRollerTest::Graphs::GEMM(DataType::Float); - example.setTileSize(128, 128, 32); + example.setTileSize(128, 64, 32); example.setMFMA(32, 32, 16, 1); example.setUseLDS(true, true, false); diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp index 5e1b4adf..248d98d9 100644 --- a/test/common/CommonGraphs.cpp +++ b/test/common/CommonGraphs.cpp @@ -359,9 +359,16 @@ namespace rocRollerTest::Graphs AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0, "Workgroup Size X must be multiply of wave front size"); + // i.e. jammedM uint wavetilePerWavefrontM = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX; + AssertFatal(wavetilePerWavefrontM != 0, + "Wavetiles per wavefront in M should be positive integer"); + + // i.e. jammedN uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY; + AssertFatal(wavetilePerWavefrontN != 0, + "Wavetiles per wavefront in N should be positive integer"); AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0, "WaveTile size mismatch (M)"); @@ -397,14 +404,19 @@ namespace rocRollerTest::Graphs params->setDimensionInfo(m_tagC, macTileC); params->setDimensionInfo(m_tagD, macTileD); - // uint jammedM - // = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX; - // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY; +#if 0 + uint jammedM + = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX; + uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY; Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1); - // Log::debug("GEMM jamming {} {}", jammedM, jammedN); - // params->setWaveTilesPerWavefront(jammedM, jammedN); + Log::debug("GEMM jamming {} {}", jammedM, jammedN); + params->setWaveTilesPerWavefront(jammedM, jammedN); +#endif + Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1); + Log::debug("GEMM jamming {} {}", wavetilePerWavefrontM, wavetilePerWavefrontN); + params->setWaveTilesPerWavefront(wavetilePerWavefrontM, wavetilePerWavefrontN); params->setManualWavefrontCount( {static_cast(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM), static_cast(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)}); From 86d0b729d3b766ba0a236d9d204e6b911b9f0517 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 12 Mar 2025 22:36:44 +0000 Subject: [PATCH 16/32] Chaniging macro tile size for a unit test to match jamming factors after changes in GEMM graph --- test/unit/KernelGraphTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp index d14e95a4..aa31ef94 100644 --- a/test/unit/KernelGraphTest.cpp +++ b/test/unit/KernelGraphTest.cpp @@ -1090,7 +1090,7 @@ namespace KernelGraphTest int macK = 16; int waveK = 8; - example.setTileSize(128, 256, macK); + example.setTileSize(128, 128, macK); example.setMFMA(32, 32, waveK, 1); example.setUseLDS(true, false, false); From a6c9f49de30e548cb1d88623af500e7050220591 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 12 Mar 2025 23:27:03 +0000 Subject: [PATCH 17/32] Restore jamming factors in common GEMM graph and restore original tests --- .../catch/KernelGraphRemoveDuplicatesTest.cpp | 12 ++++---- test/common/CommonGraphs.cpp | 29 +++++++++++++++---- test/common/common/CommonGraphs.hpp | 4 --- test/unit/KernelGraphTest.cpp | 12 ++++---- 4 files changed, 35 insertions(+), 22 deletions(-) diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp index 01812d2a..595793f0 100644 --- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp +++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp @@ -44,7 +44,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]") auto ctx = TestContext::ForDefaultTarget().get(); auto example = rocRollerTest::Graphs::GEMM(DataType::Float); - example.setTileSize(128, 128, 32); + example.setTileSize(128, 64, 32); example.setMFMA(32, 32, 16, 1); example.setUseLDS(true, true, false); @@ -72,16 +72,16 @@ TEST_CASE("Remove duplicates", "[kernel-graph]") // LoadTiled: A A, B B, C C // After removing 2x1 jamming: A, B, C C - CHECK(graph0.control.getElements().to().size() == 3); - CHECK(graph1.control.getElements().to().size() == 3); + CHECK(graph0.control.getElements().to().size() == 6); + CHECK(graph1.control.getElements().to().size() == 4); // StoreLDSTile: A A, B B // After removing 2x1 jamming: A, B - CHECK(graph0.control.getElements().to().size() == 2); + CHECK(graph0.control.getElements().to().size() == 4); CHECK(graph1.control.getElements().to().size() == 2); // LoadLDSTile: A A A A, B B B B // After removing 2x1 jamming: A A A A, B B - CHECK(graph0.control.getElements().to().size() == 4); - CHECK(graph1.control.getElements().to().size() == 4); + CHECK(graph0.control.getElements().to().size() == 8); + CHECK(graph1.control.getElements().to().size() == 6); } diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp index 5e1b4adf..644b62dd 100644 --- a/test/common/CommonGraphs.cpp +++ b/test/common/CommonGraphs.cpp @@ -356,12 +356,33 @@ namespace rocRollerTest::Graphs params->setManualKernelDimension(2); + AssertFatal(m_problem.m % m_problem.macM == 0, + "MacroTile size mismatch (M)", + ShowValue(m_problem.m), + ShowValue(m_problem.macM)); + AssertFatal(m_problem.n % m_problem.macN == 0, + "MacroTile size mismatch (N)", + ShowValue(m_problem.n), + ShowValue(m_problem.macN)); + AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0, "Workgroup Size X must be multiply of wave front size"); + AssertFatal(m_problem.macM % m_problem.waveM == 0, + "Macrotile size must be a multiple of wavetile size"); + AssertFatal(m_problem.macN % m_problem.waveN == 0, + "Macrotile size must be a multiple of wavetile size"); + + // i.e. jammedM uint wavetilePerWavefrontM = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX; + AssertFatal(wavetilePerWavefrontM > 0, + "Wavetiles per wavefront in M should be positive integer"); + + // i.e. jammedN uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY; + AssertFatal(wavetilePerWavefrontN > 0, + "Wavetiles per wavefront in N should be positive integer"); AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0, "WaveTile size mismatch (M)"); @@ -397,14 +418,10 @@ namespace rocRollerTest::Graphs params->setDimensionInfo(m_tagC, macTileC); params->setDimensionInfo(m_tagD, macTileD); - // uint jammedM - // = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX; - // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY; - Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1); - // Log::debug("GEMM jamming {} {}", jammedM, jammedN); - // params->setWaveTilesPerWavefront(jammedM, jammedN); + Log::debug("GEMM jamming {} {}", wavetilePerWavefrontM, wavetilePerWavefrontN); + params->setWaveTilesPerWavefront(wavetilePerWavefrontM, wavetilePerWavefrontN); params->setManualWavefrontCount( {static_cast(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM), static_cast(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)}); diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp index 03e902c8..ddb4b816 100644 --- a/test/common/common/CommonGraphs.hpp +++ b/test/common/common/CommonGraphs.hpp @@ -213,10 +213,6 @@ namespace rocRollerTest private: void createCommand(); - int m_macM, m_macN, m_macK; - int m_waveM, m_waveN, m_waveK, m_waveB; - bool m_useLDSA = false, m_useLDSB = false, m_useLDSD = false; - rocRoller::Operations::OperationTag m_tagA, m_tagB, m_tagC, m_tagD; CommandPtr m_command; diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp index edfd3e01..aa31ef94 100644 --- a/test/unit/KernelGraphTest.cpp +++ b/test/unit/KernelGraphTest.cpp @@ -1090,7 +1090,7 @@ namespace KernelGraphTest int macK = 16; int waveK = 8; - example.setTileSize(128, 256, macK); + example.setTileSize(128, 128, macK); example.setMFMA(32, 32, waveK, 1); example.setUseLDS(true, false, false); @@ -1135,7 +1135,7 @@ namespace KernelGraphTest // Verify that loops have been unrolled auto unrolledForLoops = kgraphUnrolled.control.getNodes().to(); - EXPECT_EQ(unrolledForLoops.size(), 5); // main: X (Y (K K)) (Y (K K)); epilogue: X (Y Y) + EXPECT_EQ(unrolledForLoops.size(), 10); // main: X (Y (K K)) (Y (K K)); epilogue: X (Y Y) auto kgraphFused = kgraphUnrolled.transform(fuseLoopsTransform); kgraphFused = kgraphFused.transform(removeDuplicatesTransform); @@ -1145,7 +1145,7 @@ namespace KernelGraphTest EXPECT_EQ(fusedForLoops.size(), 5); auto fusedLoads = kgraphFused.control.getNodes().to(); - EXPECT_EQ(fusedLoads.size(), 4); // 1 for A, 4 for B, 4 for C + EXPECT_EQ(fusedLoads.size(), 9); // 1 for A, 4 for B, 4 for C // Verify that single iteration loops have been removed. auto kgraphClean = kgraphFused.transform(cleanLoopsTransform); @@ -1154,13 +1154,13 @@ namespace KernelGraphTest // Verify that there is only a single StoreLDSTile node per K loop auto unrolledStoreLDS = kgraphUnrolled.control.getNodes().to(); - EXPECT_EQ(unrolledStoreLDS.size(), 1); + EXPECT_EQ(unrolledStoreLDS.size(), 4); // Verify number of ComputeIndexes: A loads; A LDS loads; B loads; C load; D // store: 3 + (2+2) + 3 + 3 + 3 = 12 kgraph1 = kgraph1.transform(addComputeIndexTransform); auto computeIndexes = kgraph1.control.getNodes().to(); - EXPECT_EQ(computeIndexes.size(), 15); + EXPECT_EQ(computeIndexes.size(), 16); // Verify number of Deallocates auto addDeallocate = std::make_shared(); @@ -1169,7 +1169,7 @@ namespace KernelGraphTest EXPECT_EQ(addDeallocates.size(), 16); auto storeLDS = kgraphUnrolled.control.getNodes().to(); - EXPECT_EQ(storeLDS.size(), 1); + EXPECT_EQ(storeLDS.size(), 4); auto fusedStoreLDS = kgraphFused.control.getNodes().to(); EXPECT_EQ(fusedStoreLDS.size(), 1); From 0f16b1f82f15e6e4a2100a231d87f8cc680a343f Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Thu, 13 Mar 2025 00:33:28 +0000 Subject: [PATCH 18/32] Adding filename in a correct CMakeLists.txt --- test/common/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/common/CMakeLists.txt b/test/common/CMakeLists.txt index 0a2987f6..b25a2828 100644 --- a/test/common/CMakeLists.txt +++ b/test/common/CMakeLists.txt @@ -34,6 +34,7 @@ add_library( CommonGraphs.cpp mxDataGen.cpp Utilities.cpp + WidenTo64bit.cpp ) target_compile_options( From 6ab41fc0b641363f20fa017c09a2ffacb5054c2a Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Thu, 13 Mar 2025 03:03:49 +0000 Subject: [PATCH 19/32] For modulo/divide the second operand should be a leaf value --- test/common/WidenTo64bit.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/common/WidenTo64bit.cpp b/test/common/WidenTo64bit.cpp index 7758628c..dfb21645 100644 --- a/test/common/WidenTo64bit.cpp +++ b/test/common/WidenTo64bit.cpp @@ -70,6 +70,7 @@ namespace rocRollerTest { // TODO: Add check that divisor is a CValue within (u)int32-bit Log::debug("Divisor: {} ", toString(expr.rhs)); + AssertFatal(isExpectedLeafType(expr.rhs)); Expression::Divide cpy = expr; if(expr.lhs) @@ -82,6 +83,7 @@ namespace rocRollerTest { // TODO: Add check that divisor is a CValue within (u)int32-bit Log::debug("Modulo: {} ", toString(expr.rhs)); + AssertFatal(isExpectedLeafType(expr.rhs)); Expression::Modulo cpy = expr; if(expr.lhs) @@ -243,6 +245,14 @@ namespace rocRollerTest "workgroup/item indices ", showValue); } + + bool isExpectedLeafType(Expression::ExpressionPtr const& expr) const { + return std::holds_alternative(*expr) || + std::holds_alternative(*expr) || + std::holds_alternative(*expr) || + std::holds_alternative(*expr); + } + }; Expression::ExpressionPtr widenTo64bit(Expression::ExpressionPtr expr) From b827c064e1ae58181e74505eb78f536297596fb7 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Thu, 13 Mar 2025 19:48:19 +0000 Subject: [PATCH 20/32] Adding license --- test/catch/AddressCalculationTest.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 49aa0fc1..4e415853 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -1,3 +1,28 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright 2025 AMD ROCm(TM) Software + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ #include "CustomMatchers.hpp" #include "TestContext.hpp" From bbb69a1cbb0175b2ef43bbe92485a6efae499e63 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Fri, 14 Mar 2025 20:52:48 +0000 Subject: [PATCH 21/32] Introducing GENERATE/DYNAMIC_SECTION and adding passing configurations --- .../InstructionValues/Register_impl.hpp | 3 + .../FastDivision.cpp | 6 + test/catch/AddressCalculationTest.cpp | 117 +++++++++++++----- test/common/common/TestValues.hpp | 17 ++- 4 files changed, 106 insertions(+), 37 deletions(-) diff --git a/lib/include/rocRoller/InstructionValues/Register_impl.hpp b/lib/include/rocRoller/InstructionValues/Register_impl.hpp index 1ecc5697..14b1c5f8 100644 --- a/lib/include/rocRoller/InstructionValues/Register_impl.hpp +++ b/lib/include/rocRoller/InstructionValues/Register_impl.hpp @@ -817,6 +817,9 @@ namespace rocRoller std::vector coords; for(auto i : indices) { + if (i < 0 || (size_t)i >= m_allocationCoord.size()) { + Log::debug("FA i {}", i); + } AssertFatal(i >= 0 && (size_t)i < m_allocationCoord.size(), "Register subset out of bounds.", ShowValue(m_allocationCoord.size()), diff --git a/lib/source/ExpressionTransformations/FastDivision.cpp b/lib/source/ExpressionTransformations/FastDivision.cpp index c5af98e4..aeef0631 100644 --- a/lib/source/ExpressionTransformations/FastDivision.cpp +++ b/lib/source/ExpressionTransformations/FastDivision.cpp @@ -329,6 +329,9 @@ namespace rocRoller { unsigned int mask = rhs - 1u; auto new_rhs = literal(mask); +#if 1 + Log::debug("Yoonseo - umodulo - name {} toString {} mask {}", name(new_rhs), toString(new_rhs), mask); +#endif return lhs & new_rhs; } @@ -342,6 +345,9 @@ namespace rocRoller int mask = ~(rhs - 1); auto maskExpr = literal(mask); +#if 1 + Log::debug("Yoonseo - smodulo - name {} toString {} mask {}", name(maskExpr), toString(maskExpr), mask); +#endif auto signBitsExpr = literal(signBits); auto reverseShiftAmountExpr = literal(reverseShiftAmount); diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 4e415853..b7e57a07 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -25,15 +25,19 @@ *******************************************************************************/ #include "CustomMatchers.hpp" +#include "CustomSections.hpp" #include "TestContext.hpp" + #include #include #include #include #include + #include #include #include +#include #include #include #include @@ -423,6 +427,7 @@ namespace AddressCalculationTest // Compute the value 2 Register::ValuePtr v_value_2 = nullptr; + co_yield Expression::generate(v_value_2, widenedInput, context); co_yield context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8); @@ -445,7 +450,11 @@ namespace AddressCalculationTest // Note that actually large matrix hipMalloc is not needed. // But at the same time, larger malloc may incurr larger base address, which can lead to overflow. - // ?? how to get float from m_dt? gemm.m_ta? + // + // TODO: How to get "float" from DataType of gemm.tA ? DataType::Float --> "float" + // (An opposite way is TypeInfo::Var.dataType). + // Might be alright to use float everytime. In the end, we never uses the allocated values. + // auto deviceA = make_shared_device(); auto deviceB = make_shared_device(); auto deviceC = make_shared_device(); @@ -688,14 +697,16 @@ namespace AddressCalculationTest indexExprPtrs = AddressTrace(m_commandKernel->getKernelGraph(), m_context) .traceComputeIndexWithBuffer(); std::vector widenedExprPtrs; - for(int i = 0, size = indexExprPtrs.size(); i < size; i++) - { - auto eptr = indexExprPtrs[i]; - Log::debug("== Expr : {} ", toString(eptr)); - widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr)); - Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); - } + // Test only one pair of expressions (the first pair.) + auto eptr = indexExprPtrs[0]; + Log::debug("== Expr : {} ", toString(eptr)); + + widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr)); + Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); + + auto fast = Expression::FastArithmetic(m_context); + Log::debug("** fast : {} ", toString(fast(widenedExprPtrs.back()))); auto k = m_context->kernel(); m_context->schedule(kb_equal_one( @@ -766,9 +777,6 @@ namespace AddressCalculationTest co_yield Expression::generate( v_offset, compare_res_pointer + s_ptr->expression(), m_context); - // may not be needed. - //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug"))); - // boolean_diff was allocated to s[0:1] // diff should be computed per lane, // but is_zero_diff is actually one-bit @@ -799,7 +807,7 @@ namespace AddressCalculationTest toString(boolType)); } - // boolean_true = boolean_true | is_zero_diff + // boolean_true = boolean_true & is_zero_diff auto accumRes = std::make_shared(Expression::BitwiseAnd{ boolean_true->expression(), is_zero_diff, "accum"}); @@ -857,28 +865,73 @@ namespace AddressCalculationTest TEST_CASE("address calculation test generate and run", "[expression][gpu]") { - // Single here means applied to all three A, B, C matrices. - // TODO: Add more dataTypes - std::vector singleDataTypes = {DataType::Float}; - - for(auto dataType : singleDataTypes) + // Noticed that for "float" type, all different combinations of + // problem sizes (m, n, k) by macro tile sizes (macM, macN) + // will generate the same kernel instructions. + // This is because address calculation expressions use the same arithmetic + // over the same command arguments and workgroup indices, workitem indices. + // Only the vales of the command arguments and workgroup, workitem indices change. + + // Also noticed that current expressions obtained from generation of computeIndex's + // VGPR base part of buffer_ instructions uses only following command arguments. + // Tensor_15_stride_1_14 (D's m or n) + // Tensor_4_stride_1_13 (C's m or n) + // Tensor_0_stride_1_10 (A's m ) + // Tensor_2_stride_0_11 (B's n) + // No problem size "k" is involved in the expression. + // This can be because current expressions are only base VGPR address of buffer_ + // instructions. The "k" part might be applied as increment. + // Or, it could be simply a bug. + + // Called single as the one data type is applied to all A, B, C and D matrices. + // TODO: Add more dataTypes. Also notice other TODO in the function "setTensorArguments()", + // where device pointer for matrices are allocated. Currently, only "float" is used. + // TODO: Debug. With DataType::Double, the test fails. + // The bug is from generating a widened expression. + // When it was generated, fast modulo introduces BitwiseAnd operations. + // Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa. + // Bitwise expression generator doesn't promote operands' datatype whereas + // other binary arithmetic operations, e.g. Add, do the regType/dataType promotion. + auto singleDataType = GENERATE(DataType::Float); + //CAPTURE(singleDataType); + //INFO("s" << singleDataType); + std::cout << "singleType: " << singleDataType << "\n"; + DYNAMIC_SECTION(singleDataType) { - for(auto [m, n, macM, macN] : TestValues::gemmProblemSizes) + auto [m, n, k] + = GENERATE(values(TestValues::gemmProblemSizes)); + + std::cout << "problemSize m: " << m << "\n"; + std::cout << "problemSize n: " << n << "\n"; + std::cout << "problemSize k: " << k << "\n"; + DYNAMIC_SECTION("ps_" << m << "x" << n << "x" << k) { - // Come up with a string from problem_size and data type, to be given to ForTestDevice(); - auto suffixForKernelName = std::to_string(m) + "x" + std::to_string(n) + "_" - + std::to_string(macM) + "_" + std::to_string(macN); - auto context = TestContext::ForTestDevice({}, suffixForKernelName); - - GEMMProblem problem{.m = m, .n = n, .macM = macM, .macN = macN}; - rocRollerTest::Graphs::GEMM gemm(dataType); - gemm.setProblem(problem); - CAPTURE(dataType, m, n, macM, macN); - - AddressCalculationTest kernel(context.get(), problem, gemm); - // Generate a kernel for testing address calculation and run. - // Verification of the result is done. - kernel.test_equal(); + auto [macM, macN] = GENERATE(values(TestValues::macroTileSizes)); + DYNAMIC_SECTION("mc_" << macM << "x" << macN) + { + //CAPTURE(problemSize); + //INFO("p " << problemSize); + std::cout << "macM: " << macM << "\n"; + std::cout << "macN: " << macN << "\n"; + + // Come up with a string from problem_size and data type, to be given to ForTestDevice(); + auto probSizeString + = std::to_string(m) + "x" + std::to_string(n) + "x" + std::to_string(k); + auto macroTileString = std::to_string(macM) + "x" + std::to_string(macN); + auto suffixForKernelName + = toString(singleDataType) + "_" + probSizeString + "_" + macroTileString; + auto context = TestContext::ForTestDevice({}, suffixForKernelName); + + GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN}; + rocRollerTest::Graphs::GEMM gemm(singleDataType); + gemm.setProblem(problem); + + AddressCalculationTest kernel(context.get(), problem, gemm); + + // Generate a kernel for testing address calculation and run. + // Verification of the result is done. + kernel.test_equal(); + } } } } diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp index f660e1a0..f92e8d3d 100644 --- a/test/common/common/TestValues.hpp +++ b/test/common/common/TestValues.hpp @@ -27,10 +27,12 @@ #pragma once // -// Value "suites" for arithemtic and expression tests +// Value "suites" for arithmetic and expression tests // #include +#include +#include #include #include @@ -139,15 +141,20 @@ namespace TestValues 42e5}; // Portions of GEMMProblem - struct gemmProblemSize + struct GemmProblemSize { int m; int n; - int macM; - int macN; + int k; }; - inline std::vector gemmProblemSizes = {{128, 128, 64, 64}, {512, 512, 64, 64}}; + inline std::initializer_list gemmProblemSizes + = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}}; + + inline std::initializer_list> macroTileSizes = {{64, 64}}; + // Looks like {128, 128} fails + // {{64, 64}, {128, 128}}; + // Also with the current set up - datatype "double" fails. template struct ByType From 5819487368c74e1ee0a9fc00f489778a8b30cd44 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Mon, 17 Mar 2025 21:23:22 +0000 Subject: [PATCH 22/32] Changes with bitwiseand --- lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp | 38 ++++++++++++++++++-- test/catch/AddressCalculationTest.cpp | 9 ++++- test/common/common/TestValues.hpp | 3 +- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp index bfed647f..6ac110c6 100644 --- a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp +++ b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp @@ -52,9 +52,26 @@ namespace rocRoller AssertFatal(lhs != nullptr); AssertFatal(rhs != nullptr); - auto elementBits = std::max({DataTypeInfo::Get(dest->variableType()).elementBits, - DataTypeInfo::Get(lhs->variableType()).elementBits, - DataTypeInfo::Get(rhs->variableType()).elementBits}); + auto destNumBits = DataTypeInfo::Get(dest->variableType()).elementBits; + auto lhsNumBits = DataTypeInfo::Get(lhs->variableType()).elementBits; + auto rhsNumBits = DataTypeInfo::Get(rhs->variableType()).elementBits; + + auto elementBits = std::max({destNumBits, lhsNumBits, rhsNumBits}); + + // Assertions on number of bits. + if(elementBits <= 32u) + { + // We don't support BitwiseAnd of less than 32-bit, + // gfx9 doesn't have s_and or v_and less than b32. + AssertFatal (lhsNumBits == rhsNumBits && lhsNumBits == 32, + "For 32-bit BitwiseAnd, both operands should be 32-bit wide."); + } else if (elementBits == 64u) { + // I am not aware a path that can lead to destNumbBits being smaller than 64-bit when the control reached here. + AssertFatal (destNumBits == 64u); + } else { + Throw("Unsupported elementBits for bitwiseAnd operation:: ", + ShowValue(elementBits)); + } if(dest->regType() == Register::Type::Scalar) { @@ -82,6 +99,20 @@ namespace rocRoller } else if(elementBits == 64u) { + Register::ValuePtr l0, l1, r0, r1; + if (lhs->regType() == Register::Type::Scalar) { + co_yield get2DwordsScalar(l0, l1, lhs); + } else { + co_yield get2DwordsVector(l0, l1, lhs); + } + co_yield get2DwordsVector(r0, r1, rhs); + + co_yield_(Instruction( + "v_and_b32", {dest->subset({0})}, {l0, r0}, {}, "")); + co_yield_(Instruction( + "v_and_b32", {dest->subset({1})}, {l1, r1}, {}, "")); + +#if 0 if(lhs->regType() == Register::Type::Literal) { Register::ValuePtr lsb; @@ -107,6 +138,7 @@ namespace rocRoller {}, "")); } +#endif } else { diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index b7e57a07..5564ec6c 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -863,6 +863,7 @@ namespace AddressCalculationTest std::vector m_hostBuffer2; }; +#if 0 TEST_CASE("address calculation test generate and run", "[expression][gpu]") { // Noticed that for "float" type, all different combinations of @@ -892,7 +893,11 @@ namespace AddressCalculationTest // Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa. // Bitwise expression generator doesn't promote operands' datatype whereas // other binary arithmetic operations, e.g. Add, do the regType/dataType promotion. +#if 0 auto singleDataType = GENERATE(DataType::Float); +#else + auto singleDataType = GENERATE(DataType::Float, DataType::Double); +#endif //CAPTURE(singleDataType); //INFO("s" << singleDataType); std::cout << "singleType: " << singleDataType << "\n"; @@ -935,7 +940,8 @@ namespace AddressCalculationTest } } } - +#endif +#if 1 TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]") { auto context = TestContext::ForTestDevice({}, "128x128_one_pair"); @@ -971,4 +977,5 @@ namespace AddressCalculationTest AddressCalculationTest kernel(context.get(), problem, gemm); kernel.test_sanity_indices(); } +#endif } diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp index f92e8d3d..90f31835 100644 --- a/test/common/common/TestValues.hpp +++ b/test/common/common/TestValues.hpp @@ -151,7 +151,8 @@ namespace TestValues inline std::initializer_list gemmProblemSizes = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}}; - inline std::initializer_list> macroTileSizes = {{64, 64}}; + // inline std::initializer_list> macroTileSizes = {{64, 64}}; + inline std::initializer_list> macroTileSizes = {{128, 128}}; // Looks like {128, 128} fails // {{64, 64}, {128, 128}}; // Also with the current set up - datatype "double" fails. From 334635898d82a07d50b7872214a111ec4bda54b7 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Mon, 17 Mar 2025 23:02:08 +0000 Subject: [PATCH 23/32] Fix compile-errors after merge from origin/main --- test/catch/AddressCalculationTest.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 5564ec6c..c8e72511 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -129,10 +129,8 @@ namespace AddressCalculationTest // fastArithmetic is being used eventually when the expressions are generated. auto identity_transducer = [&](auto expr) { return expr; }; auto coords = CoordG::Transformer( - std::make_shared( - m_kGraph.coordinates), - m_context, - identity_transducer); + + &(m_kGraph.coordinates), identity_transducer); auto fullStop = [&](int tag) { return tag == increment; }; auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream; @@ -153,7 +151,7 @@ namespace AddressCalculationTest // Compute an offset address if we don't have an // associated base address to inherit from { - // base < 0 by the time control reacheds here. + // base < 0 by the time control reached here. auto indexExpr = ci.forward ? coords.forward({target})[0] : coords.reverse({target})[0]; @@ -802,9 +800,8 @@ namespace AddressCalculationTest auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); { auto boolType = resultVariableType(is_zero_diff).dataType; - AssertFatal(boolType == DataType::Bool64, - "is_zero type {}", - toString(boolType)); + AssertFatal( + boolType == DataType::Bool64, "is_zero type: ", toString(boolType)); } // boolean_true = boolean_true & is_zero_diff @@ -863,7 +860,7 @@ namespace AddressCalculationTest std::vector m_hostBuffer2; }; -#if 0 +#if 1 TEST_CASE("address calculation test generate and run", "[expression][gpu]") { // Noticed that for "float" type, all different combinations of @@ -941,7 +938,7 @@ namespace AddressCalculationTest } } #endif -#if 1 +#if 0 TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]") { auto context = TestContext::ForTestDevice({}, "128x128_one_pair"); From 064d8e7db72e314d25d07f38b1e00b6afbbbe01d Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 18 Mar 2025 19:15:32 +0000 Subject: [PATCH 24/32] clean ups and for debugging --- test/catch/AddressCalculationTest.cpp | 9 +++++++-- test/common/common/TestValues.hpp | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index c8e72511..34999f30 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -795,8 +795,13 @@ namespace AddressCalculationTest // Compute the value 2 Register::ValuePtr v_value_2 = nullptr; co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context); - - // Compute diff (value_1 - value_2) +#if 1 + std::cout << "v_value_1 resultType: " << rocRoller::Expression::resultVariableType(v_value_1) << "\n"; + std::cout << (v_value_1->toString()) << " " << (v_value_1->description()) << "\n"; + std::cout << "v_value_2 resultType: " << rocRoller::Expression::resultVariableType(v_value_2) << "\n"; + std::cout << (v_value_2->toString()) << " " << (v_value_2->description()) << "\n"; +#endif + // Compute diff (value_1 == value_2) auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); { auto boolType = resultVariableType(is_zero_diff).dataType; diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp index 90f31835..b73ba254 100644 --- a/test/common/common/TestValues.hpp +++ b/test/common/common/TestValues.hpp @@ -151,8 +151,7 @@ namespace TestValues inline std::initializer_list gemmProblemSizes = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}}; - // inline std::initializer_list> macroTileSizes = {{64, 64}}; - inline std::initializer_list> macroTileSizes = {{128, 128}}; + inline std::initializer_list> macroTileSizes = {{64, 64}, {128, 128}}; // Looks like {128, 128} fails // {{64, 64}, {128, 128}}; // Also with the current set up - datatype "double" fails. From 8b2d8bf2efa86354768c3697bcdf75df369c62fd Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 19 Mar 2025 01:26:50 +0000 Subject: [PATCH 25/32] Fixing logical errors in AddressCalculationTest after merging Scott's big PR --- .../KernelGraph/RegisterTagManager_impl.hpp | 5 +++++ test/catch/AddressCalculationTest.cpp | 19 +++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp index dffd2fd2..7efbc999 100644 --- a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp +++ b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp @@ -122,8 +122,13 @@ namespace rocRoller inline void RegisterTagManager::addRegister(int tag, Register::ValuePtr value) { + AssertFatal(!hasExpression(tag), "Tag already associated with an expression"); +#if 0 + // Assertion fails on AddressCalculationTest - s2, s3 for workgroupIndex X and Y. AssertFatal(!hasRegister(tag), "Tag ", tag, " already in RegisterTagManager."); +#endif + if(auto existingTag = findRegister(value)) { AssertFatal(value->readOnly(), diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 34999f30..eda30178 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -131,6 +131,7 @@ namespace AddressCalculationTest auto coords = CoordG::Transformer( &(m_kGraph.coordinates), identity_transducer); + coords.fillExecutionCoordinates(m_context); auto fullStop = [&](int tag) { return tag == increment; }; auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream; @@ -795,12 +796,7 @@ namespace AddressCalculationTest // Compute the value 2 Register::ValuePtr v_value_2 = nullptr; co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context); -#if 1 - std::cout << "v_value_1 resultType: " << rocRoller::Expression::resultVariableType(v_value_1) << "\n"; - std::cout << (v_value_1->toString()) << " " << (v_value_1->description()) << "\n"; - std::cout << "v_value_2 resultType: " << rocRoller::Expression::resultVariableType(v_value_2) << "\n"; - std::cout << (v_value_2->toString()) << " " << (v_value_2->description()) << "\n"; -#endif + // Compute diff (value_1 == value_2) auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); { @@ -865,7 +861,6 @@ namespace AddressCalculationTest std::vector m_hostBuffer2; }; -#if 1 TEST_CASE("address calculation test generate and run", "[expression][gpu]") { // Noticed that for "float" type, all different combinations of @@ -895,11 +890,9 @@ namespace AddressCalculationTest // Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa. // Bitwise expression generator doesn't promote operands' datatype whereas // other binary arithmetic operations, e.g. Add, do the regType/dataType promotion. -#if 0 - auto singleDataType = GENERATE(DataType::Float); -#else + auto singleDataType = GENERATE(DataType::Float, DataType::Double); -#endif + //CAPTURE(singleDataType); //INFO("s" << singleDataType); std::cout << "singleType: " << singleDataType << "\n"; @@ -942,8 +935,7 @@ namespace AddressCalculationTest } } } -#endif -#if 0 + TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]") { auto context = TestContext::ForTestDevice({}, "128x128_one_pair"); @@ -979,5 +971,4 @@ namespace AddressCalculationTest AddressCalculationTest kernel(context.get(), problem, gemm); kernel.test_sanity_indices(); } -#endif } From b0899d1675ba66e4c23fb402d515841f1c15cb48 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 19 Mar 2025 01:45:28 +0000 Subject: [PATCH 26/32] Remove std::cout and add CAPTURE --- test/catch/AddressCalculationTest.cpp | 153 +++++++++++--------------- 1 file changed, 66 insertions(+), 87 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index eda30178..0afbaa1f 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -622,16 +622,12 @@ namespace AddressCalculationTest for(int i = 0, size = m_hostBuffer.size(); i < m_totalWorkitemCount; i++) { - // For 128 by 128 output matrix, workgroupCount computed is {512, 2, 1} - if(toString(m_hostBuffer[i]) != host_x || toString(m_hostBuffer2[i]) != host_y) - { - std::cout - << "workitemCount.x and workitemCount.y in kernel for global workitem " << i - << ": " << m_hostBuffer[i] << ", " << m_hostBuffer2[i] << "\n"; - std::cout << "workitemCount.x and workitemCount.y in host for global workitem " - << i << ": " << host_x << ", " << host_y << "\n"; - } + // workitemCount.x in device and host. + CAPTURE(toString(m_hostBuffer[i]), host_x, i); CHECK(toString(m_hostBuffer[i]) == host_x); + + // workitemCount.y in device and host. + CAPTURE(toString(m_hostBuffer2[i]), host_y, i); CHECK(toString(m_hostBuffer2[i]) == host_y); } } @@ -752,77 +748,74 @@ namespace AddressCalculationTest auto allone_uint64 = std::make_shared(static_cast(0xFFFFFFFF)); - auto kb = [&]() - // return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]() - -> Generator { - // store base addr - Register::ValuePtr s_ptr; + // context, input, widenedInput, workitemCount, workgroupSize, allone_uint64 are used + // inside the kb. + auto kb = [&]() -> Generator { + // store base addr + Register::ValuePtr s_ptr; - co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr); + co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr); - Register::ValuePtr s_ptr2; + Register::ValuePtr s_ptr2; - co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2); + co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2); - // 2-D - auto compare_res_pointer = get64BitVectorOffset( - m_context, - m_context->kernel()->workitemCount(), - (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()) - .value()); - Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + // 2-D + auto compare_res_pointer = get64BitVectorOffset( + m_context, + m_context->kernel()->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value()); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); - Register::ValuePtr v_offset = nullptr; - co_yield Expression::generate( - v_offset, compare_res_pointer + s_ptr->expression(), m_context); + Register::ValuePtr v_offset = nullptr; + co_yield Expression::generate( + v_offset, compare_res_pointer + s_ptr->expression(), m_context); - // boolean_diff was allocated to s[0:1] - // diff should be computed per lane, - // but is_zero_diff is actually one-bit - auto boolean_true = Register::Value::WavefrontPlaceholder(m_context); - Register::ValuePtr v_allone; - co_yield Expression::generate(v_allone, allone_uint64, m_context); + // boolean_diff was allocated to s[0:1] + // diff should be computed per lane, + // but is_zero_diff is actually one-bit + auto boolean_true = Register::Value::WavefrontPlaceholder(m_context); + Register::ValuePtr v_allone; + co_yield Expression::generate(v_allone, allone_uint64, m_context); - co_yield m_context->copier()->copy( - boolean_true, v_allone, "set to true for all lanes"); + co_yield m_context->copier()->copy( + boolean_true, v_allone, "set to true for all lanes"); - Register::ValuePtr temp_res; - for(int i = 0, size = indexExprPtrs.size(); i < size; i++) + Register::ValuePtr temp_res; + for(int i = 0, size = indexExprPtrs.size(); i < size; i++) + { + // Compute the value 1 + Register::ValuePtr v_value_1 = nullptr; + co_yield Expression::generate(v_value_1, indexExprPtrs[i], m_context); + + // Compute the value 2 + Register::ValuePtr v_value_2 = nullptr; + co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context); + + // Compute diff (value_1 == value_2) + auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); { - // Compute the value 1 - Register::ValuePtr v_value_1 = nullptr; - co_yield Expression::generate(v_value_1, indexExprPtrs[i], m_context); - - // Compute the value 2 - Register::ValuePtr v_value_2 = nullptr; - co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context); - - // Compute diff (value_1 == value_2) - auto is_zero_diff = v_value_1->expression() == v_value_2->expression(); - { - auto boolType = resultVariableType(is_zero_diff).dataType; - AssertFatal( - boolType == DataType::Bool64, "is_zero type: ", toString(boolType)); - } - - // boolean_true = boolean_true & is_zero_diff - auto accumRes - = std::make_shared(Expression::BitwiseAnd{ - boolean_true->expression(), is_zero_diff, "accum"}); - { - auto accumType = resultVariableType(accumRes).dataType; - AssertFatal(accumType == DataType::Bool64, - "accum type {}", - toString(accumType)); - } - co_yield Expression::generate(boolean_true, accumRes, m_context); + auto boolType = resultVariableType(is_zero_diff).dataType; + AssertFatal( + boolType == DataType::Bool64, "is_zero type: ", toString(boolType)); } - auto v_value = Register::Value::Placeholder( - m_context, Register::Type::Vector, DataType::UInt64, 1); - co_yield m_context->copier()->copy(v_value, boolean_true, "Move value"); - co_yield m_context->mem()->storeGlobal(v_offset, v_value, 0, 8); - }; + // boolean_true = boolean_true & is_zero_diff + auto accumRes = std::make_shared( + Expression::BitwiseAnd{boolean_true->expression(), is_zero_diff, "accum"}); + { + auto accumType = resultVariableType(accumRes).dataType; + AssertFatal( + accumType == DataType::Bool64, "accum type {}", toString(accumType)); + } + co_yield Expression::generate(boolean_true, accumRes, m_context); + } + + auto v_value = Register::Value::Placeholder( + m_context, Register::Type::Vector, DataType::UInt64, 1); + co_yield m_context->copier()->copy(v_value, boolean_true, "Move value"); + co_yield m_context->mem()->storeGlobal(v_offset, v_value, 0, 8); + }; m_context->schedule(kb()); @@ -834,11 +827,8 @@ namespace AddressCalculationTest for(int i = 0, size = m_hostBuffer.size(); i < size; i++) { - if(m_hostBuffer[i] != 0xFFFFFFFF) - { - std::cout << "The addresses are not same at " << i << " " << m_hostBuffer[i] - << "\n"; - } + // The two addresses are not same at i. + CAPTURE(i, m_hostBuffer[i]); CHECK(m_hostBuffer[i] == 0xFFFFFFFF); } @@ -882,19 +872,10 @@ namespace AddressCalculationTest // Or, it could be simply a bug. // Called single as the one data type is applied to all A, B, C and D matrices. - // TODO: Add more dataTypes. Also notice other TODO in the function "setTensorArguments()", - // where device pointer for matrices are allocated. Currently, only "float" is used. - // TODO: Debug. With DataType::Double, the test fails. - // The bug is from generating a widened expression. - // When it was generated, fast modulo introduces BitwiseAnd operations. - // Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa. - // Bitwise expression generator doesn't promote operands' datatype whereas - // other binary arithmetic operations, e.g. Add, do the regType/dataType promotion. + // TODO: check transpose auto singleDataType = GENERATE(DataType::Float, DataType::Double); - //CAPTURE(singleDataType); - //INFO("s" << singleDataType); std::cout << "singleType: " << singleDataType << "\n"; DYNAMIC_SECTION(singleDataType) { @@ -909,8 +890,6 @@ namespace AddressCalculationTest auto [macM, macN] = GENERATE(values(TestValues::macroTileSizes)); DYNAMIC_SECTION("mc_" << macM << "x" << macN) { - //CAPTURE(problemSize); - //INFO("p " << problemSize); std::cout << "macM: " << macM << "\n"; std::cout << "macN: " << macN << "\n"; @@ -971,4 +950,4 @@ namespace AddressCalculationTest AddressCalculationTest kernel(context.get(), problem, gemm); kernel.test_sanity_indices(); } -} +} \ No newline at end of file From 421daf5d3f8e70b16a52908bc7b7f20302cd2363 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 19 Mar 2025 21:06:42 +0000 Subject: [PATCH 27/32] Changing/Remove codes after merging from origin/main --- test/catch/AddressCalculationTest.cpp | 6 +- test/common/WidenTo64bit.cpp | 274 -------------------------- test/common/common/WidenTo64bit.hpp | 12 -- 3 files changed, 3 insertions(+), 289 deletions(-) delete mode 100644 test/common/WidenTo64bit.cpp delete mode 100644 test/common/common/WidenTo64bit.hpp diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 0afbaa1f..24f4bac5 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -697,7 +697,7 @@ namespace AddressCalculationTest auto eptr = indexExprPtrs[0]; Log::debug("== Expr : {} ", toString(eptr)); - widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr)); + widenedExprPtrs.push_back(rocRollerTest::widenAddrExprTo64bit(eptr)); Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); auto fast = Expression::FastArithmetic(m_context); @@ -741,7 +741,7 @@ namespace AddressCalculationTest auto eptr = indexExprPtrs[i]; Log::debug("== Expr : {} ", toString(eptr)); - widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr)); + widenedExprPtrs.push_back(rocRollerTest::widenAddrExprTo64bit(eptr)); Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back())); } diff --git a/test/common/WidenTo64bit.cpp b/test/common/WidenTo64bit.cpp deleted file mode 100644 index dfb21645..00000000 --- a/test/common/WidenTo64bit.cpp +++ /dev/null @@ -1,274 +0,0 @@ -#include -#include -#include - -template -constexpr auto cast_to_unsigned(T val) -{ - return static_cast::type>(val); -} - -namespace rocRollerTest -{ - - using namespace rocRoller; - - struct WidenTo64BitVisitor - { - - Expression::ExpressionPtr operator()(Expression::Convert const& expr) const - { - Expression::Convert cpy = expr; - if(expr.arg) - { - // Here is an assumption that call(cpy.arg) never goes above 64-bit and - // input convert's destination types are either int32, uint32, int64 or uint64. - // resultVaribleType(expr.arg) is not called intentionally as it will visit the - // subtree of expr.arg again. - // We can also import similar logic from ExpressionResultTypeVisitor and - // make the operator()(...) return a pair of Expression::ExpressionPtr and VariableType - // in order to avoid repeated visit of the subtree of cpy.arg. - cpy.arg = call(expr.arg); - if(expr.destinationType == DataType::UInt32) - return convert(DataType::UInt64, cpy.arg); - else if(expr.destinationType == DataType::Int32) - return convert(DataType::Int64, cpy.arg); - else if(expr.destinationType == DataType::UInt64 - || expr.destinationType == DataType::Int64) - return convert(expr.destinationType, cpy.arg); - else - { - AssertFatal(false, - "Expected Destination type for a convert is either (u)int{32,64}"); - return nullptr; - } - } - - return std::make_shared(cpy); - } - - Expression::ExpressionPtr operator()(Expression::Negate const& expr) const - { - Expression::Negate cpy = expr; - if(expr.arg) - { - cpy.arg = call(expr.arg); - } - - return std::make_shared(cpy); - } - - template - Expression::ExpressionPtr operator()(Expr const& expr) const - { - // Only Expression::Convert and perhaps negates are expected. - AssertFatal(false, "Unexpected Unary expression", ShowValue(expr)); - return nullptr; - } - - Expression::ExpressionPtr operator()(Expression::Divide const& expr) const - { - // TODO: Add check that divisor is a CValue within (u)int32-bit - Log::debug("Divisor: {} ", toString(expr.rhs)); - AssertFatal(isExpectedLeafType(expr.rhs)); - - Expression::Divide cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - - return std::make_shared(cpy); - } - - Expression::ExpressionPtr operator()(Expression::Modulo const& expr) const - { - // TODO: Add check that divisor is a CValue within (u)int32-bit - Log::debug("Modulo: {} ", toString(expr.rhs)); - AssertFatal(isExpectedLeafType(expr.rhs)); - - Expression::Modulo cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - - return std::make_shared(cpy); - } - - // TODO: for some shifts, logical, subtract operations these may not be correct. - // Not sure yet if address calculation expressions with identity_transducer - // those types of Expression or not. - template - requires(Expression::CArithmetic) Expression::ExpressionPtr - operator()(Expr const& expr) const - { - if constexpr(std::same_as) - { - AssertFatal(false, "Subtracts are not expected"); - return nullptr; - } - - if constexpr(Expression::CLogical) - { - AssertFatal(false, "logicals are not expected"); - return nullptr; - } - - Expr cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - if(expr.rhs) - cpy.rhs = call(expr.rhs); - - return std::make_shared(cpy); - } - - // Even with catch-all operator()(Expression::ExpressionPtr) without following, - // compilation fails. - template - requires(Expression::CBinary) Expression::ExpressionPtr - operator()(Expr const& expr) const - { - AssertFatal(false, "Not expected expr : ", ShowValue(expr)); - return nullptr; - } - - template - requires(Expression::CArithmetic) Expression::ExpressionPtr - operator()(Expr const& expr) const - { - Expr cpy = expr; - if(expr.lhs) - cpy.lhs = call(expr.lhs); - if(expr.r1hs) - cpy.r1hs = call(expr.r1hs); - if(expr.r2hs) - cpy.r2hs = call(expr.r2hs); - - return std::make_shared(cpy); - } - - template - requires(Expression::CTernary) Expression::ExpressionPtr - operator()(Expr const& expr) const - { - AssertFatal(false, "Not expected expr : ", ShowValue(expr)); - return nullptr; - } - - // leaves - Expression::ExpressionPtr operator()(CommandArgumentPtr const& expr) const - { - Log::debug("CommandArgumentPtr {}", Expression::toString(expr)); - - auto varType = expr->variableType(); - - assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); - - CommandArgumentPtr cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - Expression::ExpressionPtr operator()(CommandArgumentValue const& expr) const - { - Log::debug("CommandArgumentValue {} Type {} ", - Expression::toString(expr), - toString(variableType(expr))); - - auto varType = variableType(expr); - - assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); - - CommandArgumentValue cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - Expression::ExpressionPtr operator()(Register::ValuePtr const& expr) const - { - Log::debug("Register::ValuePtr {}", Expression::toString(expr)); - - auto varType = expr->variableType(); - - assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); - - Register::ValuePtr cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - Expression::ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const - { - Log::debug("AssemblyKernelArgumentPtr {} its expression is {}", - Expression::toString(expr), - Expression::toString(expr->expression)); - - auto varType = expr->variableType; - - assertIfNotExpectedType(varType.dataType, Expression::toString(expr)); - - AssemblyKernelArgumentPtr cpy = expr; - return widenTo64(varType.dataType, cpy); - } - - // catch the rest CValue - template - Expression::ExpressionPtr operator()(Value const& expr) const - { - AssertFatal( - false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr)); - return nullptr; - } - - Expression::ExpressionPtr operator()(Expression::Expression const& expr) const - { - AssertFatal(false, "No expectation to meet this type of Expression: ", ShowValue(expr)); - return nullptr; - } - - template - Expression::ExpressionPtr widenTo64(DataType srcType, T const& expr) const - { - if(srcType == DataType::UInt32) - return convert(DataType::UInt64, std::make_shared(expr)); - else if(srcType == DataType::Int32) - return convert(DataType::Int64, std::make_shared(expr)); - - return std::make_shared(expr); - } - - Expression::ExpressionPtr call(Expression::ExpressionPtr const& expr) const - { - return std::visit(*this, *expr); - } - - void assertIfNotExpectedType(DataType dt, std::string const& showValue) const - { - AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32 || dt == DataType::UInt64 - || dt == DataType::Int64, - "Unexpected DataType for Command/Kernel arguments or " - "workgroup/item indices ", - showValue); - } - - bool isExpectedLeafType(Expression::ExpressionPtr const& expr) const { - return std::holds_alternative(*expr) || - std::holds_alternative(*expr) || - std::holds_alternative(*expr) || - std::holds_alternative(*expr); - } - - }; - - Expression::ExpressionPtr widenTo64bit(Expression::ExpressionPtr expr) - { - auto origVarType = resultVariableType(expr); - - auto visitor = WidenTo64BitVisitor(); - auto widened = visitor.call(expr); - - auto finalVarType = resultVariableType(widened); - - AssertFatal(origVarType.dataType == finalVarType.dataType, - "Original and final data types should be the same", - ShowValue(origVarType.dataType), - ShowValue(finalVarType.dataType)); - - return widened; - } -} diff --git a/test/common/common/WidenTo64bit.hpp b/test/common/common/WidenTo64bit.hpp deleted file mode 100644 index 33a84085..00000000 --- a/test/common/common/WidenTo64bit.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#include -namespace rocRollerTest { -/** - * @brief Widen (u)int32 to (u)int64. - * - * Has many assumptions in input expr. See the implementation for details. - * - * @param expr Input expression - * @return ExpressionPtr Transformed expression - */ - rocRoller::Expression::ExpressionPtr widenTo64bit(rocRoller::Expression::ExpressionPtr expr); -} From 3be55657305b82c423e7152810ff3f91d172af65 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Wed, 19 Mar 2025 22:39:10 +0000 Subject: [PATCH 28/32] Move kerner bodies into test --- test/catch/AddressCalculationTest.cpp | 284 ++++++++++++-------------- 1 file changed, 131 insertions(+), 153 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 24f4bac5..3515bc5d 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -305,135 +305,6 @@ namespace AddressCalculationTest return compare_res_pointer; } - // This is for printing out workgroup index and thread index. - auto kb_sanity_indices(ContextPtr context, - std::array const& workitemCount, - std::array const& workgroupSize) - { - return [context, workitemCount, workgroupSize]() -> Generator { - // store base addr - Register::ValuePtr s_ptr; - co_yield context->argLoader()->getValue("rv_ptr", s_ptr); - - Register::ValuePtr s_ptr2; - co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); - - auto compare_res_pointer - = get64BitVectorOffset(context, workitemCount, workgroupSize); - Log::debug("Offset in kb: {}", toString(compare_res_pointer)); - - Register::ValuePtr v_offset_1 = nullptr; - co_yield Expression::generate( - v_offset_1, compare_res_pointer + s_ptr->expression(), context); - - Register::ValuePtr v_offset_2 = nullptr; - co_yield Expression::generate( - v_offset_2, compare_res_pointer + s_ptr2->expression(), context); - - // workgroupIndex x - auto v_wg_x = Register::Value::Placeholder( - context, Register::Type::Vector, DataType::UInt32, 1); - co_yield context->copier()->copy( - v_wg_x, context->kernel()->workgroupIndex()[0], "copy wgi.x to v"); - co_yield context->mem()->storeGlobal(v_offset_1, v_wg_x, 0, 4); - - co_yield context->mem()->storeGlobal( - v_offset_2, (context->kernel()->workitemIndex())[0], 0, 4); - }; - } - - // Just print out passed expressions to device memory. - // If passed expressions are workitemCounts, the expectation is that - // the generated expressions' values are the same with the host-side computed values. - auto kb_implicit_workitemcount(ContextPtr context, - Expression::ExpressionPtr const& workitemcount_X, - Expression::ExpressionPtr const& workitemcount_Y, - std::array const& workgroupSize) - { - return [context, - workitemcount_X, - workitemcount_Y, - workgroupSize]() -> Generator { - // store base addrs - Register::ValuePtr s_ptr; - co_yield context->argLoader()->getValue("rv_ptr", s_ptr); - Register::ValuePtr s_ptr2; - co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); - auto compare_res_pointer = get64BitVectorOffset( - context, {workitemcount_X, workitemcount_Y}, workgroupSize); - Log::debug("Offset in kb: {}", toString(compare_res_pointer)); - - Register::ValuePtr v_offset_1 = nullptr; - co_yield Expression::generate( - v_offset_1, compare_res_pointer + s_ptr->expression(), context); - - Register::ValuePtr v_offset_2 = nullptr; - co_yield Expression::generate( - v_offset_2, compare_res_pointer + s_ptr2->expression(), context); - - // Compute the value 1 - Register::ValuePtr s_value_1 = nullptr; - co_yield Expression::generate(s_value_1, workitemcount_X, context); - auto v_value_11 = Register::Value::Placeholder( - context, Register::Type::Vector, DataType::Int64, 1); - co_yield context->copier()->copy(v_value_11, s_value_1, "copy to v1"); - - // Compute the value 2 - Register::ValuePtr s_value_2 = nullptr; - co_yield Expression::generate(s_value_2, workitemcount_Y, context); - auto v_value_22 = Register::Value::Placeholder( - context, Register::Type::Vector, DataType::Int64, 1); - co_yield context->copier()->copy(v_value_22, s_value_2, "copy to v2"); - - co_yield context->mem()->storeGlobal(v_offset_1, v_value_11, 0, 8); - co_yield context->mem()->storeGlobal(v_offset_2, v_value_22, 0, 8); - }; - } - - // For now, mainly for debugging, directly copy the two results values. - // workitemCount is passed as argument. Since it is 64-bit - // different global_store_dwordx2 format should be used. - auto kb_equal_one(ContextPtr context, - Expression::ExpressionPtr const& input, - Expression::ExpressionPtr const& widenedInput, - std::array const& workitemCount, - std::array const& workgroupSize) - { - return [context, input, widenedInput, workitemCount, workgroupSize]() - -> Generator { - // store base addr - Register::ValuePtr s_ptr; - co_yield context->argLoader()->getValue("rv_ptr", s_ptr); - - Register::ValuePtr s_ptr2; - co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2); - - auto compare_res_pointer - = get64BitVectorOffset(context, workitemCount, workgroupSize); - Log::debug("Offset in kb: {}", toString(compare_res_pointer)); - - Register::ValuePtr v_offset_1 = nullptr; - co_yield Expression::generate( - v_offset_1, compare_res_pointer + s_ptr->expression(), context); - - Register::ValuePtr v_offset_2 = nullptr; - co_yield Expression::generate( - v_offset_2, compare_res_pointer + s_ptr2->expression(), context); - - // Compute the value 1 - Register::ValuePtr v_value_1 = nullptr; - co_yield Expression::generate(v_value_1, input, context); - - // Compute the value 2 - Register::ValuePtr v_value_2 = nullptr; - - co_yield Expression::generate(v_value_2, widenedInput, context); - - co_yield context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8); - co_yield context->mem()->storeGlobal(v_offset_2, v_value_2, 0, 8); - }; - } - void setTensorArguments(CommandArguments& commandArgs, GEMMProblem const& problem, rocRollerTest::Graphs::GEMM const& gemm, @@ -600,11 +471,50 @@ namespace AddressCalculationTest generateOrigKernelByProlog(); auto k = m_context->kernel(); - m_context->schedule(kb_implicit_workitemcount( - m_context, - k->workitemCount()[0], - k->workitemCount()[1], - (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + + // [context, workitemcount_X, workitemcount_Y, workgroupSize] are used. + // Just print out passed expressions to device memory. + // If passed expressions are workitemCounts, the expectation is that + // the generated expressions' values are the same with the host-side computed values. + auto kb = [&]() -> Generator { + // store base addrs + Register::ValuePtr s_ptr; + co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr); + Register::ValuePtr s_ptr2; + co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2); + auto compare_res_pointer = get64BitVectorOffset( + m_context, + k->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value()); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset_1 = nullptr; + co_yield Expression::generate( + v_offset_1, compare_res_pointer + s_ptr->expression(), m_context); + + Register::ValuePtr v_offset_2 = nullptr; + co_yield Expression::generate( + v_offset_2, compare_res_pointer + s_ptr2->expression(), m_context); + + // Compute the value 1 + Register::ValuePtr s_value_1 = nullptr; + co_yield Expression::generate(s_value_1, k->workitemCount()[0], m_context); + auto v_value_11 = Register::Value::Placeholder( + m_context, Register::Type::Vector, DataType::Int64, 1); + co_yield m_context->copier()->copy(v_value_11, s_value_1, "copy to v1"); + + // Compute the value 2 + Register::ValuePtr s_value_2 = nullptr; + co_yield Expression::generate(s_value_2, k->workitemCount()[1], m_context); + auto v_value_22 = Register::Value::Placeholder( + m_context, Register::Type::Vector, DataType::Int64, 1); + co_yield m_context->copier()->copy(v_value_22, s_value_2, "copy to v2"); + + co_yield m_context->mem()->storeGlobal(v_offset_1, v_value_11, 0, 8); + co_yield m_context->mem()->storeGlobal(v_offset_2, v_value_22, 0, 8); + }; + + m_context->schedule(kb()); m_context->schedule(k->postamble()); m_context->schedule(k->amdgpu_metadata()); @@ -632,16 +542,47 @@ namespace AddressCalculationTest } } - void test_sanity_indices() + void test_wg_thr_indices() { generateOrigKernelByProlog(); auto k = m_context->kernel(); - m_context->schedule(kb_sanity_indices( - m_context, - k->workitemCount(), - (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + // This is for printing out workgroup index and thread index + // [context, workitemCount, workgroupSize] are used. + auto kb = [&]() -> Generator { + // store base addr + Register::ValuePtr s_ptr; + co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr); + + Register::ValuePtr s_ptr2; + co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2); + + auto compare_res_pointer = get64BitVectorOffset( + m_context, + k->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value()); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset_1 = nullptr; + co_yield Expression::generate( + v_offset_1, compare_res_pointer + s_ptr->expression(), m_context); + + Register::ValuePtr v_offset_2 = nullptr; + co_yield Expression::generate( + v_offset_2, compare_res_pointer + s_ptr2->expression(), m_context); + + // workgroupIndex x + auto v_wg_x = Register::Value::Placeholder( + m_context, Register::Type::Vector, DataType::UInt32, 1); + co_yield m_context->copier()->copy( + v_wg_x, k->workgroupIndex()[0], "copy wgi.x to v"); + co_yield m_context->mem()->storeGlobal(v_offset_1, v_wg_x, 0, 4); + + co_yield m_context->mem()->storeGlobal(v_offset_2, (k->workitemIndex())[0], 0, 4); + }; + + m_context->schedule(kb()); m_context->schedule(k->postamble()); m_context->schedule(k->amdgpu_metadata()); @@ -683,7 +624,7 @@ namespace AddressCalculationTest } } - void test_equal_one() + void test_equal_one_pair() { generateOrigKernelByProlog(); @@ -704,12 +645,45 @@ namespace AddressCalculationTest Log::debug("** fast : {} ", toString(fast(widenedExprPtrs.back()))); auto k = m_context->kernel(); - m_context->schedule(kb_equal_one( - m_context, - indexExprPtrs[0], - widenedExprPtrs[0], - k->workitemCount(), - (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())()); + + // context, input, widenedInput, workitemCount, workgroupSize, are used + // inside the kb. + auto kb = [&]() -> Generator { + // store base addr + Register::ValuePtr s_ptr; + co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr); + + Register::ValuePtr s_ptr2; + co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2); + + auto compare_res_pointer = get64BitVectorOffset( + m_context, + k->workitemCount(), + (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value()); + Log::debug("Offset in kb: {}", toString(compare_res_pointer)); + + Register::ValuePtr v_offset_1 = nullptr; + co_yield Expression::generate( + v_offset_1, compare_res_pointer + s_ptr->expression(), m_context); + + Register::ValuePtr v_offset_2 = nullptr; + co_yield Expression::generate( + v_offset_2, compare_res_pointer + s_ptr2->expression(), m_context); + + // Compute the value 1 + Register::ValuePtr v_value_1 = nullptr; + co_yield Expression::generate(v_value_1, indexExprPtrs[0], m_context); + + // Compute the value 2 + Register::ValuePtr v_value_2 = nullptr; + + co_yield Expression::generate(v_value_2, widenedExprPtrs[0], m_context); + + co_yield m_context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8); + co_yield m_context->mem()->storeGlobal(v_offset_2, v_value_2, 0, 8); + }; + + m_context->schedule(kb()); m_context->schedule(k->postamble()); m_context->schedule(k->amdgpu_metadata()); @@ -727,7 +701,7 @@ namespace AddressCalculationTest } } - void test_equal() + void test_equal_all_pairs() { generateOrigKernelByProlog(); @@ -748,6 +722,8 @@ namespace AddressCalculationTest auto allone_uint64 = std::make_shared(static_cast(0xFFFFFFFF)); + auto k = m_context->kernel(); + // context, input, widenedInput, workitemCount, workgroupSize, allone_uint64 are used // inside the kb. auto kb = [&]() -> Generator { @@ -763,7 +739,7 @@ namespace AddressCalculationTest // 2-D auto compare_res_pointer = get64BitVectorOffset( m_context, - m_context->kernel()->workitemCount(), + k->workitemCount(), (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value()); Log::debug("Offset in kb: {}", toString(compare_res_pointer)); @@ -819,7 +795,6 @@ namespace AddressCalculationTest m_context->schedule(kb()); - auto k = m_context->kernel(); m_context->schedule(k->postamble()); m_context->schedule(k->amdgpu_metadata()); @@ -909,12 +884,13 @@ namespace AddressCalculationTest // Generate a kernel for testing address calculation and run. // Verification of the result is done. - kernel.test_equal(); + kernel.test_equal_all_pairs(); } } } } + // Following test checks only one-pair, simpler version of above. TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]") { auto context = TestContext::ForTestDevice({}, "128x128_one_pair"); @@ -924,9 +900,10 @@ namespace AddressCalculationTest gemm.setProblem(problem); AddressCalculationTest kernel(context.get(), problem, gemm); - kernel.test_equal_one(); + kernel.test_equal_one_pair(); } + // Sanity check 1 TEST_CASE("address calculation test implicit workitemcount", "[expression][gpu]") { auto context = TestContext::ForTestDevice({}, "impl_workitemcnt"); @@ -939,7 +916,8 @@ namespace AddressCalculationTest kernel.test_implicit_workitemcount(); } - TEST_CASE("address calculation test sanity check", "[expression][gpu]") + // Sanity check 2 + TEST_CASE("address calculation test workgroup thread index", "[expression][gpu]") { auto context = TestContext::ForTestDevice({}, "128x128_sanity_indices"); @@ -948,6 +926,6 @@ namespace AddressCalculationTest gemm.setProblem(problem); AddressCalculationTest kernel(context.get(), problem, gemm); - kernel.test_sanity_indices(); + kernel.test_wg_thr_indices(); } } \ No newline at end of file From b542e9370089fac5c867fb4040c844d24c2d241d Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Thu, 20 Mar 2025 16:13:19 +0000 Subject: [PATCH 29/32] Fix an assertion failure by hoisting coordinate transformer's fillExecutionCoordinates() out of a loop body --- .../InstructionValues/Register_impl.hpp | 3 --- .../KernelGraph/RegisterTagManager_impl.hpp | 5 ----- test/catch/AddressCalculationTest.cpp | 17 ++++++++--------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/lib/include/rocRoller/InstructionValues/Register_impl.hpp b/lib/include/rocRoller/InstructionValues/Register_impl.hpp index 083d24c1..e4dc7e51 100644 --- a/lib/include/rocRoller/InstructionValues/Register_impl.hpp +++ b/lib/include/rocRoller/InstructionValues/Register_impl.hpp @@ -840,9 +840,6 @@ namespace rocRoller std::vector coords; for(auto i : indices) { - if (i < 0 || (size_t)i >= m_allocationCoord.size()) { - Log::debug("FA i {}", i); - } AssertFatal(i >= 0 && (size_t)i < m_allocationCoord.size(), "Register subset out of bounds.", ShowValue(m_allocationCoord.size()), diff --git a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp index 7efbc999..dffd2fd2 100644 --- a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp +++ b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp @@ -122,13 +122,8 @@ namespace rocRoller inline void RegisterTagManager::addRegister(int tag, Register::ValuePtr value) { - AssertFatal(!hasExpression(tag), "Tag already associated with an expression"); -#if 0 - // Assertion fails on AddressCalculationTest - s2, s3 for workgroupIndex X and Y. AssertFatal(!hasRegister(tag), "Tag ", tag, " already in RegisterTagManager."); -#endif - if(auto existingTag = findRegister(value)) { AssertFatal(value->readOnly(), diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 3515bc5d..f3b829ff 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -76,6 +76,14 @@ namespace AddressCalculationTest std::vector rv; auto root = m_kGraph.control.roots().only(); int count = 0; + + // Note that identity_transduer is intentionally being used here in place of FastArithmetic. + // It is alright to use FastArithmetic here but + // fastArithmetic is being used eventually when the expressions are generated. + auto identity_transducer = [&](auto expr) { return expr; }; + auto coords = CoordG::Transformer(&(m_kGraph.coordinates), identity_transducer); + coords.fillExecutionCoordinates(m_context); + for(auto ciTag : filter(isComputeIndex, m_kGraph.control.depthFirstVisit(root.value()))) { @@ -124,15 +132,6 @@ namespace AddressCalculationTest KernelG::Connections::ComputeIndex{ KernelG::Connections::ComputeIndexArgument::INCREMENT}); - // Note that identity_transduer is intentionally being used here in place of FastArithmetic. - // It is alright to use FastArithmetic here but - // fastArithmetic is being used eventually when the expressions are generated. - auto identity_transducer = [&](auto expr) { return expr; }; - auto coords = CoordG::Transformer( - - &(m_kGraph.coordinates), identity_transducer); - coords.fillExecutionCoordinates(m_context); - auto fullStop = [&](int tag) { return tag == increment; }; auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream; auto [required, path] = findRequiredCoordinates(target, direction, fullStop, m_kGraph); From a529b85edbe9f5a3cb7452ac16369b8e4ce970b9 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Thu, 20 Mar 2025 16:19:49 -0400 Subject: [PATCH 30/32] Small changes --- lib/source/ExpressionTransformations/FastDivision.cpp | 6 ------ test/catch/AddressCalculationTest.cpp | 6 +++--- test/common/common/TestValues.hpp | 3 --- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/lib/source/ExpressionTransformations/FastDivision.cpp b/lib/source/ExpressionTransformations/FastDivision.cpp index aeef0631..c5af98e4 100644 --- a/lib/source/ExpressionTransformations/FastDivision.cpp +++ b/lib/source/ExpressionTransformations/FastDivision.cpp @@ -329,9 +329,6 @@ namespace rocRoller { unsigned int mask = rhs - 1u; auto new_rhs = literal(mask); -#if 1 - Log::debug("Yoonseo - umodulo - name {} toString {} mask {}", name(new_rhs), toString(new_rhs), mask); -#endif return lhs & new_rhs; } @@ -345,9 +342,6 @@ namespace rocRoller int mask = ~(rhs - 1); auto maskExpr = literal(mask); -#if 1 - Log::debug("Yoonseo - smodulo - name {} toString {} mask {}", name(maskExpr), toString(maskExpr), mask); -#endif auto signBitsExpr = literal(signBits); auto reverseShiftAmountExpr = literal(reverseShiftAmount); diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index f3b829ff..cb4de599 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -894,7 +894,7 @@ namespace AddressCalculationTest { auto context = TestContext::ForTestDevice({}, "128x128_one_pair"); - GEMMProblem problem{.m = 128, .n = 128}; + GEMMProblem problem{.m = 128, .n = 128, .macM = 64, .macN = 64, .macK = 64}; rocRollerTest::Graphs::GEMM gemm(DataType::Float); gemm.setProblem(problem); @@ -907,7 +907,7 @@ namespace AddressCalculationTest { auto context = TestContext::ForTestDevice({}, "impl_workitemcnt"); - GEMMProblem problem{.m = 128, .n = 128}; + GEMMProblem problem{.m = 128, .n = 128, .macM = 64, .macN = 64, .macK = 64}; rocRollerTest::Graphs::GEMM gemm(DataType::Float); gemm.setProblem(problem); @@ -920,7 +920,7 @@ namespace AddressCalculationTest { auto context = TestContext::ForTestDevice({}, "128x128_sanity_indices"); - GEMMProblem problem{.m = 128, .n = 128}; + GEMMProblem problem{.m = 128, .n = 128, .macM = 64, .macN = 64, .macK = 64}; rocRollerTest::Graphs::GEMM gemm(DataType::Float); gemm.setProblem(problem); diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp index b73ba254..77db2372 100644 --- a/test/common/common/TestValues.hpp +++ b/test/common/common/TestValues.hpp @@ -152,9 +152,6 @@ namespace TestValues = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}}; inline std::initializer_list> macroTileSizes = {{64, 64}, {128, 128}}; - // Looks like {128, 128} fails - // {{64, 64}, {128, 128}}; - // Also with the current set up - datatype "double" fails. template struct ByType From 8a8439dc0356bdf010bc6edec8ad95a896b5b785 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Tue, 25 Mar 2025 20:08:20 +0000 Subject: [PATCH 31/32] Adding transpose --- test/catch/AddressCalculationTest.cpp | 29 ++++++++++++++++++--------- test/common/common/TestValues.hpp | 2 +- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index cb4de599..07b3d562 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -59,7 +59,7 @@ namespace AddressCalculationTest public: AddressTrace(KernelGraphType const& graph, ContextPtr ctx) : m_kGraph(graph) - , m_context(ctx){}; + , m_context(ctx) {}; std::vector traceComputeIndexWithBuffer(); private: @@ -873,17 +873,28 @@ namespace AddressCalculationTest auto macroTileString = std::to_string(macM) + "x" + std::to_string(macN); auto suffixForKernelName = toString(singleDataType) + "_" + probSizeString + "_" + macroTileString; - auto context = TestContext::ForTestDevice({}, suffixForKernelName); - GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN}; - rocRollerTest::Graphs::GEMM gemm(singleDataType); - gemm.setProblem(problem); + auto [transA, transB] + = GENERATE(values>({{"N", "T"}, {"N", "N"}, {"T", "N"}, {"T", "T"}})); + DYNAMIC_SECTION(transA << transB) + { + std::cout << "transA: " << transA << "\n"; + std::cout << "transB: " << transB << "\n"; + + suffixForKernelName += "_" + transA + transB; + auto context = TestContext::ForTestDevice({}, suffixForKernelName); - AddressCalculationTest kernel(context.get(), problem, gemm); + GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN, + .transA = transA, .transB = transB}; + rocRollerTest::Graphs::GEMM gemm(singleDataType); + gemm.setProblem(problem); - // Generate a kernel for testing address calculation and run. - // Verification of the result is done. - kernel.test_equal_all_pairs(); + AddressCalculationTest kernel(context.get(), problem, gemm); + + // Generate a kernel for testing address calculation and run. + // Verification of the result is done. + kernel.test_equal_all_pairs(); + } } } } diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp index 77db2372..d0357a8a 100644 --- a/test/common/common/TestValues.hpp +++ b/test/common/common/TestValues.hpp @@ -151,7 +151,7 @@ namespace TestValues inline std::initializer_list gemmProblemSizes = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}}; - inline std::initializer_list> macroTileSizes = {{64, 64}, {128, 128}}; + inline std::initializer_list> macroTileSizes = {{64, 64}, {128, 128}, {256, 256}}; template struct ByType From f72ba8fa3ead1cbde5e5d245c4f5e48c58080e26 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi Date: Thu, 27 Mar 2025 18:51:00 +0000 Subject: [PATCH 32/32] Clean up some old comments --- test/catch/AddressCalculationTest.cpp | 17 +++++++++++------ test/common/common/TestValues.hpp | 4 +++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp index 07b3d562..c5e677f7 100644 --- a/test/catch/AddressCalculationTest.cpp +++ b/test/catch/AddressCalculationTest.cpp @@ -59,7 +59,7 @@ namespace AddressCalculationTest public: AddressTrace(KernelGraphType const& graph, ContextPtr ctx) : m_kGraph(graph) - , m_context(ctx) {}; + , m_context(ctx){}; std::vector traceComputeIndexWithBuffer(); private: @@ -846,8 +846,8 @@ namespace AddressCalculationTest // Or, it could be simply a bug. // Called single as the one data type is applied to all A, B, C and D matrices. - // TODO: check transpose + // To cut down execution time further, consider running only large matrices. auto singleDataType = GENERATE(DataType::Float, DataType::Double); std::cout << "singleType: " << singleDataType << "\n"; @@ -874,8 +874,8 @@ namespace AddressCalculationTest auto suffixForKernelName = toString(singleDataType) + "_" + probSizeString + "_" + macroTileString; - auto [transA, transB] - = GENERATE(values>({{"N", "T"}, {"N", "N"}, {"T", "N"}, {"T", "T"}})); + auto [transA, transB] = GENERATE(values>( + {{"N", "T"}, {"N", "N"}, {"T", "N"}, {"T", "T"}})); DYNAMIC_SECTION(transA << transB) { std::cout << "transA: " << transA << "\n"; @@ -884,8 +884,13 @@ namespace AddressCalculationTest suffixForKernelName += "_" + transA + transB; auto context = TestContext::ForTestDevice({}, suffixForKernelName); - GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN, - .transA = transA, .transB = transB}; + GEMMProblem problem{.m = m, + .n = n, + .k = k, + .macM = macM, + .macN = macN, + .transA = transA, + .transB = transB}; rocRollerTest::Graphs::GEMM gemm(singleDataType); gemm.setProblem(problem); diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp index d0357a8a..9fc4116d 100644 --- a/test/common/common/TestValues.hpp +++ b/test/common/common/TestValues.hpp @@ -151,7 +151,9 @@ namespace TestValues inline std::initializer_list gemmProblemSizes = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}}; - inline std::initializer_list> macroTileSizes = {{64, 64}, {128, 128}, {256, 256}}; + // Notice that {256, 256} was intentionally avoided due to extremely prolonged time of + // code generation. + inline std::initializer_list> macroTileSizes = {{64, 64}, {128, 128}}; template struct ByType