From 990ff5f3950a933c10dc1f7a3f481b388bc9a47d Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Tue, 4 Mar 2025 16:36:59 +0000
Subject: [PATCH 01/32] Applying first 4 diffs using diff.diff

---
 lib/include/rocRoller/AssemblyKernel.hpp      |   1 +
 lib/include/rocRoller/AssemblyKernel_impl.hpp |   9 +
 lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp  |  38 ++-
 .../WidenTo64bit.cpp                          | 255 ++++++++++++++++++
 4 files changed, 293 insertions(+), 10 deletions(-)
 create mode 100644 lib/source/ExpressionTransformations/WidenTo64bit.cpp

diff --git a/lib/include/rocRoller/AssemblyKernel.hpp b/lib/include/rocRoller/AssemblyKernel.hpp
index ae5db25e..a945b1b9 100644
--- a/lib/include/rocRoller/AssemblyKernel.hpp
+++ b/lib/include/rocRoller/AssemblyKernel.hpp
@@ -128,6 +128,7 @@ namespace rocRoller
          * @param args Vector of CommandArgument pointers that should be added as arguments.
          */
         void                      addCommandArguments(std::vector<CommandArgumentPtr> args);
+        void                      addNewCommandArguments(std::vector<CommandArgumentPtr> args);
         Expression::ExpressionPtr addCommandArgument(CommandArgumentPtr arg);
 
         std::string args_string();
diff --git a/lib/include/rocRoller/AssemblyKernel_impl.hpp b/lib/include/rocRoller/AssemblyKernel_impl.hpp
index 5f721014..ec21f1f9 100644
--- a/lib/include/rocRoller/AssemblyKernel_impl.hpp
+++ b/lib/include/rocRoller/AssemblyKernel_impl.hpp
@@ -202,6 +202,15 @@ namespace rocRoller
         }
     }
 
+    inline void AssemblyKernel::addNewCommandArguments(std::vector<CommandArgumentPtr> args)
+    {
+        for(auto arg : args)
+        {
+            if(m_argumentNames.find(arg->name()) == m_argumentNames.end())
+                addCommandArgument(arg);
+        }
+    }
+
     inline Expression::ExpressionPtr AssemblyKernel::addCommandArgument(CommandArgumentPtr arg)
     {
         return addArgument({arg->name(),
diff --git a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp
index d4df2566..775498f3 100644
--- a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp
+++ b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp
@@ -1,5 +1,6 @@
 #include <rocRoller/CodeGen/Arithmetic/ArithmeticGenerator.hpp>
 #include <rocRoller/CodeGen/Arithmetic/BitwiseAnd.hpp>
+#include <rocRoller/CodeGen/Arithmetic/Utility.hpp>
 #include <rocRoller/Utilities/Component.hpp>
 
 namespace rocRoller
@@ -55,16 +56,33 @@ namespace rocRoller
             }
             else if(elementBits == 64u)
             {
-                co_yield_(Instruction("v_and_b32",
-                                      {dest->subset({0})},
-                                      {lhs->subset({0}), rhs->subset({0})},
-                                      {},
-                                      ""));
-                co_yield_(Instruction("v_and_b32",
-                                      {dest->subset({1})},
-                                      {lhs->subset({1}), rhs->subset({1})},
-                                      {},
-                                      ""));
+                if(lhs->regType() == Register::Type::Literal)
+                {
+
+                    Register::ValuePtr lsb;
+                    Register::ValuePtr msb;
+                    Arithmetic::get2LiteralDwords(lsb, msb, lhs);
+
+                    // subset() is not applicable to NoAllocation Literal type.
+                    co_yield_(Instruction(
+                        "v_and_b32", {dest->subset({0})}, {lsb, rhs->subset({0})}, {}, ""));
+                    co_yield_(Instruction(
+                        "v_and_b32", {dest->subset({1})}, {msb, rhs->subset({1})}, {}, ""));
+                }
+                else
+                {
+
+                    co_yield_(Instruction("v_and_b32",
+                                          {dest->subset({0})},
+                                          {lhs->subset({0}), rhs->subset({0})},
+                                          {},
+                                          ""));
+                    co_yield_(Instruction("v_and_b32",
+                                          {dest->subset({1})},
+                                          {lhs->subset({1}), rhs->subset({1})},
+                                          {},
+                                          ""));
+                }
             }
             else
             {
diff --git a/lib/source/ExpressionTransformations/WidenTo64bit.cpp b/lib/source/ExpressionTransformations/WidenTo64bit.cpp
new file mode 100644
index 00000000..c9408769
--- /dev/null
+++ b/lib/source/ExpressionTransformations/WidenTo64bit.cpp
@@ -0,0 +1,255 @@
+#include <rocRoller/AssemblyKernelArgument.hpp>
+#include <rocRoller/Expression.hpp>
+#include <rocRoller/ExpressionTransformations.hpp>
+
+template <typename T>
+constexpr auto cast_to_unsigned(T val)
+{
+    return static_cast<typename std::make_unsigned<T>::type>(val);
+}
+
+namespace rocRoller
+{
+    namespace Expression
+    {
+        struct WidenTo64BitVisitor
+        {
+
+            ExpressionPtr operator()(Convert const& expr) const
+            {
+                Convert cpy = expr;
+                if(expr.arg)
+                {
+                    // Here is an assumption that call(cpy.arg) never goes above 64-bit and
+                    // input convert's destination types are either int32, uint32, int64 or uint64.
+                    // resultVaribleType(expr.arg) is not called intentionally as it will visit the
+                    // subtree of expr.arg again.
+                    // We can also import similar logic from ExpressionResultTypeVisitor and
+                    // make the operator()(...) return a pair of ExpressionPtr and VariableType
+                    // in order to avoid repeated visit of the subtree of cpy.arg.
+                    cpy.arg = call(expr.arg);
+                    if(expr.destinationType == DataType::UInt32)
+                        return convert(DataType::UInt64, cpy.arg);
+                    else if(expr.destinationType == DataType::Int32)
+                        return convert(DataType::Int64, cpy.arg);
+                    else if(expr.destinationType == DataType::UInt64
+                            || expr.destinationType == DataType::Int64)
+                        return convert(expr.destinationType, cpy.arg);
+                    else
+                    {
+                        AssertFatal(
+                            false,
+                            "Expected Destination type for a convert is either (u)int{32,64}");
+                        return nullptr;
+                    }
+                }
+
+                return std::make_shared<Expression>(cpy);
+            }
+
+            ExpressionPtr operator()(Negate const& expr) const
+            {
+                Negate cpy = expr;
+                if(expr.arg)
+                {
+                    cpy.arg = call(expr.arg);
+                }
+
+                return std::make_shared<Expression>(cpy);
+            }
+
+            template <CUnary Expr>
+            ExpressionPtr operator()(Expr const& expr) const
+            {
+                // Only Convert and perhaps negates are expected.
+                AssertFatal(false, "Unexpected Unary expression", ShowValue(expr));
+                return nullptr;
+            }
+
+            ExpressionPtr operator()(Divide const& expr) const
+            {
+                // TODO: Add check that divisor is a CValue within (u)int32-bit
+                Log::debug("Divisor: {} ", toString(expr.rhs));
+
+                Divide cpy = expr;
+                if(expr.lhs)
+                    cpy.lhs = call(expr.lhs);
+
+                return std::make_shared<Expression>(cpy);
+            }
+
+            ExpressionPtr operator()(Modulo const& expr) const
+            {
+                // TODO: Add check that divisor is a CValue within (u)int32-bit
+                Log::debug("Modulo: {} ", toString(expr.rhs));
+
+                Modulo cpy = expr;
+                if(expr.lhs)
+                    cpy.lhs = call(expr.lhs);
+
+                return std::make_shared<Expression>(cpy);
+            }
+
+            // TODO: for some shifts, logical, subtract operations these may not be correct.
+            //       Not sure yet if address calculation expressions with identity_transducer
+            //       those types of Expression or not.
+            template <CBinary Expr>
+            requires(CArithmetic<Expr>) ExpressionPtr operator()(Expr const& expr) const
+            {
+                if constexpr(std::same_as<Expr, Subtract>)
+                {
+                    AssertFatal(false, "Subtracts are not expected");
+                    return nullptr;
+                }
+
+                if constexpr(CLogical<Expr>)
+                {
+                    AssertFatal(false, "logicals are not expected");
+                    return nullptr;
+                }
+
+                Expr cpy = expr;
+                if(expr.lhs)
+                    cpy.lhs = call(expr.lhs);
+                if(expr.rhs)
+                    cpy.rhs = call(expr.rhs);
+
+                return std::make_shared<Expression>(cpy);
+            }
+
+            // Even with catch-all operator()(ExpressionPtr) without following,
+            // compilation fails.
+            template <typename Expr>
+            requires(CBinary<Expr>) ExpressionPtr operator()(Expr const& expr) const
+            {
+                AssertFatal(false, "Not expected expr : ", ShowValue(expr));
+                return nullptr;
+            }
+
+            template <CTernary Expr>
+            requires(CArithmetic<Expr>) ExpressionPtr operator()(Expr const& expr) const
+            {
+                Expr cpy = expr;
+                if(expr.lhs)
+                    cpy.lhs = call(expr.lhs);
+                if(expr.r1hs)
+                    cpy.r1hs = call(expr.r1hs);
+                if(expr.r2hs)
+                    cpy.r2hs = call(expr.r2hs);
+
+                return std::make_shared<Expression>(cpy);
+            }
+
+            template <typename Expr>
+            requires(CTernary<Expr>) ExpressionPtr operator()(Expr const& expr) const
+            {
+                AssertFatal(false, "Not expected expr : ", ShowValue(expr));
+                return nullptr;
+            }
+
+            // leaves
+            ExpressionPtr operator()(CommandArgumentPtr const& expr) const
+            {
+                Log::debug("CommandArgumentPtr {}", toString(expr));
+
+                auto varType = expr->variableType();
+
+                assertIfNotExpectedType(varType.dataType, toString(expr));
+
+                CommandArgumentPtr cpy = expr;
+                return widenTo64(varType.dataType, cpy);
+            }
+
+            ExpressionPtr operator()(CommandArgumentValue const& expr) const
+            {
+                Log::debug("CommandArgumentValue {} Type {} ",
+                           toString(expr),
+                           toString(variableType(expr)));
+
+                auto varType = variableType(expr);
+
+                assertIfNotExpectedType(varType.dataType, toString(expr));
+
+                CommandArgumentValue cpy = expr;
+                return widenTo64(varType.dataType, cpy);
+            }
+
+            ExpressionPtr operator()(Register::ValuePtr const& expr) const
+            {
+                Log::debug("Register::ValuePtr {}", toString(expr));
+
+                auto varType = expr->variableType();
+
+                assertIfNotExpectedType(varType.dataType, toString(expr));
+
+                Register::ValuePtr cpy = expr;
+                return widenTo64(varType.dataType, cpy);
+            }
+
+            ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const
+            {
+                Log::debug("AssemblyKernelArgumentPtr {} its expression is {}",
+                           toString(expr),
+                           toString(expr->expression));
+
+                auto varType = expr->variableType;
+
+                assertIfNotExpectedType(varType.dataType, toString(expr));
+
+                AssemblyKernelArgumentPtr cpy = expr;
+                return widenTo64(varType.dataType, cpy);
+            }
+
+            // catch the rest CValue
+            template <CValue Value>
+            ExpressionPtr operator()(Value const& expr) const
+            {
+                AssertFatal(
+                    false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr));
+                return nullptr;
+            }
+
+            template <typename T>
+            ExpressionPtr widenTo64(DataType srcType, T const& expr) const
+            {
+                if(srcType == DataType::UInt32)
+                    return convert(DataType::UInt64, std::make_shared<Expression>(expr));
+                else if(srcType == DataType::Int32)
+                    return convert(DataType::Int64, std::make_shared<Expression>(expr));
+
+                return std::make_shared<Expression>(expr);
+            }
+
+            ExpressionPtr call(ExpressionPtr const& expr) const
+            {
+                return std::visit(*this, *expr);
+            }
+
+            void assertIfNotExpectedType(DataType dt, std::string const& showValue) const
+            {
+                AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32
+                                || dt == DataType::UInt64 || dt == DataType::Int64,
+                            "Unexpected DataType for Command/Kernel arguments or "
+                            "workgroup/item indices ",
+                            showValue);
+            }
+        };
+
+        ExpressionPtr widenTo64bit(ExpressionPtr expr)
+        {
+            auto origVarType = resultVariableType(expr);
+
+            auto visitor = WidenTo64BitVisitor();
+            auto widened = visitor.call(expr);
+
+            auto finalVarType = resultVariableType(widened);
+
+            AssertFatal(origVarType.dataType == finalVarType.dataType,
+                        "Original and final data types should be the same",
+                        ShowValue(origVarType.dataType),
+                        ShowValue(finalVarType.dataType));
+
+            return widened;
+        }
+    }
+}

From 6f391efdbb4e53216fee43dee019ad9458e40c45 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Tue, 4 Mar 2025 18:11:36 +0000
Subject: [PATCH 02/32] Buildable minimal

---
 lib/CMakeLists.txt                                    |  1 +
 lib/include/rocRoller/ExpressionTransformations.hpp   | 10 ++++++++++
 lib/source/ExpressionTransformations/WidenTo64bit.cpp |  7 +++++++
 3 files changed, 18 insertions(+)

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 805b7c86..77e2dbe9 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -75,6 +75,7 @@ set(rocroller_src
     source/ExpressionTransformations/LowerPRNG.cpp
     source/ExpressionTransformations/RestoreCommandArguments.cpp
     source/ExpressionTransformations/Simplify.cpp
+    source/ExpressionTransformations/WidenTo64bit.cpp
 
     source/CodeGen/AddInstruction.cpp
     source/CodeGen/ArgumentLoader.cpp
diff --git a/lib/include/rocRoller/ExpressionTransformations.hpp b/lib/include/rocRoller/ExpressionTransformations.hpp
index e4a730ec..55a3a07f 100644
--- a/lib/include/rocRoller/ExpressionTransformations.hpp
+++ b/lib/include/rocRoller/ExpressionTransformations.hpp
@@ -135,5 +135,15 @@ namespace rocRoller
          * @return ExpressionPtr Transformed expression
          */
         ExpressionPtr lowerBitfieldValues(ExpressionPtr expr);
+
+        /**
+         * @brief Widen (u)int32 to (u)int64.
+         *
+         * Has many assumptions in input expr. See the implementation for details.
+         *
+         * @param expr Input expression
+         * @return ExpressionPtr Transformed expression
+         */
+        ExpressionPtr widenTo64bit(ExpressionPtr expr);
     }
 }
diff --git a/lib/source/ExpressionTransformations/WidenTo64bit.cpp b/lib/source/ExpressionTransformations/WidenTo64bit.cpp
index c9408769..f43b07d4 100644
--- a/lib/source/ExpressionTransformations/WidenTo64bit.cpp
+++ b/lib/source/ExpressionTransformations/WidenTo64bit.cpp
@@ -209,6 +209,13 @@ namespace rocRoller
                 return nullptr;
             }
 
+            ExpressionPtr operator()(Expression const& expr) const
+            {
+                AssertFatal(
+                    false, "No expectation to meet this type of Expression: ", ShowValue(expr));
+                return nullptr;
+            }
+
             template <typename T>
             ExpressionPtr widenTo64(DataType srcType, T const& expr) const
             {

From 573b3b04045d772ec842a4b1bcbd9009ba365036 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Tue, 4 Mar 2025 18:49:00 +0000
Subject: [PATCH 03/32] Adding changes on CommandSolution - buildable

---
 lib/include/rocRoller/CommandSolution.hpp | 11 +++++++
 lib/source/CommandSolution.cpp            | 40 +++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/lib/include/rocRoller/CommandSolution.hpp b/lib/include/rocRoller/CommandSolution.hpp
index 9321bc74..d5ecd1dc 100644
--- a/lib/include/rocRoller/CommandSolution.hpp
+++ b/lib/include/rocRoller/CommandSolution.hpp
@@ -167,6 +167,17 @@ namespace rocRoller
          */
         void generateKernel();
 
+        /**
+         * @brief Generates the kernel Graph by graph lowering and
+         * doesn't do code-generation.
+         */
+        void generateKernelGraphOnlyAfterTransforms();
+
+        /**
+         * @brief Lower command arguments to kernel arguments. 
+         */
+        void lowerToKernelArguments();
+
         /**
          * @brief Assembles a generated kernel.  Does not try to load
          * it.
diff --git a/lib/source/CommandSolution.cpp b/lib/source/CommandSolution.cpp
index de5aae79..679002f9 100644
--- a/lib/source/CommandSolution.cpp
+++ b/lib/source/CommandSolution.cpp
@@ -104,6 +104,8 @@ namespace rocRoller
 
         rv.reserve(m_context->kernel()->argumentSize(), argStructs.size());
 
+        Log::debug("== getKernelArguments ==");
+
         for(auto& arg : argStructs)
         {
             auto value = Expression::evaluate(arg.expression, args);
@@ -120,6 +122,8 @@ namespace rocRoller
                                                      arg.name));
             }
 
+            Log::debug(" arg.name {} value {}", arg.name, toString(value));
+
             rv.append(arg.name, value);
         }
 
@@ -136,11 +140,21 @@ namespace rocRoller
 
         auto const& workitems = m_context->kernel()->workitemCount();
         if(workitems[0])
+        {
             rv.workitemCount[0] = getUnsignedInt(evaluate(workitems[0], args));
+            Log::debug("== getKernelInvocation ==");
+            Log::debug(" workitemCount[0] {}", rv.workitemCount[0]);
+        }
         if(workitems[1])
+        {
             rv.workitemCount[1] = getUnsignedInt(evaluate(workitems[1], args));
+            Log::debug(" workitemCount[1] {}", rv.workitemCount[1]);
+        }
         if(workitems[2])
+        {
             rv.workitemCount[2] = getUnsignedInt(evaluate(workitems[2], args));
+            Log::debug(" workitemCount[2] {}", rv.workitemCount[2]);
+        }
 
         auto const& sharedMem = m_context->kernel()->dynamicSharedMemBytes();
         if(sharedMem)
@@ -186,6 +200,15 @@ namespace rocRoller
         co_yield Instruction::Comment(m_command->argInfo());
     }
 
+    void CommandKernel::lowerToKernelArguments()
+    {
+        for(auto arg : m_command->getArguments())
+        {
+            Log::debug("command argument: {}, {}", arg->toString(), toString(arg->expression()));
+        }
+        m_context->kernel()->addNewCommandArguments(m_command->getArguments());
+    }
+
     void CommandKernel::generateKernelGraph(std::string name)
     {
         TIMER(t, "CommandKernel::generateKernelGraph");
@@ -362,6 +385,23 @@ namespace rocRoller
             generateKernelSource();
         }
         else
+        {
+            Log::debug("generateKernel() is doing nothing");
+            // Probably from a unit test.  The context should contain
+            // scheduled instructions already.
+        }
+    }
+
+    void CommandKernel::generateKernelGraphOnlyAfterTransforms()
+    {
+        TIMER(t, "CommandKernel::generateKernelGraphOnlyAfterTransforms()");
+
+        if(m_command)
+        {
+            // Only lower the KernelGraph and don't generate codes.
+            generateKernelGraph(m_name);
+        }
+        else
         {
             // Probably from a unit test.  The context should contain
             // scheduled instructions already.

From f18cfbff91389ee3cdc95cf8beb7153c011ff5bf Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Tue, 4 Mar 2025 18:56:19 +0000
Subject: [PATCH 04/32] Adding default <=> operator to GEMMProblem - buildable

---
 test/common/common/GEMMProblem.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/common/common/GEMMProblem.hpp b/test/common/common/GEMMProblem.hpp
index 5d6fe552..764b9056 100644
--- a/test/common/common/GEMMProblem.hpp
+++ b/test/common/common/GEMMProblem.hpp
@@ -66,4 +66,6 @@ struct GEMMProblem
 
     rocRoller::Operations::ScaleMode scaleAMode = rocRoller::Operations::ScaleMode::None;
     rocRoller::Operations::ScaleMode scaleBMode = rocRoller::Operations::ScaleMode::None;
+
+    auto operator<=>(GEMMProblem const& rhs) const = default;
 };

From b7baa4a6c7ed38bd4df1923d6c4676eed1b8d546 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Tue, 4 Mar 2025 22:02:06 +0000
Subject: [PATCH 05/32] Before adding AddressCalculationTest.cpp - buildable

---
 test/catch/CommandTest.cpp                    |   8 +-
 test/catch/IdentifyParallelDimensionsTest.cpp |   4 +-
 test/common/CommonGraphs.cpp                  | 239 ++++++++++++++++++
 test/common/common/CommonGraphs.hpp           |  24 +-
 test/common/common/CommonGraphs_impl.hpp      |   2 +
 test/unit/KernelGraphTest.cpp                 |   6 +-
 .../unit/KernelGraphTest/UpdateParameters.cpp |   2 +-
 7 files changed, 272 insertions(+), 13 deletions(-)

diff --git a/test/catch/CommandTest.cpp b/test/catch/CommandTest.cpp
index b2e721a0..b0bb4ae4 100644
--- a/test/catch/CommandTest.cpp
+++ b/test/catch/CommandTest.cpp
@@ -13,9 +13,9 @@ namespace CommandTest
     {
         SECTION("GEMM/TileAdd")
         {
-            auto example1 = *rocRollerTest::Graphs::GEMM<float>().getCommand();
-            auto example2 = *rocRollerTest::Graphs::GEMM<float>().getCommand();
-            auto example3 = *rocRollerTest::Graphs::GEMM<Half>().getCommand();
+            auto example1 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand();
+            auto example2 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand();
+            auto example3 = *rocRollerTest::Graphs::GEMM(DataType::Half).getCommand();
             auto example4 = *rocRollerTest::Graphs::TileDoubleAdd<Half>().getCommand();
 
             CHECK(example1 == example2);
@@ -33,7 +33,7 @@ namespace CommandTest
     {
         SECTION("GEMM")
         {
-            auto example = rocRollerTest::Graphs::GEMM<float>();
+            auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
             auto command0 = example.getCommand();
             auto yaml     = Command::toYAML(*command0);
diff --git a/test/catch/IdentifyParallelDimensionsTest.cpp b/test/catch/IdentifyParallelDimensionsTest.cpp
index 5c673572..25c1d378 100644
--- a/test/catch/IdentifyParallelDimensionsTest.cpp
+++ b/test/catch/IdentifyParallelDimensionsTest.cpp
@@ -48,7 +48,7 @@ TEST_CASE("identifyParallelDimensionSets works for GEMM", "[kernel-graph]")
     using namespace rocRoller;
     auto ctx = TestContext::ForDefaultTarget();
 
-    auto example = rocRollerTest::Graphs::GEMM<float>();
+    auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
     auto kgraph = KernelGraph::translate(example.getCommand());
 
@@ -149,7 +149,7 @@ SCENARIO("IdentifyParallelDimensions transformation works for GEMM", "[kernel-gr
     using namespace rocRoller;
     auto ctx = TestContext::ForDefaultTarget();
 
-    auto example = rocRollerTest::Graphs::GEMM<float>();
+    auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
     GIVEN("The initial kernel graph for a GEMM")
     {
diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp
index 338d1a10..294b37f4 100644
--- a/test/common/CommonGraphs.cpp
+++ b/test/common/CommonGraphs.cpp
@@ -169,4 +169,243 @@ namespace rocRollerTest::Graphs
         return params;
     }
 
+    GEMM::GEMM(DataType ta)
+        : GEMM(ta, ta)
+    {
+    }
+    GEMM::GEMM(DataType ta, DataType tb)
+        : GEMM(ta, tb, tb)
+    {
+    }
+    GEMM::GEMM(DataType ta, DataType tb, DataType tc)
+        : GEMM(ta, tb, tc, tc)
+    {
+    }
+    GEMM::GEMM(DataType ta, DataType tb, DataType tc, DataType td)
+        : m_ta(ta)
+        , m_tb(tb)
+        , m_tc(tc)
+        , m_td(td)
+    {
+    }
+
+    void GEMM::createCommand()
+    {
+        m_command = std::make_shared<rocRoller::Command>();
+
+        std::vector<size_t> oneStridesN
+            = m_problem.literalStrides ? std::vector<size_t>({(size_t)1}) : std::vector<size_t>({});
+
+        std::vector<size_t> oneStridesT = m_problem.literalStrides
+                                              ? std::vector<size_t>({(size_t)0, (size_t)1})
+                                              : std::vector<size_t>({});
+
+        m_tagTensorA = m_command->addOperation(rocRoller::Operations::Tensor(
+            2, m_ta, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A
+
+        m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorA));
+
+        m_tagTensorB = m_command->addOperation(rocRoller::Operations::Tensor(
+            2, m_tb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B
+        m_tagB       = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorB));
+
+        m_tagTensorC
+            = m_command->addOperation(rocRoller::Operations::Tensor(2, m_tc, oneStridesN)); // C
+        m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorC));
+
+        m_tagScalarAlpha
+            = m_command->addOperation(rocRoller::Operations::Scalar(DataType::Float)); // alpha
+        auto tagLoadAlpha
+            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(m_tagScalarAlpha));
+
+        m_tagScalarBeta  = m_command->addOperation(rocRoller::Operations::Scalar(m_tc)); // beta
+        auto tagLoadBeta = m_command->addOperation(
+            rocRoller::Operations::T_Load_Scalar(m_tagScalarBeta)); // beta
+
+        auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B
+
+        rocRoller::Operations::T_Execute execute(m_command->getNextTag());
+
+        auto tagBetaC
+            = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadBeta, m_tagC)); // beta * C
+
+        auto tagAlphaAB
+            = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadAlpha, tagAB)); // alpha * (A * B)
+
+        if(m_problem.betaInFma)
+        {
+            m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagBetaC, tagAlphaAB));
+            // alpha * (A * B) + beta * C
+        }
+        else
+        {
+            m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagAlphaAB, tagBetaC));
+            // alpha * (A * B) + beta * C
+        }
+        m_command->addOperation(std::move(execute));
+
+        m_tagTensorD
+            = m_command->addOperation(rocRoller::Operations::Tensor(2, m_td, oneStridesN)); // D
+        m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, m_tagTensorD)); // D
+
+        if(m_problem.streamK)
+        {
+            m_tagNumWGs    = m_command->allocateTag();
+            auto numWGsArg = m_command->allocateArgument(DataType::UInt32,
+                                                         m_tagNumWGs,
+                                                         ArgumentType::Value,
+                                                         DataDirection::ReadOnly,
+                                                         rocRoller::NUMWGS);
+        }
+
+        m_tagScratch = m_command->allocateTag();
+        m_command->allocateArgument(VariableType(DataType::UInt32, PointerType::PointerGlobal),
+                                    m_tagScratch,
+                                    ArgumentType::Value,
+                                    DataDirection::ReadWrite,
+                                    rocRoller::SCRATCH);
+    }
+
+    CommandPtr GEMM::getCommand()
+    {
+        if(!m_command)
+            createCommand();
+
+        return m_command;
+    }
+
+    KernelGraph GEMM::getKernelGraph()
+    {
+        return rocRoller::KernelGraph::translate(getCommand());
+    }
+
+    void GEMM::setTileSize(int m, int n, int k)
+    {
+        m_problem.macM = m;
+        m_problem.macN = n;
+        m_problem.macK = k;
+    }
+
+    void GEMM::setMFMA(int m, int n, int k, int b)
+    {
+        m_problem.waveM = m;
+        m_problem.waveN = n;
+        m_problem.waveK = k;
+        m_problem.waveB = b;
+    }
+
+    void GEMM::setUseLDS(bool a, bool b, bool d)
+    {
+        m_problem.loadLDSA  = a;
+        m_problem.loadLDSB  = b;
+        m_problem.storeLDSD = d;
+    }
+
+    void GEMM::setPrefetch(bool prefetch,
+                           int  prefetchInFlight,
+                           int  prefetchLDSFactor,
+                           bool prefetchMixMemOps)
+    {
+        m_problem.prefetch          = prefetch;
+        m_problem.prefetchInFlight  = prefetchInFlight;
+        m_problem.prefetchLDSFactor = prefetchLDSFactor;
+        m_problem.prefetchMixMemOps = prefetchMixMemOps;
+
+        m_problem.unrollK = prefetchInFlight;
+    }
+
+    void GEMM::setProblem(GEMMProblem const& problem)
+    {
+        m_problem = problem;
+    }
+
+    GEMMProblem const& GEMM::getProblem() const
+    {
+        return m_problem;
+    }
+
+    CommandParametersPtr GEMM::getCommandParameters() const
+    {
+        using namespace rocRoller::KernelGraph::CoordinateGraph;
+
+        auto params = std::make_shared<CommandParameters>();
+
+        params->setManualKernelDimension(2);
+
+        AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0,
+                    "Workgroup Size X must be multiply of wave front size");
+
+        uint wavetilePerWavefrontM
+            = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX;
+        uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY;
+
+        AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0,
+                    "WaveTile size mismatch (M)");
+        AssertFatal(m_problem.macN % (m_problem.waveN * wavetilePerWavefrontN) == 0,
+                    "WaveTile size mismatch (N)");
+
+        uint workgroupSizeX = m_problem.workgroupSizeX * m_problem.workgroupSizeY;
+        uint workgroupSizeY = 1;
+        params->setManualWorkgroupSize({workgroupSizeX, workgroupSizeY, 1});
+
+        auto macTileA
+            = MacroTile({m_problem.macM, m_problem.macK},
+                        LayoutType::MATRIX_A,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB},
+                        m_problem.loadLDSA ? MemoryType::LDS : MemoryType::WAVE);
+        auto macTileB
+            = MacroTile({m_problem.macK, m_problem.macN},
+                        LayoutType::MATRIX_B,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB},
+                        m_problem.loadLDSB ? MemoryType::LDS : MemoryType::WAVE);
+        auto macTileC
+            = MacroTile({m_problem.macM, m_problem.macN},
+                        LayoutType::MATRIX_ACCUMULATOR,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB});
+        auto macTileD
+            = MacroTile({m_problem.macM, m_problem.macN},
+                        LayoutType::MATRIX_ACCUMULATOR,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB},
+                        m_problem.storeLDSD ? MemoryType::LDS : MemoryType::WAVE);
+
+        params->setDimensionInfo(m_tagA, macTileA);
+        params->setDimensionInfo(m_tagB, macTileB);
+        params->setDimensionInfo(m_tagC, macTileC);
+        params->setDimensionInfo(m_tagD, macTileD);
+
+        // uint jammedM
+        //     = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX;
+        // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY;
+
+        Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1);
+        // Log::debug("GEMM jamming {} {}", jammedM, jammedN);
+        // params->setWaveTilesPerWavefront(jammedM, jammedN);
+
+        params->setManualWavefrontCount(
+            {static_cast<uint>(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM),
+             static_cast<uint>(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)});
+
+        params->fuseLoops                     = m_problem.fuseLoops;
+        params->tailLoops                     = m_problem.tailLoops;
+        params->allowAmbiguousMemoryNodes     = m_problem.allowAmbiguousMemoryNodes;
+        params->unrollK                       = m_problem.unrollK;
+        params->packMultipleElementsInto1VGPR = m_problem.packMultipleElementsInto1VGPR;
+        params->prefetch                      = m_problem.prefetch;
+        params->prefetchInFlight              = m_problem.prefetchInFlight;
+        params->prefetchLDSFactor             = m_problem.prefetchLDSFactor;
+        params->prefetchMixMemOps             = m_problem.prefetchMixMemOps;
+        params->transposeMemoryAccess[LayoutType::MATRIX_A] = m_problem.transA == "T";
+        params->transposeMemoryAccess[LayoutType::MATRIX_B] = m_problem.transB == "T";
+        params->transposeMemoryAccess[LayoutType::None]     = true;
+
+        if(m_problem.streamK)
+        {
+            params->loopOverOutputTilesDimensions = {0, 1};
+            params->streamK                       = true;
+            params->streamKTwoTile                = m_problem.streamKTwoTile;
+        }
+
+        return params;
+    }
+
 }
diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp
index 5dd94935..7485c5cc 100644
--- a/test/common/common/CommonGraphs.hpp
+++ b/test/common/common/CommonGraphs.hpp
@@ -14,6 +14,8 @@
 #include <rocRoller/Operations/Command_fwd.hpp>
 #include <rocRoller/Operations/OperationTag.hpp>
 
+#include <common/GEMMProblem.hpp>
+
 namespace rocRollerTest
 {
     namespace Graphs
@@ -26,6 +28,7 @@ namespace rocRollerTest
         using ContextPtr                 = rocRoller::ContextPtr;
         using KernelArguments            = rocRoller::KernelArguments;
         using KernelGraph                = rocRoller::KernelGraph::KernelGraph;
+        using DataType                   = rocRoller::DataType;
 
         /**
          * @brief Graph for linear: alpha x + beta y.
@@ -152,11 +155,13 @@ namespace rocRollerTest
          * - Assign(D = alpha * AB + beta * C)
          * - StoreTiled(D)
          */
-        template <typename T>
         class GEMM
         {
         public:
-            GEMM();
+            GEMM(DataType ta);
+            GEMM(DataType ta, DataType tb);
+            GEMM(DataType ta, DataType tb, DataType tc);
+            GEMM(DataType ta, DataType tb, DataType tc, DataType td);
 
             CommandPtr  getCommand();
             KernelGraph getKernelGraph();
@@ -164,9 +169,21 @@ namespace rocRollerTest
             void setTileSize(int m, int n, int k);
             void setMFMA(int m, int n, int k, int b);
             void setUseLDS(bool a, bool b, bool d);
+            void setPrefetch(bool prefetch,
+                             int  prefetchInFlight,
+                             int  prefetchLDSFactor,
+                             bool prefetchMixMemOps);
+            void setProblem(GEMMProblem const& problem);
 
+            GEMMProblem const&   getProblem() const;
             CommandParametersPtr getCommandParameters() const;
 
+            rocRoller::Operations::OperationTag m_tagTensorA, m_tagTensorB, m_tagTensorC,
+                m_tagTensorD, m_tagScalarAlpha, m_tagScalarBeta, m_tagScalarSeed, m_tagScratch;
+            rocRoller::Operations::OperationTag m_tagNumWGs;
+
+            DataType m_ta, m_tb, m_tc, m_td;
+
         private:
             void createCommand();
 
@@ -176,7 +193,8 @@ namespace rocRollerTest
 
             rocRoller::Operations::OperationTag m_tagA, m_tagB, m_tagC, m_tagD;
 
-            CommandPtr m_command;
+            CommandPtr  m_command;
+            GEMMProblem m_problem;
         };
 
         /**
diff --git a/test/common/common/CommonGraphs_impl.hpp b/test/common/common/CommonGraphs_impl.hpp
index 22776c4f..a6232cc8 100644
--- a/test/common/common/CommonGraphs_impl.hpp
+++ b/test/common/common/CommonGraphs_impl.hpp
@@ -231,6 +231,7 @@ namespace rocRollerTest::Graphs
         return rocRoller::KernelGraph::translate(m_command);
     }
 
+#if 0
     /*
      * GEMM
      */
@@ -356,6 +357,7 @@ namespace rocRollerTest::Graphs
 
         return params;
     }
+#endif
 
     /*
      * TileDoubleAdd
diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp
index 30e60e3b..e3c28ee4 100644
--- a/test/unit/KernelGraphTest.cpp
+++ b/test/unit/KernelGraphTest.cpp
@@ -1060,7 +1060,7 @@ namespace KernelGraphTest
 
     TEST_F(KernelGraphTest, LowerTensor)
     {
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         int macK  = 16;
         int waveK = 8;
@@ -1152,7 +1152,7 @@ namespace KernelGraphTest
 
     TEST_F(KernelGraphTest, InlineIncrement)
     {
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         example.setTileSize(128, 256, 8);
         example.setMFMA(32, 32, 2, 1);
@@ -2704,7 +2704,7 @@ namespace KernelGraphTest
     {
         using GD = Graph::Direction;
 
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         example.setTileSize(128, 256, 8);
         example.setMFMA(32, 32, 2, 1);
diff --git a/test/unit/KernelGraphTest/UpdateParameters.cpp b/test/unit/KernelGraphTest/UpdateParameters.cpp
index 80e78b8d..cc7e6a98 100644
--- a/test/unit/KernelGraphTest/UpdateParameters.cpp
+++ b/test/unit/KernelGraphTest/UpdateParameters.cpp
@@ -62,7 +62,7 @@ namespace KernelGraphTest
     {
         using namespace rocRoller::KernelGraph;
 
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         int macK  = 16;
         int waveK = 8;

From 409496e7991060cbfad818cc301628c310511e7d Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Tue, 4 Mar 2025 22:27:37 +0000
Subject: [PATCH 06/32] Adding AddressCalculationTest.cpp - buildable/testable

---
 test/CMakeLists.txt                   |   1 +
 test/catch/AddressCalculationTest.cpp | 900 ++++++++++++++++++++++++++
 2 files changed, 901 insertions(+)
 create mode 100644 test/catch/AddressCalculationTest.cpp

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 0000cecd..2caf0905 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -168,6 +168,7 @@ set(
     catch/CustomAssertions.cpp
     catch/TestKernels.cpp
 
+    catch/AddressCalculationTest.cpp
     catch/BinaryExpressionTest.cpp
     catch/CommandArgumentValueTest.cpp
     catch/CommandTest.cpp
diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
new file mode 100644
index 00000000..c60e6494
--- /dev/null
+++ b/test/catch/AddressCalculationTest.cpp
@@ -0,0 +1,900 @@
+
+#include "CustomMatchers.hpp"
+#include "TestContext.hpp"
+#include <common/CommonGraphs.hpp>
+#include <common/GEMMProblem.hpp>
+#include <common/Utilities.hpp>
+#include <rocRoller/CodeGen/ArgumentLoader.hpp>
+#include <rocRoller/CodeGen/MemoryInstructions.hpp>
+#include <rocRoller/CodeGen/WaitCount.hpp>
+#include <rocRoller/ExpressionTransformations.hpp>
+#include <rocRoller/KernelGraph/CoordinateGraph/Transformer.hpp>
+#include <rocRoller/KernelGraph/KernelGraph.hpp>
+#include <rocRoller/KernelGraph/Utils.hpp>
+#include <rocRoller/TensorDescriptor.hpp>
+
+#include <catch2/catch_test_macros.hpp>
+
+using namespace rocRoller;
+
+namespace AddressCalculationTest
+{
+    namespace KernelG     = rocRoller::KernelGraph;
+    namespace ControlG    = rocRoller::KernelGraph::ControlGraph;
+    namespace CoordG      = rocRoller::KernelGraph::CoordinateGraph;
+    using KernelGraphType = typename rocRoller::KernelGraph::KernelGraph;
+
+    class AddressTrace
+    {
+    public:
+        AddressTrace(KernelGraphType const& graph, ContextPtr ctx)
+            : m_kGraph(graph)
+            , m_context(ctx){};
+        std::vector<Expression::ExpressionPtr> traceComputeIndexWithBuffer();
+
+    private:
+        KernelGraphType m_kGraph;
+        ContextPtr      m_context;
+    };
+
+    std::vector<Expression::ExpressionPtr> AddressTrace::traceComputeIndexWithBuffer()
+    {
+        auto isComputeIndex = [&](int tag) {
+            return isOperation<ControlG::ComputeIndex>(this->m_kGraph.control.getElement(tag));
+        };
+
+        std::vector<Expression::ExpressionPtr> rv;
+        auto                                   root  = m_kGraph.control.roots().only();
+        int                                    count = 0;
+        for(auto ciTag : filter(isComputeIndex, m_kGraph.control.depthFirstVisit(root.value())))
+        {
+
+            auto maybeCi = m_kGraph.control.get<ControlG::ComputeIndex>(ciTag);
+            AssertFatal(maybeCi.has_value());
+            auto ci = maybeCi.value();
+
+            auto buffer
+                = m_kGraph.mapper.get(ciTag,
+                                      KernelG::Connections::ComputeIndex{
+                                          KernelG::Connections::ComputeIndexArgument::BUFFER});
+            if(buffer == -1)
+                continue;
+
+            auto base = m_kGraph.mapper.get(ciTag,
+                                            KernelG::Connections::ComputeIndex{
+                                                KernelG::Connections::ComputeIndexArgument::BASE});
+
+            // Currently, only base < 0 case is being covered.
+            if(base >= 0)
+                continue;
+
+            {
+                // Debugging log
+                if(m_context->kernel())
+                    Log::debug("kernel is non-null \n");
+
+                auto const& kernelWorkgroupIndices = m_context->kernel()->workgroupIndex();
+                Log::debug("Size of kernelWorkGroupIndices: {}", kernelWorkgroupIndices.size());
+            }
+
+            auto offset
+                = m_kGraph.mapper.get(ciTag,
+                                      KernelG::Connections::ComputeIndex{
+                                          KernelG::Connections::ComputeIndexArgument::OFFSET});
+            auto stride
+                = m_kGraph.mapper.get(ciTag,
+                                      KernelG::Connections::ComputeIndex{
+                                          KernelG::Connections::ComputeIndexArgument::STRIDE});
+            auto target
+                = m_kGraph.mapper.get(ciTag,
+                                      KernelG::Connections::ComputeIndex{
+                                          KernelG::Connections::ComputeIndexArgument::TARGET});
+            auto increment
+                = m_kGraph.mapper.get(ciTag,
+                                      KernelG::Connections::ComputeIndex{
+                                          KernelG::Connections::ComputeIndexArgument::INCREMENT});
+
+            // Note that identity_transduer is intentionally being used here in place of FastArithmetic.
+            // It is alright to use FastArithmetic here but
+            // fastArithmetic is being used eventually when the expressions are generated.
+            auto identity_transducer = [&](auto expr) { return expr; };
+            auto coords              = CoordG::Transformer(
+                std::make_shared<rocRoller::KernelGraph::CoordinateGraph::CoordinateGraph>(
+                    m_kGraph.coordinates),
+                m_context,
+                identity_transducer);
+
+            auto fullStop  = [&](int tag) { return tag == increment; };
+            auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream;
+            auto [required, path] = findRequiredCoordinates(target, direction, fullStop, m_kGraph);
+
+            for(auto tag : required)
+                if((tag != increment) && (!coords.hasCoordinate(tag)))
+                    coords.setCoordinate(tag, Expression::literal(0u));
+
+            // Set the increment coordinate to zero if it doesn't
+            // already have a value
+            bool initializeIncrement = !coords.hasPath({target}, ci.forward);
+            if(initializeIncrement)
+            {
+                coords.setCoordinate(increment, Expression::literal(0u));
+            }
+
+            // Compute an offset address if we don't have an
+            // associated base address to inherit from
+            {
+                // base < 0 by the time control reacheds here.
+                auto indexExpr
+                    = ci.forward ? coords.forward({target})[0] : coords.reverse({target})[0];
+
+                rv.push_back(indexExpr);
+
+                // Rests are for logging for debugging.
+                Log::debug("ci.forward for tag {} dir {}, stride {}, buffer {}",
+                           ciTag,
+                           ci.forward,
+                           stride,
+                           buffer);
+                Log::debug("IndexExpr base < 0 for tag {} in the original graph {}",
+                           ciTag,
+                           toString(indexExpr));
+
+                // Buffer Descriptor's base
+                auto user = m_kGraph.coordinates.get<CoordG::User>(target);
+                if(user)
+                {
+                    Log::debug("User for tag {} {}, offset {}",
+                               ciTag,
+                               ShowValue(target),
+                               (user->offset ? "yes" : "no"));
+                    Log::debug("argument name {}, real name {}",
+                               user->argumentName,
+                               m_context->kernel()->findArgument(user->argumentName).name);
+
+                    // 1. user->argumentName + user->offset if any --> set to base of BufferDesc
+                    //   (Expression)
+                    // 2. user->size --> set to size field (32-bit) of BuferDesc
+                }
+            }
+            // book-keeping for debugging purpose
+            count++;
+        }
+
+        Log::debug("Count of computeIndex investigated: {}", count);
+
+        return rv;
+    }
+
+    struct AddressCalculationTest
+    {
+        /**
+         * gemmGraph should be a graph initialized by prob
+         */
+        AddressCalculationTest(rocRoller::ContextPtr       context,
+                               GEMMProblem const&          prob,
+                               rocRollerTest::Graphs::GEMM gemmGraph)
+            : m_context(context)
+            , m_problem(prob)
+            , m_gemmGraph(gemmGraph)
+        {
+        }
+
+        bool check_uint32_overflow(uint a, uint b)
+        {
+            uint64_t prod = static_cast<uint64_t>(a) * b;
+            return prod > static_cast<uint64_t>(std::numeric_limits<uint32_t>::max());
+        }
+
+        std::pair<std::array<Expression::ExpressionPtr, 3>, uint>
+            getWorkItemCount(CommandParametersPtr params, GEMMProblem const& problem)
+        {
+            int M = problem.m;
+            int N = problem.n;
+            int K = problem.k;
+
+            AssertFatal(M > 0 && N > 0 && K > 0);
+
+            auto workGroupSizes = params->getManualWorkgroupSize();
+
+            AssertFatal(workGroupSizes.has_value());
+            auto workgroupSizeX = workGroupSizes.value()[0];
+            auto workgroupSizeY = workGroupSizes.value()[1];
+
+            // compute NumWorkGroups
+            uint numWorkgroupX;
+            uint numWorkgroupY;
+
+            if(problem.loopOverTiles > 0)
+            {
+                // multiple output macro tiles per workgroup
+                numWorkgroupX = M * N / problem.macM / problem.macN / 2;
+                numWorkgroupY = 1;
+            }
+            else if(problem.streamK)
+            {
+                numWorkgroupX = problem.numWGs;
+                numWorkgroupY = 1;
+            }
+            else
+            {
+                // one output macro tile per workgroup
+                numWorkgroupX = M / problem.macM;
+                numWorkgroupY = N / problem.macN;
+            }
+
+            AssertFatal(!check_uint32_overflow(numWorkgroupX, workgroupSizeX));
+            AssertFatal(!check_uint32_overflow(numWorkgroupY, workgroupSizeY));
+
+            auto NX_literal = numWorkgroupX * workgroupSizeX;
+            auto NY_literal = numWorkgroupY * workgroupSizeY;
+
+            auto NX = std::make_shared<Expression::Expression>(NX_literal);
+            auto NY = std::make_shared<Expression::Expression>(NY_literal);
+            auto NZ = std::make_shared<Expression::Expression>(1u);
+
+            auto totalWorkitemCounts = NX_literal * NY_literal;
+            {
+                Log::debug("Calculated workitemcount[0]: {}", toString(NX));
+                Log::debug("Calculated workitemcount[1]: {}", toString(NY));
+                Log::debug("Calculated workitemcount[2]: {}", toString(NZ));
+                Log::debug("totalWorkitemCounts: {}", totalWorkitemCounts);
+            }
+
+            return {{NX, NY, NZ}, totalWorkitemCounts};
+        }
+
+        static Expression::ExpressionPtr
+            get64BitVectorOffset(ContextPtr                                      context,
+                                 std::array<Expression::ExpressionPtr, 3> const& workitemCount,
+                                 std::array<unsigned int, 3> const&              workgroupSize)
+        {
+            std::array<Expression::ExpressionPtr, 3> thread_index;
+            for(int i = 0; i < 3; i++)
+                thread_index[i] = std::make_shared<Expression::Expression>(
+                    context->kernel()->workitemIndex()[i]);
+
+            std::array<Expression::ExpressionPtr, 3> workgroup_index;
+            for(int i = 0; i < 3; i++)
+                workgroup_index[i] = std::make_shared<Expression::Expression>(
+                    context->kernel()->workgroupIndex()[i]);
+
+            std::array<Expression::ExpressionPtr, 3> workgroup_size;
+            for(int i = 0; i < 3; i++)
+                workgroup_size[i] = std::make_shared<Expression::Expression>(
+                    Register::Value::Literal<unsigned int>(workgroupSize[i]));
+
+            auto idx_x = thread_index[0] + workgroup_index[0] * workgroup_size[0];
+            auto idx_y = thread_index[1] + workgroup_index[1] * workgroup_size[1];
+
+            auto compare_res_pointer = idx_x + idx_y * workitemCount[0];
+            auto elementSize         = std::make_shared<Expression::Expression>(
+                Register::Value::Literal<int32_t>(sizeof(uint64_t)));
+            compare_res_pointer = compare_res_pointer * elementSize;
+
+            Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+            return compare_res_pointer;
+        }
+
+        // This is for printing out workgroup index and thread index.
+        auto kb_sanity_indices(ContextPtr                                      context,
+                               std::array<Expression::ExpressionPtr, 3> const& workitemCount,
+                               std::array<unsigned int, 3> const&              workgroupSize)
+        {
+            return [context, workitemCount, workgroupSize]() -> Generator<Instruction> {
+                // store base addr
+                Register::ValuePtr s_ptr;
+                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
+
+                Register::ValuePtr s_ptr2;
+                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
+
+                auto compare_res_pointer
+                    = get64BitVectorOffset(context, workitemCount, workgroupSize);
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                Register::ValuePtr v_offset_1 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_1, compare_res_pointer + s_ptr->expression(), context);
+
+                Register::ValuePtr v_offset_2 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_2, compare_res_pointer + s_ptr2->expression(), context);
+
+                // workgroupIndex x
+                auto v_wg_x = Register::Value::Placeholder(
+                    context, Register::Type::Vector, DataType::UInt32, 1);
+                co_yield context->copier()->copy(
+                    v_wg_x, context->kernel()->workgroupIndex()[0], "copy wgi.x to v");
+                co_yield context->mem()->storeGlobal(v_offset_1, v_wg_x, 0, 4);
+
+                co_yield context->mem()->storeGlobal(
+                    v_offset_2, (context->kernel()->workitemIndex())[0], 0, 4);
+            };
+        }
+
+        // Just print out passed expressions to device memory.
+        // If passed expressions are workitemCounts, the expectation is that
+        // the generated expressions' values are the same with the host-side computed values.
+        auto kb_implicit_workitemcount(ContextPtr                         context,
+                                       Expression::ExpressionPtr const&   workitemcount_X,
+                                       Expression::ExpressionPtr const&   workitemcount_Y,
+                                       std::array<unsigned int, 3> const& workgroupSize)
+        {
+            return [context,
+                    workitemcount_X,
+                    workitemcount_Y,
+                    workgroupSize]() -> Generator<Instruction> {
+                // store base addrs
+                Register::ValuePtr s_ptr;
+                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
+                Register::ValuePtr s_ptr2;
+                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
+                auto compare_res_pointer = get64BitVectorOffset(
+                    context, {workitemcount_X, workitemcount_Y}, workgroupSize);
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                Register::ValuePtr v_offset_1 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_1, compare_res_pointer + s_ptr->expression(), context);
+
+                Register::ValuePtr v_offset_2 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_2, compare_res_pointer + s_ptr2->expression(), context);
+
+                // Compute the value 1
+                Register::ValuePtr s_value_1 = nullptr;
+                co_yield Expression::generate(s_value_1, workitemcount_X, context);
+                auto v_value_11 = Register::Value::Placeholder(
+                    context, Register::Type::Vector, DataType::Int64, 1);
+                co_yield context->copier()->copy(v_value_11, s_value_1, "copy to v1");
+
+                // Compute the value 2
+                Register::ValuePtr s_value_2 = nullptr;
+                co_yield Expression::generate(s_value_2, workitemcount_Y, context);
+                auto v_value_22 = Register::Value::Placeholder(
+                    context, Register::Type::Vector, DataType::Int64, 1);
+                co_yield context->copier()->copy(v_value_22, s_value_2, "copy to v2");
+
+                co_yield context->mem()->storeGlobal(v_offset_1, v_value_11, 0, 8);
+                co_yield context->mem()->storeGlobal(v_offset_2, v_value_22, 0, 8);
+            };
+        }
+
+        auto kb_equal(ContextPtr                                      context,
+                      const std::vector<Expression::ExpressionPtr>&   input,
+                      const std::vector<Expression::ExpressionPtr>&   widenedInput,
+                      std::array<Expression::ExpressionPtr, 3> const& workitemCount,
+                      std::array<unsigned int, 3> const&              workgroupSize)
+        {
+            auto allone_uint64
+                = std::make_shared<Expression::Expression>(static_cast<uint64_t>(0xFFFFFFFF));
+
+            return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]()
+                       -> Generator<Instruction> {
+                // store base addr
+                Register::ValuePtr s_ptr;
+
+                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
+
+                Register::ValuePtr s_ptr2;
+
+                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
+
+                // 2-D
+                auto compare_res_pointer
+                    = get64BitVectorOffset(context, workitemCount, workgroupSize);
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                Register::ValuePtr v_offset = nullptr;
+                co_yield Expression::generate(
+                    v_offset, compare_res_pointer + s_ptr->expression(), context);
+
+                // may not be needed.
+                //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug")));
+
+                // boolean_diff was allocated to s[0:1]
+                // diff should be computed per lane,
+                // but is_zero_diff is actually one-bit
+                auto               boolean_true = Register::Value::WavefrontPlaceholder(context);
+                Register::ValuePtr v_allone;
+                co_yield Expression::generate(v_allone, allone_uint64, context);
+
+                co_yield context->copier()->copy(
+                    boolean_true, v_allone, "set to true for all lanes");
+
+                Register::ValuePtr temp_res;
+                for(int i = 0, size = input.size(); i < size; i++)
+                {
+                    // Compute the value 1
+                    Register::ValuePtr v_value_1 = nullptr;
+                    co_yield Expression::generate(v_value_1, input[i], context);
+
+                    // Compute the value 2
+                    Register::ValuePtr v_value_2 = nullptr;
+                    co_yield Expression::generate(v_value_2, widenedInput[i], context);
+
+                    // Compute diff (value_1 - value_2)
+                    auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
+                    {
+                        auto boolType = resultVariableType(is_zero_diff).dataType;
+                        AssertFatal(
+                            boolType == DataType::Bool64, "is_zero type {}", toString(boolType));
+                    }
+
+                    // boolean_true = boolean_true | is_zero_diff
+                    auto accumRes = std::make_shared<Expression::Expression>(
+                        Expression::BitwiseAnd{boolean_true->expression(), is_zero_diff, "accum"});
+                    {
+                        auto accumType = resultVariableType(accumRes).dataType;
+                        AssertFatal(
+                            accumType == DataType::Bool64, "accum type {}", toString(accumType));
+                    }
+                    co_yield Expression::generate(boolean_true, accumRes, context);
+                }
+
+                auto v_value = Register::Value::Placeholder(
+                    context, Register::Type::Vector, DataType::UInt64, 1);
+                co_yield context->copier()->copy(v_value, boolean_true, "Move value");
+                co_yield context->mem()->storeGlobal(v_offset, v_value, 0, 8);
+            };
+        }
+
+        // For now, mainly for debugging, directly copy the two results values.
+        // workitemCount is passed as argument. Since it is 64-bit
+        // different global_store_dwordx2 format should be used.
+        auto kb_equal_one(ContextPtr                                      context,
+                          Expression::ExpressionPtr const&                input,
+                          Expression::ExpressionPtr const&                widenedInput,
+                          std::array<Expression::ExpressionPtr, 3> const& workitemCount,
+                          std::array<unsigned int, 3> const&              workgroupSize)
+        {
+            return [context, input, widenedInput, workitemCount, workgroupSize]()
+                       -> Generator<Instruction> {
+                // store base addr
+                Register::ValuePtr s_ptr;
+                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
+
+                Register::ValuePtr s_ptr2;
+                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
+
+                auto compare_res_pointer
+                    = get64BitVectorOffset(context, workitemCount, workgroupSize);
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                Register::ValuePtr v_offset_1 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_1, compare_res_pointer + s_ptr->expression(), context);
+
+                Register::ValuePtr v_offset_2 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_2, compare_res_pointer + s_ptr2->expression(), context);
+
+                // Compute the value 1
+                Register::ValuePtr v_value_1 = nullptr;
+                co_yield Expression::generate(v_value_1, input, context);
+
+                // Compute the value 2
+                Register::ValuePtr v_value_2 = nullptr;
+                co_yield Expression::generate(v_value_2, widenedInput, context);
+
+                co_yield context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8);
+                co_yield context->mem()->storeGlobal(v_offset_2, v_value_2, 0, 8);
+            };
+        }
+
+        void setTensorArguments(CommandArguments&                  commandArgs,
+                                GEMMProblem const&                 problem,
+                                rocRollerTest::Graphs::GEMM const& gemm,
+                                CommandKernelPtr const&            commandKernelPtr) const
+        {
+            // calling setArgument - needed
+            TensorDescriptor descA(
+                gemm.m_ta, {size_t(problem.m), size_t(problem.k)}, problem.transA);
+            TensorDescriptor descB(
+                gemm.m_tb, {size_t(problem.k), size_t(problem.n)}, problem.transB);
+            TensorDescriptor descC(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N");
+            TensorDescriptor descD(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N");
+
+            // Note that actually large matrix hipMalloc is not needed.
+            // But at the same time, larger malloc may incurr larger base address, which can lead to overflow.
+            // ?? how to get float from m_dt? gemm.m_ta?
+            auto deviceA = make_shared_device<float /* came from DataType::Float for now.*/>();
+            auto deviceB = make_shared_device<float /* came from DataType::Float for now.*/>();
+            auto deviceC = make_shared_device<float /* came from DataType::Float for now.*/>();
+            auto deviceD = make_shared_device<float /* came from DataType::Float for now.*/>();
+
+            // Note that gemm built the CommandGraph, and store OperationTags. We are reusing
+            // that original Command, but getting away building KernelGraph and its lowering.
+            setCommandTensorArg(commandArgs, gemm.m_tagTensorA, descA, deviceA.get());
+            setCommandTensorArg(commandArgs, gemm.m_tagTensorB, descB, deviceB.get());
+            setCommandTensorArg(commandArgs, gemm.m_tagTensorC, descC, deviceC.get());
+            setCommandTensorArg(commandArgs, gemm.m_tagTensorD, descD, deviceD.get());
+
+            commandArgs.setArgument(gemm.m_tagScalarAlpha, ArgumentType::Value, problem.alpha);
+            commandArgs.setArgument(gemm.m_tagScalarBeta, ArgumentType::Value, problem.beta);
+            // seed doesn't seem to be relevant currently.
+            //if(seed.has_value())
+            //    commandArgs.setArgument(gemm.m_tagScalarSeed, ArgumentType::Value, seed.value());
+
+            // Create scratch space
+            if(problem.streamK)
+            {
+                commandArgs.setArgument(gemm.m_tagNumWGs, ArgumentType::Value, problem.numWGs);
+            }
+
+            // ?? Is it correct to use original commandKernel or addrTestCommandKernel
+            auto scratchSpaceRequired
+                = commandKernelPtr->scratchSpaceRequired(commandArgs.runtimeArguments());
+            auto deviceScratch = make_shared_device<uint8_t>(scratchSpaceRequired, 0);
+            commandArgs.setArgument(gemm.m_tagScratch, ArgumentType::Value, deviceScratch.get());
+        }
+
+        void generateOrigKernelByProlog()
+        {
+            m_commandKernel = std::make_shared<CommandKernel>(m_gemmGraph.getCommand(), "");
+            m_commandKernel->setContext(m_context);
+            m_commandKernel->setCommandParameters(m_gemmGraph.getCommandParameters());
+
+            Log::debug("lazyAddArguments: {}", m_context->kernelOptions().lazyAddArguments);
+
+            // Add extra kargs for testing
+            m_rvTag = m_commandKernel->getCommand()->allocateTag();
+            m_commandKernel->getCommand()->allocateArgument(
+                VariableType(DataType::UInt64, PointerType::PointerGlobal),
+                m_rvTag,
+                ArgumentType::Value,
+                DataDirection::WriteOnly,
+                "rv_ptr");
+
+            m_rvTag2 = m_commandKernel->getCommand()->allocateTag();
+            m_commandKernel->getCommand()->allocateArgument(
+                VariableType(DataType::UInt64, PointerType::PointerGlobal),
+                m_rvTag2,
+                ArgumentType::Value,
+                DataDirection::WriteOnly,
+                "rv_ptr2");
+
+            m_commandKernel->generateKernelGraphOnlyAfterTransforms();
+
+            auto k = m_context->kernel();
+            m_context->schedule(k->preamble());
+            {
+                { // Original Command Arguments
+                    auto commandArguments = m_commandKernel->getCommand()->getArguments();
+                    for(auto arg : commandArguments)
+                        Log::debug("Original Arg: {}", arg->toString());
+                }
+
+                { // workitemcount
+                    for(auto wit : k->workitemCount())
+                        Log::debug("Original Workitem expr: {}", toString(wit));
+                }
+            }
+
+            m_commandKernel->lowerToKernelArguments();
+            auto commandParameters = m_commandKernel->getCommandParameters();
+            auto workgroupSize     = commandParameters->getManualWorkgroupSize();
+            CHECK(workgroupSize.has_value());
+
+            { // workitemcount
+                for(auto wit : k->workitemCount())
+                    Log::debug("== Original Workitem expr: {}", toString(wit));
+            }
+
+            m_context->schedule(k->prolog());
+        }
+
+        void launchKernelAndCopyBackToHost()
+        {
+            // Setting workitem counts before launching
+            // Notice that addrTestCommandKernel's generateKernel() won't be called.
+            auto launch = std::make_shared<CommandLaunchParameters>();
+
+            // See the comments down there. Computation of workitemCount is redundant.
+            // "totalWorkitemCounts" is for debugging or logging.
+            std::tie(m_workitemCount, m_totalWorkitemCount)
+                = getWorkItemCount(m_commandKernel->getCommandParameters(), m_problem);
+
+            // Notice that without following line of "setManualWorkitemCount".
+            //    launch->setManualWorkitemCount(workitemCount);
+            // - workitemcounts were computed correctly
+            // within the workitemcount_kernelbody's execution. That, I think, is because
+            // The original graph already had workitemcount expressions as a function of
+            // input tensor sizes, one of the commandArguments. (e.g. Tensor_0_size_0_8)
+            // Thus, only the command arguments are set, workitemcounts can be computed.
+            // In ohter words, this manual computation of workitemcounts and setting by
+            // setManualWorkitemCount is not needed. Still, "getWorkItemCount" is kept
+            // for debugging purposes. In order to allocate device and host memory for storing
+            // results of kernel's execution, we still need a concrete number of allocation sizes.
+
+            m_commandKernel->setLaunchParameters(launch);
+
+            // What is the right check here for GPU??
+            //if (isLocalDevice()) {
+            // addrTestCommandKernel reused original Command, for which OperationTags are stored
+            // in gemm, a GEMM object.
+            CommandArguments commandArgs = m_commandKernel->getCommand()->createArguments();
+            setTensorArguments(commandArgs, m_problem, m_gemmGraph, m_commandKernel);
+
+            auto outputSize = m_totalWorkitemCount;
+            auto rvPointer  = make_shared_device<uint64_t>(outputSize, 0);
+            commandArgs.setArgument(m_rvTag, ArgumentType::Value, rvPointer.get());
+
+            auto rvPointer2 = make_shared_device<uint64_t>(outputSize, 0);
+            commandArgs.setArgument(m_rvTag2, ArgumentType::Value, rvPointer2.get());
+
+            m_commandKernel->launchKernel(commandArgs.runtimeArguments());
+
+            m_hostBuffer.resize(outputSize, 10);
+            CHECK_THAT(hipMemcpy(m_hostBuffer.data(),
+                                 rvPointer.get(),
+                                 sizeof(uint64_t) * outputSize,
+                                 hipMemcpyDefault),
+                       HasHipSuccess(0));
+
+            m_hostBuffer2.resize(outputSize, 20);
+            CHECK_THAT(hipMemcpy(m_hostBuffer2.data(),
+                                 rvPointer2.get(),
+                                 sizeof(uint64_t) * outputSize,
+                                 hipMemcpyDefault),
+                       HasHipSuccess(0));
+
+            // check hostBuffer's value
+            Log::debug("outputSize: {}", outputSize);
+        }
+
+        void test_implicit_workitemcount()
+        {
+            generateOrigKernelByProlog();
+
+            auto k = m_context->kernel();
+            m_context->schedule(kb_implicit_workitemcount(
+                m_context,
+                k->workitemCount()[0],
+                k->workitemCount()[1],
+                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+
+            m_context->schedule(k->postamble());
+            m_context->schedule(k->amdgpu_metadata());
+
+            launchKernelAndCopyBackToHost();
+
+            // Remove ":" and subsequent parts to extract only leading literals.
+            auto const& host_x_string = toString(m_workitemCount[0]);
+            size_t      del1          = host_x_string.find_first_of(":");
+            auto const& host_x        = host_x_string.substr(0, del1);
+
+            auto const& host_y_string = toString(m_workitemCount[1]);
+            del1                      = host_y_string.find_first_of(":");
+            auto const& host_y        = host_y_string.substr(0, del1);
+
+            for(int i = 0, size = m_hostBuffer.size(); i < m_totalWorkitemCount; i++)
+            {
+                // For 128 by 128 output matrix, workgroupCount computed is {512, 2, 1}
+                if(toString(m_hostBuffer[i]) != host_x || toString(m_hostBuffer2[i]) != host_y)
+                {
+                    std::cout
+                        << "workitemCount.x and workitemCount.y in kernel for global workitem " << i
+                        << ": " << m_hostBuffer[i] << ", " << m_hostBuffer2[i] << "\n";
+                    std::cout << "workitemCount.x and workitemCount.y in host for global workitem "
+                              << i << ": " << host_x << ", " << host_y << "\n";
+                }
+                CHECK(toString(m_hostBuffer[i]) == host_x);
+                CHECK(toString(m_hostBuffer2[i]) == host_y);
+            }
+        }
+
+        void test_sanity_indices()
+        {
+            generateOrigKernelByProlog();
+
+            auto k = m_context->kernel();
+
+            m_context->schedule(kb_sanity_indices(
+                m_context,
+                k->workitemCount(),
+                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+
+            m_context->schedule(k->postamble());
+            m_context->schedule(k->amdgpu_metadata());
+
+            launchKernelAndCopyBackToHost();
+
+            for(int i = 0; i < m_totalWorkitemCount; i++)
+                Log::debug("wgx {} thx {}", m_hostBuffer[i], m_hostBuffer2[i]);
+
+            // Remove ":" and subsequent parts to extract only leading literals.
+            auto const& host_x_string   = toString(m_workitemCount[0]);
+            size_t      del1            = host_x_string.find_first_of(":");
+            auto const& host_x          = host_x_string.substr(0, del1);
+            auto        workitemcount_x = std::stoi(host_x);
+
+            auto const& host_y_string   = toString(m_workitemCount[1]);
+            del1                        = host_y_string.find_first_of(":");
+            auto const& host_y          = host_y_string.substr(0, del1);
+            auto        workitemcount_y = std::stoi(host_y);
+
+            Log::debug("workitemcount_x {} workitemcount_y {}", workitemcount_x, workitemcount_y);
+
+            auto workgroupsize_x
+                = ((m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())[0];
+            for(int rows = 0, numRows = workitemcount_y; rows < numRows; rows++)
+            {
+                auto blockIdx_y = rows % workgroupsize_x;
+                auto base       = rows * workitemcount_x;
+                for(int cols = 0, numCols = workitemcount_x; cols < numCols; cols++)
+                {
+                    auto blockIdx_x  = cols / workgroupsize_x;
+                    auto threadIdx_x = cols % workgroupsize_x;
+
+                    auto linearIdx = cols + base;
+
+                    CHECK(m_hostBuffer[linearIdx] == blockIdx_x);
+                    CHECK(m_hostBuffer2[linearIdx] == threadIdx_x);
+                }
+            }
+        }
+
+        void test_equal_one()
+        {
+            generateOrigKernelByProlog();
+
+            // Get the expression to compute
+            std::vector<Expression::ExpressionPtr> indexExprPtrs;
+            indexExprPtrs = AddressTrace(m_commandKernel->getKernelGraph(), m_context)
+                                .traceComputeIndexWithBuffer();
+            std::vector<Expression::ExpressionPtr> widenedExprPtrs;
+            for(int i = 0, size = indexExprPtrs.size(); i < size; i++)
+            {
+                auto eptr = indexExprPtrs[i];
+                Log::debug("== Expr : {} ", toString(eptr));
+
+                widenedExprPtrs.push_back(Expression::widenTo64bit(eptr));
+                Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
+            }
+
+            auto k = m_context->kernel();
+            m_context->schedule(kb_equal_one(
+                m_context,
+                indexExprPtrs[0],
+                widenedExprPtrs[0],
+                k->workitemCount(),
+                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+
+            m_context->schedule(k->postamble());
+            m_context->schedule(k->amdgpu_metadata());
+
+            launchKernelAndCopyBackToHost();
+
+            // for test_kernelbody
+            for(int i = 0, size = m_hostBuffer.size(); i < size; i++)
+            {
+                if(m_hostBuffer[i] != m_hostBuffer2[i])
+                {
+                    Log::debug("diff at {}: {} {}", i, m_hostBuffer[i], m_hostBuffer2[i]);
+                }
+                CHECK(m_hostBuffer[i] == m_hostBuffer2[i]);
+            }
+        }
+
+        void test_equal()
+        {
+            generateOrigKernelByProlog();
+
+            // Get the expression to compute
+            std::vector<Expression::ExpressionPtr> indexExprPtrs;
+            indexExprPtrs = AddressTrace(m_commandKernel->getKernelGraph(), m_context)
+                                .traceComputeIndexWithBuffer();
+            std::vector<Expression::ExpressionPtr> widenedExprPtrs;
+            for(int i = 0, size = indexExprPtrs.size(); i < size; i++)
+            {
+                auto eptr = indexExprPtrs[i];
+                Log::debug("== Expr : {} ", toString(eptr));
+
+                widenedExprPtrs.push_back(Expression::widenTo64bit(eptr));
+                Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
+            }
+
+            m_context->schedule(kb_equal(
+                m_context,
+                indexExprPtrs,
+                widenedExprPtrs,
+                m_context->kernel()->workitemCount(),
+                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+
+            auto k = m_context->kernel();
+            m_context->schedule(k->postamble());
+            m_context->schedule(k->amdgpu_metadata());
+
+            launchKernelAndCopyBackToHost();
+
+            for(int i = 0, size = m_hostBuffer.size(); i < size; i++)
+            {
+                if(m_hostBuffer[i] != 0xFFFFFFFF)
+                {
+                    std::cout << "The addresses are not same at " << i << " " << m_hostBuffer[i]
+                              << "\n";
+                }
+                CHECK(m_hostBuffer[i] == 0xFFFFFFFF);
+            }
+
+            // hipFree will be taken care of by make_shared_device
+        }
+
+    private:
+        ContextPtr                  m_context;
+        CommandKernelPtr            m_commandKernel;
+        GEMMProblem const&          m_problem;
+        rocRollerTest::Graphs::GEMM m_gemmGraph;
+
+        rocRoller::Operations::OperationTag m_rvTag;
+        rocRoller::Operations::OperationTag m_rvTag2;
+
+        // For checking results on host-side
+        std::array<Expression::ExpressionPtr, 3> m_workitemCount;
+        uint                                     m_totalWorkitemCount;
+        std::vector<uint64_t>                    m_hostBuffer;
+        std::vector<uint64_t>                    m_hostBuffer2;
+    };
+
+    TEST_CASE("address calculation test", "[expression][gpu][equal][float][128_128]")
+    {
+        auto context = TestContext::ForTestDevice();
+
+        GEMMProblem                 problem{.m = 128, .n = 128};
+        rocRollerTest::Graphs::GEMM gemm(DataType::Float);
+        gemm.setProblem(problem);
+
+        AddressCalculationTest kernel(context.get(), problem, gemm);
+        kernel.test_equal();
+    }
+
+    TEST_CASE("address calculation test", "[expression][gpu][equal][float][512_512]")
+    {
+        auto context = TestContext::ForTestDevice();
+
+        GEMMProblem                 problem{.m = 512, .n = 512};
+        rocRollerTest::Graphs::GEMM gemm(DataType::Float);
+        gemm.setProblem(problem);
+
+        AddressCalculationTest kernel(context.get(), problem, gemm);
+        kernel.test_equal();
+    }
+
+    TEST_CASE("address calculation test", "[expression][gpu][equal_one][float][128_128]")
+    {
+        auto context = TestContext::ForTestDevice({}, "equal_one_128x128");
+
+        GEMMProblem                 problem{.m = 128, .n = 128};
+        rocRollerTest::Graphs::GEMM gemm(DataType::Float);
+        gemm.setProblem(problem);
+
+        AddressCalculationTest kernel(context.get(), problem, gemm);
+        kernel.test_equal_one();
+    }
+
+    TEST_CASE("address calculation test", "[expression][gpu][implicit_workitemcount]")
+    {
+        auto context = TestContext::ForTestDevice({}, "impl_workitemcnt");
+
+        GEMMProblem                 problem{.m = 128, .n = 128};
+        rocRollerTest::Graphs::GEMM gemm(DataType::Float);
+        gemm.setProblem(problem);
+
+        AddressCalculationTest kernel(context.get(), problem, gemm);
+        kernel.test_implicit_workitemcount();
+    }
+
+    TEST_CASE("address calculation test", "[expression][gpu][sanity_indices]")
+    {
+        auto context = TestContext::ForTestDevice({}, "sanity_indices");
+
+        GEMMProblem                 problem{.m = 128, .n = 128};
+        rocRollerTest::Graphs::GEMM gemm(DataType::Float);
+        gemm.setProblem(problem);
+
+        AddressCalculationTest kernel(context.get(), problem, gemm);
+        kernel.test_sanity_indices();
+    }
+}

From 33c9d1341f0c35c944083eaee48b65976f04aebf Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Wed, 5 Mar 2025 01:49:42 +0000
Subject: [PATCH 07/32] Move the location of test utility from
 ExpressionTransform

---
 lib/CMakeLists.txt                            |   1 -
 .../rocRoller/ExpressionTransformations.hpp   |  10 -
 .../WidenTo64bit.cpp                          | 262 -----------------
 test/CMakeLists.txt                           |   6 +-
 test/catch/AddressCalculationTest.cpp         |  10 +-
 test/common/WidenTo64bit.cpp                  | 264 ++++++++++++++++++
 6 files changed, 272 insertions(+), 281 deletions(-)
 delete mode 100644 lib/source/ExpressionTransformations/WidenTo64bit.cpp
 create mode 100644 test/common/WidenTo64bit.cpp

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 77e2dbe9..805b7c86 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -75,7 +75,6 @@ set(rocroller_src
     source/ExpressionTransformations/LowerPRNG.cpp
     source/ExpressionTransformations/RestoreCommandArguments.cpp
     source/ExpressionTransformations/Simplify.cpp
-    source/ExpressionTransformations/WidenTo64bit.cpp
 
     source/CodeGen/AddInstruction.cpp
     source/CodeGen/ArgumentLoader.cpp
diff --git a/lib/include/rocRoller/ExpressionTransformations.hpp b/lib/include/rocRoller/ExpressionTransformations.hpp
index 55a3a07f..e4a730ec 100644
--- a/lib/include/rocRoller/ExpressionTransformations.hpp
+++ b/lib/include/rocRoller/ExpressionTransformations.hpp
@@ -135,15 +135,5 @@ namespace rocRoller
          * @return ExpressionPtr Transformed expression
          */
         ExpressionPtr lowerBitfieldValues(ExpressionPtr expr);
-
-        /**
-         * @brief Widen (u)int32 to (u)int64.
-         *
-         * Has many assumptions in input expr. See the implementation for details.
-         *
-         * @param expr Input expression
-         * @return ExpressionPtr Transformed expression
-         */
-        ExpressionPtr widenTo64bit(ExpressionPtr expr);
     }
 }
diff --git a/lib/source/ExpressionTransformations/WidenTo64bit.cpp b/lib/source/ExpressionTransformations/WidenTo64bit.cpp
deleted file mode 100644
index f43b07d4..00000000
--- a/lib/source/ExpressionTransformations/WidenTo64bit.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-#include <rocRoller/AssemblyKernelArgument.hpp>
-#include <rocRoller/Expression.hpp>
-#include <rocRoller/ExpressionTransformations.hpp>
-
-template <typename T>
-constexpr auto cast_to_unsigned(T val)
-{
-    return static_cast<typename std::make_unsigned<T>::type>(val);
-}
-
-namespace rocRoller
-{
-    namespace Expression
-    {
-        struct WidenTo64BitVisitor
-        {
-
-            ExpressionPtr operator()(Convert const& expr) const
-            {
-                Convert cpy = expr;
-                if(expr.arg)
-                {
-                    // Here is an assumption that call(cpy.arg) never goes above 64-bit and
-                    // input convert's destination types are either int32, uint32, int64 or uint64.
-                    // resultVaribleType(expr.arg) is not called intentionally as it will visit the
-                    // subtree of expr.arg again.
-                    // We can also import similar logic from ExpressionResultTypeVisitor and
-                    // make the operator()(...) return a pair of ExpressionPtr and VariableType
-                    // in order to avoid repeated visit of the subtree of cpy.arg.
-                    cpy.arg = call(expr.arg);
-                    if(expr.destinationType == DataType::UInt32)
-                        return convert(DataType::UInt64, cpy.arg);
-                    else if(expr.destinationType == DataType::Int32)
-                        return convert(DataType::Int64, cpy.arg);
-                    else if(expr.destinationType == DataType::UInt64
-                            || expr.destinationType == DataType::Int64)
-                        return convert(expr.destinationType, cpy.arg);
-                    else
-                    {
-                        AssertFatal(
-                            false,
-                            "Expected Destination type for a convert is either (u)int{32,64}");
-                        return nullptr;
-                    }
-                }
-
-                return std::make_shared<Expression>(cpy);
-            }
-
-            ExpressionPtr operator()(Negate const& expr) const
-            {
-                Negate cpy = expr;
-                if(expr.arg)
-                {
-                    cpy.arg = call(expr.arg);
-                }
-
-                return std::make_shared<Expression>(cpy);
-            }
-
-            template <CUnary Expr>
-            ExpressionPtr operator()(Expr const& expr) const
-            {
-                // Only Convert and perhaps negates are expected.
-                AssertFatal(false, "Unexpected Unary expression", ShowValue(expr));
-                return nullptr;
-            }
-
-            ExpressionPtr operator()(Divide const& expr) const
-            {
-                // TODO: Add check that divisor is a CValue within (u)int32-bit
-                Log::debug("Divisor: {} ", toString(expr.rhs));
-
-                Divide cpy = expr;
-                if(expr.lhs)
-                    cpy.lhs = call(expr.lhs);
-
-                return std::make_shared<Expression>(cpy);
-            }
-
-            ExpressionPtr operator()(Modulo const& expr) const
-            {
-                // TODO: Add check that divisor is a CValue within (u)int32-bit
-                Log::debug("Modulo: {} ", toString(expr.rhs));
-
-                Modulo cpy = expr;
-                if(expr.lhs)
-                    cpy.lhs = call(expr.lhs);
-
-                return std::make_shared<Expression>(cpy);
-            }
-
-            // TODO: for some shifts, logical, subtract operations these may not be correct.
-            //       Not sure yet if address calculation expressions with identity_transducer
-            //       those types of Expression or not.
-            template <CBinary Expr>
-            requires(CArithmetic<Expr>) ExpressionPtr operator()(Expr const& expr) const
-            {
-                if constexpr(std::same_as<Expr, Subtract>)
-                {
-                    AssertFatal(false, "Subtracts are not expected");
-                    return nullptr;
-                }
-
-                if constexpr(CLogical<Expr>)
-                {
-                    AssertFatal(false, "logicals are not expected");
-                    return nullptr;
-                }
-
-                Expr cpy = expr;
-                if(expr.lhs)
-                    cpy.lhs = call(expr.lhs);
-                if(expr.rhs)
-                    cpy.rhs = call(expr.rhs);
-
-                return std::make_shared<Expression>(cpy);
-            }
-
-            // Even with catch-all operator()(ExpressionPtr) without following,
-            // compilation fails.
-            template <typename Expr>
-            requires(CBinary<Expr>) ExpressionPtr operator()(Expr const& expr) const
-            {
-                AssertFatal(false, "Not expected expr : ", ShowValue(expr));
-                return nullptr;
-            }
-
-            template <CTernary Expr>
-            requires(CArithmetic<Expr>) ExpressionPtr operator()(Expr const& expr) const
-            {
-                Expr cpy = expr;
-                if(expr.lhs)
-                    cpy.lhs = call(expr.lhs);
-                if(expr.r1hs)
-                    cpy.r1hs = call(expr.r1hs);
-                if(expr.r2hs)
-                    cpy.r2hs = call(expr.r2hs);
-
-                return std::make_shared<Expression>(cpy);
-            }
-
-            template <typename Expr>
-            requires(CTernary<Expr>) ExpressionPtr operator()(Expr const& expr) const
-            {
-                AssertFatal(false, "Not expected expr : ", ShowValue(expr));
-                return nullptr;
-            }
-
-            // leaves
-            ExpressionPtr operator()(CommandArgumentPtr const& expr) const
-            {
-                Log::debug("CommandArgumentPtr {}", toString(expr));
-
-                auto varType = expr->variableType();
-
-                assertIfNotExpectedType(varType.dataType, toString(expr));
-
-                CommandArgumentPtr cpy = expr;
-                return widenTo64(varType.dataType, cpy);
-            }
-
-            ExpressionPtr operator()(CommandArgumentValue const& expr) const
-            {
-                Log::debug("CommandArgumentValue {} Type {} ",
-                           toString(expr),
-                           toString(variableType(expr)));
-
-                auto varType = variableType(expr);
-
-                assertIfNotExpectedType(varType.dataType, toString(expr));
-
-                CommandArgumentValue cpy = expr;
-                return widenTo64(varType.dataType, cpy);
-            }
-
-            ExpressionPtr operator()(Register::ValuePtr const& expr) const
-            {
-                Log::debug("Register::ValuePtr {}", toString(expr));
-
-                auto varType = expr->variableType();
-
-                assertIfNotExpectedType(varType.dataType, toString(expr));
-
-                Register::ValuePtr cpy = expr;
-                return widenTo64(varType.dataType, cpy);
-            }
-
-            ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const
-            {
-                Log::debug("AssemblyKernelArgumentPtr {} its expression is {}",
-                           toString(expr),
-                           toString(expr->expression));
-
-                auto varType = expr->variableType;
-
-                assertIfNotExpectedType(varType.dataType, toString(expr));
-
-                AssemblyKernelArgumentPtr cpy = expr;
-                return widenTo64(varType.dataType, cpy);
-            }
-
-            // catch the rest CValue
-            template <CValue Value>
-            ExpressionPtr operator()(Value const& expr) const
-            {
-                AssertFatal(
-                    false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr));
-                return nullptr;
-            }
-
-            ExpressionPtr operator()(Expression const& expr) const
-            {
-                AssertFatal(
-                    false, "No expectation to meet this type of Expression: ", ShowValue(expr));
-                return nullptr;
-            }
-
-            template <typename T>
-            ExpressionPtr widenTo64(DataType srcType, T const& expr) const
-            {
-                if(srcType == DataType::UInt32)
-                    return convert(DataType::UInt64, std::make_shared<Expression>(expr));
-                else if(srcType == DataType::Int32)
-                    return convert(DataType::Int64, std::make_shared<Expression>(expr));
-
-                return std::make_shared<Expression>(expr);
-            }
-
-            ExpressionPtr call(ExpressionPtr const& expr) const
-            {
-                return std::visit(*this, *expr);
-            }
-
-            void assertIfNotExpectedType(DataType dt, std::string const& showValue) const
-            {
-                AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32
-                                || dt == DataType::UInt64 || dt == DataType::Int64,
-                            "Unexpected DataType for Command/Kernel arguments or "
-                            "workgroup/item indices ",
-                            showValue);
-            }
-        };
-
-        ExpressionPtr widenTo64bit(ExpressionPtr expr)
-        {
-            auto origVarType = resultVariableType(expr);
-
-            auto visitor = WidenTo64BitVisitor();
-            auto widened = visitor.call(expr);
-
-            auto finalVarType = resultVariableType(widened);
-
-            AssertFatal(origVarType.dataType == finalVarType.dataType,
-                        "Original and final data types should be the same",
-                        ShowValue(origVarType.dataType),
-                        ShowValue(finalVarType.dataType));
-
-            return widened;
-        }
-    }
-}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 2caf0905..e9901587 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -28,7 +28,11 @@ target_include_directories(
 )
 
 add_library(
-    common_test_utilities OBJECT common/CommonGraphs.cpp common/Utilities.cpp common/mxDataGen.cpp
+	common_test_utilities OBJECT 
+	common/CommonGraphs.cpp 
+	common/Utilities.cpp 
+	common/mxDataGen.cpp 
+	common/WidenTo64bit.cpp
 )
 
 target_compile_options(
diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index c60e6494..f72c4766 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -4,10 +4,10 @@
 #include <common/CommonGraphs.hpp>
 #include <common/GEMMProblem.hpp>
 #include <common/Utilities.hpp>
+#include <common/WidenTo64bit.hpp>
 #include <rocRoller/CodeGen/ArgumentLoader.hpp>
 #include <rocRoller/CodeGen/MemoryInstructions.hpp>
 #include <rocRoller/CodeGen/WaitCount.hpp>
-#include <rocRoller/ExpressionTransformations.hpp>
 #include <rocRoller/KernelGraph/CoordinateGraph/Transformer.hpp>
 #include <rocRoller/KernelGraph/KernelGraph.hpp>
 #include <rocRoller/KernelGraph/Utils.hpp>
@@ -610,10 +610,6 @@ namespace AddressCalculationTest
 
             m_commandKernel->setLaunchParameters(launch);
 
-            // What is the right check here for GPU??
-            //if (isLocalDevice()) {
-            // addrTestCommandKernel reused original Command, for which OperationTags are stored
-            // in gemm, a GEMM object.
             CommandArguments commandArgs = m_commandKernel->getCommand()->createArguments();
             setTensorArguments(commandArgs, m_problem, m_gemmGraph, m_commandKernel);
 
@@ -750,7 +746,7 @@ namespace AddressCalculationTest
                 auto eptr = indexExprPtrs[i];
                 Log::debug("== Expr : {} ", toString(eptr));
 
-                widenedExprPtrs.push_back(Expression::widenTo64bit(eptr));
+                widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr));
                 Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
             }
 
@@ -792,7 +788,7 @@ namespace AddressCalculationTest
                 auto eptr = indexExprPtrs[i];
                 Log::debug("== Expr : {} ", toString(eptr));
 
-                widenedExprPtrs.push_back(Expression::widenTo64bit(eptr));
+                widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr));
                 Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
             }
 
diff --git a/test/common/WidenTo64bit.cpp b/test/common/WidenTo64bit.cpp
new file mode 100644
index 00000000..7758628c
--- /dev/null
+++ b/test/common/WidenTo64bit.cpp
@@ -0,0 +1,264 @@
+#include <common/WidenTo64bit.hpp>
+#include <rocRoller/AssemblyKernelArgument.hpp>
+#include <rocRoller/Expression.hpp>
+
+template <typename T>
+constexpr auto cast_to_unsigned(T val)
+{
+    return static_cast<typename std::make_unsigned<T>::type>(val);
+}
+
+namespace rocRollerTest
+{
+
+    using namespace rocRoller;
+
+    struct WidenTo64BitVisitor
+    {
+
+        Expression::ExpressionPtr operator()(Expression::Convert const& expr) const
+        {
+            Expression::Convert cpy = expr;
+            if(expr.arg)
+            {
+                // Here is an assumption that call(cpy.arg) never goes above 64-bit and
+                // input convert's destination types are either int32, uint32, int64 or uint64.
+                // resultVaribleType(expr.arg) is not called intentionally as it will visit the
+                // subtree of expr.arg again.
+                // We can also import similar logic from ExpressionResultTypeVisitor and
+                // make the operator()(...) return a pair of Expression::ExpressionPtr and VariableType
+                // in order to avoid repeated visit of the subtree of cpy.arg.
+                cpy.arg = call(expr.arg);
+                if(expr.destinationType == DataType::UInt32)
+                    return convert(DataType::UInt64, cpy.arg);
+                else if(expr.destinationType == DataType::Int32)
+                    return convert(DataType::Int64, cpy.arg);
+                else if(expr.destinationType == DataType::UInt64
+                        || expr.destinationType == DataType::Int64)
+                    return convert(expr.destinationType, cpy.arg);
+                else
+                {
+                    AssertFatal(false,
+                                "Expected Destination type for a convert is either (u)int{32,64}");
+                    return nullptr;
+                }
+            }
+
+            return std::make_shared<Expression::Expression>(cpy);
+        }
+
+        Expression::ExpressionPtr operator()(Expression::Negate const& expr) const
+        {
+            Expression::Negate cpy = expr;
+            if(expr.arg)
+            {
+                cpy.arg = call(expr.arg);
+            }
+
+            return std::make_shared<Expression::Expression>(cpy);
+        }
+
+        template <Expression::CUnary Expr>
+        Expression::ExpressionPtr operator()(Expr const& expr) const
+        {
+            // Only Expression::Convert and perhaps negates are expected.
+            AssertFatal(false, "Unexpected Unary expression", ShowValue(expr));
+            return nullptr;
+        }
+
+        Expression::ExpressionPtr operator()(Expression::Divide const& expr) const
+        {
+            // TODO: Add check that divisor is a CValue within (u)int32-bit
+            Log::debug("Divisor: {} ", toString(expr.rhs));
+
+            Expression::Divide cpy = expr;
+            if(expr.lhs)
+                cpy.lhs = call(expr.lhs);
+
+            return std::make_shared<Expression::Expression>(cpy);
+        }
+
+        Expression::ExpressionPtr operator()(Expression::Modulo const& expr) const
+        {
+            // TODO: Add check that divisor is a CValue within (u)int32-bit
+            Log::debug("Modulo: {} ", toString(expr.rhs));
+
+            Expression::Modulo cpy = expr;
+            if(expr.lhs)
+                cpy.lhs = call(expr.lhs);
+
+            return std::make_shared<Expression::Expression>(cpy);
+        }
+
+        // TODO: for some shifts, logical, subtract operations these may not be correct.
+        //       Not sure yet if address calculation expressions with identity_transducer
+        //       those types of Expression or not.
+        template <Expression::CBinary Expr>
+        requires(Expression::CArithmetic<Expr>) Expression::ExpressionPtr
+            operator()(Expr const& expr) const
+        {
+            if constexpr(std::same_as<Expr, Expression::Subtract>)
+            {
+                AssertFatal(false, "Subtracts are not expected");
+                return nullptr;
+            }
+
+            if constexpr(Expression::CLogical<Expr>)
+            {
+                AssertFatal(false, "logicals are not expected");
+                return nullptr;
+            }
+
+            Expr cpy = expr;
+            if(expr.lhs)
+                cpy.lhs = call(expr.lhs);
+            if(expr.rhs)
+                cpy.rhs = call(expr.rhs);
+
+            return std::make_shared<Expression::Expression>(cpy);
+        }
+
+        // Even with catch-all operator()(Expression::ExpressionPtr) without following,
+        // compilation fails.
+        template <typename Expr>
+        requires(Expression::CBinary<Expr>) Expression::ExpressionPtr
+            operator()(Expr const& expr) const
+        {
+            AssertFatal(false, "Not expected expr : ", ShowValue(expr));
+            return nullptr;
+        }
+
+        template <Expression::CTernary Expr>
+        requires(Expression::CArithmetic<Expr>) Expression::ExpressionPtr
+            operator()(Expr const& expr) const
+        {
+            Expr cpy = expr;
+            if(expr.lhs)
+                cpy.lhs = call(expr.lhs);
+            if(expr.r1hs)
+                cpy.r1hs = call(expr.r1hs);
+            if(expr.r2hs)
+                cpy.r2hs = call(expr.r2hs);
+
+            return std::make_shared<Expression::Expression>(cpy);
+        }
+
+        template <typename Expr>
+        requires(Expression::CTernary<Expr>) Expression::ExpressionPtr
+            operator()(Expr const& expr) const
+        {
+            AssertFatal(false, "Not expected expr : ", ShowValue(expr));
+            return nullptr;
+        }
+
+        // leaves
+        Expression::ExpressionPtr operator()(CommandArgumentPtr const& expr) const
+        {
+            Log::debug("CommandArgumentPtr {}", Expression::toString(expr));
+
+            auto varType = expr->variableType();
+
+            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
+
+            CommandArgumentPtr cpy = expr;
+            return widenTo64(varType.dataType, cpy);
+        }
+
+        Expression::ExpressionPtr operator()(CommandArgumentValue const& expr) const
+        {
+            Log::debug("CommandArgumentValue {} Type {} ",
+                       Expression::toString(expr),
+                       toString(variableType(expr)));
+
+            auto varType = variableType(expr);
+
+            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
+
+            CommandArgumentValue cpy = expr;
+            return widenTo64(varType.dataType, cpy);
+        }
+
+        Expression::ExpressionPtr operator()(Register::ValuePtr const& expr) const
+        {
+            Log::debug("Register::ValuePtr {}", Expression::toString(expr));
+
+            auto varType = expr->variableType();
+
+            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
+
+            Register::ValuePtr cpy = expr;
+            return widenTo64(varType.dataType, cpy);
+        }
+
+        Expression::ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const
+        {
+            Log::debug("AssemblyKernelArgumentPtr {} its expression is {}",
+                       Expression::toString(expr),
+                       Expression::toString(expr->expression));
+
+            auto varType = expr->variableType;
+
+            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
+
+            AssemblyKernelArgumentPtr cpy = expr;
+            return widenTo64(varType.dataType, cpy);
+        }
+
+        // catch the rest CValue
+        template <Expression::CValue Value>
+        Expression::ExpressionPtr operator()(Value const& expr) const
+        {
+            AssertFatal(
+                false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr));
+            return nullptr;
+        }
+
+        Expression::ExpressionPtr operator()(Expression::Expression const& expr) const
+        {
+            AssertFatal(false, "No expectation to meet this type of Expression: ", ShowValue(expr));
+            return nullptr;
+        }
+
+        template <typename T>
+        Expression::ExpressionPtr widenTo64(DataType srcType, T const& expr) const
+        {
+            if(srcType == DataType::UInt32)
+                return convert(DataType::UInt64, std::make_shared<Expression::Expression>(expr));
+            else if(srcType == DataType::Int32)
+                return convert(DataType::Int64, std::make_shared<Expression::Expression>(expr));
+
+            return std::make_shared<Expression::Expression>(expr);
+        }
+
+        Expression::ExpressionPtr call(Expression::ExpressionPtr const& expr) const
+        {
+            return std::visit(*this, *expr);
+        }
+
+        void assertIfNotExpectedType(DataType dt, std::string const& showValue) const
+        {
+            AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32 || dt == DataType::UInt64
+                            || dt == DataType::Int64,
+                        "Unexpected DataType for Command/Kernel arguments or "
+                        "workgroup/item indices ",
+                        showValue);
+        }
+    };
+
+    Expression::ExpressionPtr widenTo64bit(Expression::ExpressionPtr expr)
+    {
+        auto origVarType = resultVariableType(expr);
+
+        auto visitor = WidenTo64BitVisitor();
+        auto widened = visitor.call(expr);
+
+        auto finalVarType = resultVariableType(widened);
+
+        AssertFatal(origVarType.dataType == finalVarType.dataType,
+                    "Original and final data types should be the same",
+                    ShowValue(origVarType.dataType),
+                    ShowValue(finalVarType.dataType));
+
+        return widened;
+    }
+}

From 05f9cc5a929e0b7ce8676c6cbee0ceb3a2b69a21 Mon Sep 17 00:00:00 2001
From: Choi <yoonchoi@ctr2-alola-login-01.amd.com>
Date: Wed, 5 Mar 2025 13:55:43 -0500
Subject: [PATCH 08/32] Adding missing header file

---
 test/common/common/WidenTo64bit.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 test/common/common/WidenTo64bit.hpp

diff --git a/test/common/common/WidenTo64bit.hpp b/test/common/common/WidenTo64bit.hpp
new file mode 100644
index 00000000..33a84085
--- /dev/null
+++ b/test/common/common/WidenTo64bit.hpp
@@ -0,0 +1,12 @@
+#include <rocRoller/Expression.hpp>
+namespace rocRollerTest {
+/**
+         * @brief Widen (u)int32 to (u)int64.
+         *
+         * Has many assumptions in input expr. See the implementation for details.
+         *
+         * @param expr Input expression
+         * @return ExpressionPtr Transformed expression
+         */
+	rocRoller::Expression::ExpressionPtr widenTo64bit(rocRoller::Expression::ExpressionPtr expr);
+}

From 002aa302a787aaf8af381bca5b4f779fbfae0084 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Thu, 6 Mar 2025 23:19:44 +0000
Subject: [PATCH 09/32] Renaming of member variables to remove CodeQL errors

---
 test/catch/AddressCalculationTest.cpp | 24 ++++++-------
 test/common/CommonGraphs.cpp          | 50 +++++++++++++--------------
 test/common/common/CommonGraphs.hpp   |  8 ++---
 3 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index f72c4766..355260e6 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -490,11 +490,11 @@ namespace AddressCalculationTest
         {
             // calling setArgument - needed
             TensorDescriptor descA(
-                gemm.m_ta, {size_t(problem.m), size_t(problem.k)}, problem.transA);
+                gemm.mTa, {size_t(problem.m), size_t(problem.k)}, problem.transA);
             TensorDescriptor descB(
-                gemm.m_tb, {size_t(problem.k), size_t(problem.n)}, problem.transB);
-            TensorDescriptor descC(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N");
-            TensorDescriptor descD(gemm.m_td, {size_t(problem.m), size_t(problem.n)}, "N");
+                gemm.mTb, {size_t(problem.k), size_t(problem.n)}, problem.transB);
+            TensorDescriptor descC(gemm.mTd, {size_t(problem.m), size_t(problem.n)}, "N");
+            TensorDescriptor descD(gemm.mTd, {size_t(problem.m), size_t(problem.n)}, "N");
 
             // Note that actually large matrix hipMalloc is not needed.
             // But at the same time, larger malloc may incurr larger base address, which can lead to overflow.
@@ -506,13 +506,13 @@ namespace AddressCalculationTest
 
             // Note that gemm built the CommandGraph, and store OperationTags. We are reusing
             // that original Command, but getting away building KernelGraph and its lowering.
-            setCommandTensorArg(commandArgs, gemm.m_tagTensorA, descA, deviceA.get());
-            setCommandTensorArg(commandArgs, gemm.m_tagTensorB, descB, deviceB.get());
-            setCommandTensorArg(commandArgs, gemm.m_tagTensorC, descC, deviceC.get());
-            setCommandTensorArg(commandArgs, gemm.m_tagTensorD, descD, deviceD.get());
+            setCommandTensorArg(commandArgs, gemm.mTagTensorA, descA, deviceA.get());
+            setCommandTensorArg(commandArgs, gemm.mTagTensorB, descB, deviceB.get());
+            setCommandTensorArg(commandArgs, gemm.mTagTensorC, descC, deviceC.get());
+            setCommandTensorArg(commandArgs, gemm.mTagTensorD, descD, deviceD.get());
 
-            commandArgs.setArgument(gemm.m_tagScalarAlpha, ArgumentType::Value, problem.alpha);
-            commandArgs.setArgument(gemm.m_tagScalarBeta, ArgumentType::Value, problem.beta);
+            commandArgs.setArgument(gemm.mTagScalarAlpha, ArgumentType::Value, problem.alpha);
+            commandArgs.setArgument(gemm.mTagScalarBeta, ArgumentType::Value, problem.beta);
             // seed doesn't seem to be relevant currently.
             //if(seed.has_value())
             //    commandArgs.setArgument(gemm.m_tagScalarSeed, ArgumentType::Value, seed.value());
@@ -520,14 +520,14 @@ namespace AddressCalculationTest
             // Create scratch space
             if(problem.streamK)
             {
-                commandArgs.setArgument(gemm.m_tagNumWGs, ArgumentType::Value, problem.numWGs);
+                commandArgs.setArgument(gemm.mTagNumWGs, ArgumentType::Value, problem.numWGs);
             }
 
             // ?? Is it correct to use original commandKernel or addrTestCommandKernel
             auto scratchSpaceRequired
                 = commandKernelPtr->scratchSpaceRequired(commandArgs.runtimeArguments());
             auto deviceScratch = make_shared_device<uint8_t>(scratchSpaceRequired, 0);
-            commandArgs.setArgument(gemm.m_tagScratch, ArgumentType::Value, deviceScratch.get());
+            commandArgs.setArgument(gemm.mTagScratch, ArgumentType::Value, deviceScratch.get());
         }
 
         void generateOrigKernelByProlog()
diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp
index 294b37f4..1449d5c5 100644
--- a/test/common/CommonGraphs.cpp
+++ b/test/common/CommonGraphs.cpp
@@ -182,10 +182,10 @@ namespace rocRollerTest::Graphs
     {
     }
     GEMM::GEMM(DataType ta, DataType tb, DataType tc, DataType td)
-        : m_ta(ta)
-        , m_tb(tb)
-        , m_tc(tc)
-        , m_td(td)
+        : mTa(ta)
+        , mTb(tb)
+        , mTc(tc)
+        , mTd(td)
     {
     }
 
@@ -200,27 +200,27 @@ namespace rocRollerTest::Graphs
                                               ? std::vector<size_t>({(size_t)0, (size_t)1})
                                               : std::vector<size_t>({});
 
-        m_tagTensorA = m_command->addOperation(rocRoller::Operations::Tensor(
-            2, m_ta, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A
+        mTagTensorA = m_command->addOperation(rocRoller::Operations::Tensor(
+            2, mTa, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A
 
-        m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorA));
+        m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorA));
 
-        m_tagTensorB = m_command->addOperation(rocRoller::Operations::Tensor(
-            2, m_tb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B
-        m_tagB       = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorB));
+        mTagTensorB = m_command->addOperation(rocRoller::Operations::Tensor(
+            2, mTb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B
+        m_tagB      = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorB));
 
-        m_tagTensorC
-            = m_command->addOperation(rocRoller::Operations::Tensor(2, m_tc, oneStridesN)); // C
-        m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(m_tagTensorC));
+        mTagTensorC
+            = m_command->addOperation(rocRoller::Operations::Tensor(2, mTc, oneStridesN)); // C
+        m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorC));
 
-        m_tagScalarAlpha
+        mTagScalarAlpha
             = m_command->addOperation(rocRoller::Operations::Scalar(DataType::Float)); // alpha
         auto tagLoadAlpha
-            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(m_tagScalarAlpha));
+            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarAlpha));
 
-        m_tagScalarBeta  = m_command->addOperation(rocRoller::Operations::Scalar(m_tc)); // beta
-        auto tagLoadBeta = m_command->addOperation(
-            rocRoller::Operations::T_Load_Scalar(m_tagScalarBeta)); // beta
+        mTagScalarBeta = m_command->addOperation(rocRoller::Operations::Scalar(mTc)); // beta
+        auto tagLoadBeta
+            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarBeta)); // beta
 
         auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B
 
@@ -244,23 +244,23 @@ namespace rocRollerTest::Graphs
         }
         m_command->addOperation(std::move(execute));
 
-        m_tagTensorD
-            = m_command->addOperation(rocRoller::Operations::Tensor(2, m_td, oneStridesN)); // D
-        m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, m_tagTensorD)); // D
+        mTagTensorD
+            = m_command->addOperation(rocRoller::Operations::Tensor(2, mTd, oneStridesN)); // D
+        m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, mTagTensorD)); // D
 
         if(m_problem.streamK)
         {
-            m_tagNumWGs    = m_command->allocateTag();
+            mTagNumWGs     = m_command->allocateTag();
             auto numWGsArg = m_command->allocateArgument(DataType::UInt32,
-                                                         m_tagNumWGs,
+                                                         mTagNumWGs,
                                                          ArgumentType::Value,
                                                          DataDirection::ReadOnly,
                                                          rocRoller::NUMWGS);
         }
 
-        m_tagScratch = m_command->allocateTag();
+        mTagScratch = m_command->allocateTag();
         m_command->allocateArgument(VariableType(DataType::UInt32, PointerType::PointerGlobal),
-                                    m_tagScratch,
+                                    mTagScratch,
                                     ArgumentType::Value,
                                     DataDirection::ReadWrite,
                                     rocRoller::SCRATCH);
diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp
index 7485c5cc..95bd6ee5 100644
--- a/test/common/common/CommonGraphs.hpp
+++ b/test/common/common/CommonGraphs.hpp
@@ -178,11 +178,11 @@ namespace rocRollerTest
             GEMMProblem const&   getProblem() const;
             CommandParametersPtr getCommandParameters() const;
 
-            rocRoller::Operations::OperationTag m_tagTensorA, m_tagTensorB, m_tagTensorC,
-                m_tagTensorD, m_tagScalarAlpha, m_tagScalarBeta, m_tagScalarSeed, m_tagScratch;
-            rocRoller::Operations::OperationTag m_tagNumWGs;
+            rocRoller::Operations::OperationTag mTagTensorA, mTagTensorB, mTagTensorC, mTagTensorD,
+                mTagScalarAlpha, mTagScalarBeta, mTagScalarSeed, mTagScratch;
+            rocRoller::Operations::OperationTag mTagNumWGs;
 
-            DataType m_ta, m_tb, m_tc, m_td;
+            DataType mTa, mTb, mTc, mTd;
 
         private:
             void createCommand();

From df6336882776fe3aa3a72049ecf3f5b6555bda23 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi@amd.com>
Date: Fri, 7 Mar 2025 03:40:35 +0000
Subject: [PATCH 10/32] Reorganize the tests to cover multiple sizes and data
 types.

---
 test/catch/AddressCalculationTest.cpp | 220 +++++++++++++-------------
 test/common/common/TestValues.hpp     |  11 ++
 2 files changed, 121 insertions(+), 110 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 355260e6..49aa0fc1 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -3,6 +3,7 @@
 #include "TestContext.hpp"
 #include <common/CommonGraphs.hpp>
 #include <common/GEMMProblem.hpp>
+#include <common/TestValues.hpp>
 #include <common/Utilities.hpp>
 #include <common/WidenTo64bit.hpp>
 #include <rocRoller/CodeGen/ArgumentLoader.hpp>
@@ -361,85 +362,6 @@ namespace AddressCalculationTest
             };
         }
 
-        auto kb_equal(ContextPtr                                      context,
-                      const std::vector<Expression::ExpressionPtr>&   input,
-                      const std::vector<Expression::ExpressionPtr>&   widenedInput,
-                      std::array<Expression::ExpressionPtr, 3> const& workitemCount,
-                      std::array<unsigned int, 3> const&              workgroupSize)
-        {
-            auto allone_uint64
-                = std::make_shared<Expression::Expression>(static_cast<uint64_t>(0xFFFFFFFF));
-
-            return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]()
-                       -> Generator<Instruction> {
-                // store base addr
-                Register::ValuePtr s_ptr;
-
-                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
-
-                Register::ValuePtr s_ptr2;
-
-                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
-
-                // 2-D
-                auto compare_res_pointer
-                    = get64BitVectorOffset(context, workitemCount, workgroupSize);
-                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
-
-                Register::ValuePtr v_offset = nullptr;
-                co_yield Expression::generate(
-                    v_offset, compare_res_pointer + s_ptr->expression(), context);
-
-                // may not be needed.
-                //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug")));
-
-                // boolean_diff was allocated to s[0:1]
-                // diff should be computed per lane,
-                // but is_zero_diff is actually one-bit
-                auto               boolean_true = Register::Value::WavefrontPlaceholder(context);
-                Register::ValuePtr v_allone;
-                co_yield Expression::generate(v_allone, allone_uint64, context);
-
-                co_yield context->copier()->copy(
-                    boolean_true, v_allone, "set to true for all lanes");
-
-                Register::ValuePtr temp_res;
-                for(int i = 0, size = input.size(); i < size; i++)
-                {
-                    // Compute the value 1
-                    Register::ValuePtr v_value_1 = nullptr;
-                    co_yield Expression::generate(v_value_1, input[i], context);
-
-                    // Compute the value 2
-                    Register::ValuePtr v_value_2 = nullptr;
-                    co_yield Expression::generate(v_value_2, widenedInput[i], context);
-
-                    // Compute diff (value_1 - value_2)
-                    auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
-                    {
-                        auto boolType = resultVariableType(is_zero_diff).dataType;
-                        AssertFatal(
-                            boolType == DataType::Bool64, "is_zero type {}", toString(boolType));
-                    }
-
-                    // boolean_true = boolean_true | is_zero_diff
-                    auto accumRes = std::make_shared<Expression::Expression>(
-                        Expression::BitwiseAnd{boolean_true->expression(), is_zero_diff, "accum"});
-                    {
-                        auto accumType = resultVariableType(accumRes).dataType;
-                        AssertFatal(
-                            accumType == DataType::Bool64, "accum type {}", toString(accumType));
-                    }
-                    co_yield Expression::generate(boolean_true, accumRes, context);
-                }
-
-                auto v_value = Register::Value::Placeholder(
-                    context, Register::Type::Vector, DataType::UInt64, 1);
-                co_yield context->copier()->copy(v_value, boolean_true, "Move value");
-                co_yield context->mem()->storeGlobal(v_offset, v_value, 0, 8);
-            };
-        }
-
         // For now, mainly for debugging, directly copy the two results values.
         // workitemCount is passed as argument. Since it is 64-bit
         // different global_store_dwordx2 format should be used.
@@ -792,12 +714,86 @@ namespace AddressCalculationTest
                 Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
             }
 
-            m_context->schedule(kb_equal(
-                m_context,
-                indexExprPtrs,
-                widenedExprPtrs,
-                m_context->kernel()->workitemCount(),
-                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+            auto allone_uint64
+                = std::make_shared<Expression::Expression>(static_cast<uint64_t>(0xFFFFFFFF));
+
+            auto kb = [&]()
+                // return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]()
+                -> Generator<Instruction> {
+                    // store base addr
+                    Register::ValuePtr s_ptr;
+
+                    co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr);
+
+                    Register::ValuePtr s_ptr2;
+
+                    co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2);
+
+                    // 2-D
+                    auto compare_res_pointer = get64BitVectorOffset(
+                        m_context,
+                        m_context->kernel()->workitemCount(),
+                        (m_commandKernel->getCommandParameters()->getManualWorkgroupSize())
+                            .value());
+                    Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                    Register::ValuePtr v_offset = nullptr;
+                    co_yield Expression::generate(
+                        v_offset, compare_res_pointer + s_ptr->expression(), m_context);
+
+                    // may not be needed.
+                    //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug")));
+
+                    // boolean_diff was allocated to s[0:1]
+                    // diff should be computed per lane,
+                    // but is_zero_diff is actually one-bit
+                    auto boolean_true = Register::Value::WavefrontPlaceholder(m_context);
+                    Register::ValuePtr v_allone;
+                    co_yield Expression::generate(v_allone, allone_uint64, m_context);
+
+                    co_yield m_context->copier()->copy(
+                        boolean_true, v_allone, "set to true for all lanes");
+
+                    Register::ValuePtr temp_res;
+                    for(int i = 0, size = indexExprPtrs.size(); i < size; i++)
+                    {
+                        // Compute the value 1
+                        Register::ValuePtr v_value_1 = nullptr;
+                        co_yield Expression::generate(v_value_1, indexExprPtrs[i], m_context);
+
+                        // Compute the value 2
+                        Register::ValuePtr v_value_2 = nullptr;
+                        co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context);
+
+                        // Compute diff (value_1 - value_2)
+                        auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
+                        {
+                            auto boolType = resultVariableType(is_zero_diff).dataType;
+                            AssertFatal(boolType == DataType::Bool64,
+                                        "is_zero type {}",
+                                        toString(boolType));
+                        }
+
+                        // boolean_true = boolean_true | is_zero_diff
+                        auto accumRes
+                            = std::make_shared<Expression::Expression>(Expression::BitwiseAnd{
+                                boolean_true->expression(), is_zero_diff, "accum"});
+                        {
+                            auto accumType = resultVariableType(accumRes).dataType;
+                            AssertFatal(accumType == DataType::Bool64,
+                                        "accum type {}",
+                                        toString(accumType));
+                        }
+                        co_yield Expression::generate(boolean_true, accumRes, m_context);
+                    }
+
+                    auto v_value = Register::Value::Placeholder(
+                        m_context, Register::Type::Vector, DataType::UInt64, 1);
+                    co_yield m_context->copier()->copy(v_value, boolean_true, "Move value");
+                    co_yield m_context->mem()->storeGlobal(v_offset, v_value, 0, 8);
+                };
+
+            m_context->schedule(kb());
 
             auto k = m_context->kernel();
             m_context->schedule(k->postamble());
@@ -834,33 +830,37 @@ namespace AddressCalculationTest
         std::vector<uint64_t>                    m_hostBuffer2;
     };
 
-    TEST_CASE("address calculation test", "[expression][gpu][equal][float][128_128]")
-    {
-        auto context = TestContext::ForTestDevice();
-
-        GEMMProblem                 problem{.m = 128, .n = 128};
-        rocRollerTest::Graphs::GEMM gemm(DataType::Float);
-        gemm.setProblem(problem);
-
-        AddressCalculationTest kernel(context.get(), problem, gemm);
-        kernel.test_equal();
-    }
-
-    TEST_CASE("address calculation test", "[expression][gpu][equal][float][512_512]")
+    TEST_CASE("address calculation test generate and run", "[expression][gpu]")
     {
-        auto context = TestContext::ForTestDevice();
+        // Single here means applied to all three A, B, C matrices.
+        // TODO: Add more dataTypes
+        std::vector<DataType> singleDataTypes = {DataType::Float};
 
-        GEMMProblem                 problem{.m = 512, .n = 512};
-        rocRollerTest::Graphs::GEMM gemm(DataType::Float);
-        gemm.setProblem(problem);
-
-        AddressCalculationTest kernel(context.get(), problem, gemm);
-        kernel.test_equal();
+        for(auto dataType : singleDataTypes)
+        {
+            for(auto [m, n, macM, macN] : TestValues::gemmProblemSizes)
+            {
+                // Come up with a string from problem_size and data type, to be given to ForTestDevice();
+                auto suffixForKernelName = std::to_string(m) + "x" + std::to_string(n) + "_"
+                                           + std::to_string(macM) + "_" + std::to_string(macN);
+                auto context = TestContext::ForTestDevice({}, suffixForKernelName);
+
+                GEMMProblem                 problem{.m = m, .n = n, .macM = macM, .macN = macN};
+                rocRollerTest::Graphs::GEMM gemm(dataType);
+                gemm.setProblem(problem);
+                CAPTURE(dataType, m, n, macM, macN);
+
+                AddressCalculationTest kernel(context.get(), problem, gemm);
+                // Generate a kernel for testing address calculation and run.
+                // Verification of the result is done.
+                kernel.test_equal();
+            }
+        }
     }
 
-    TEST_CASE("address calculation test", "[expression][gpu][equal_one][float][128_128]")
+    TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]")
     {
-        auto context = TestContext::ForTestDevice({}, "equal_one_128x128");
+        auto context = TestContext::ForTestDevice({}, "128x128_one_pair");
 
         GEMMProblem                 problem{.m = 128, .n = 128};
         rocRollerTest::Graphs::GEMM gemm(DataType::Float);
@@ -870,7 +870,7 @@ namespace AddressCalculationTest
         kernel.test_equal_one();
     }
 
-    TEST_CASE("address calculation test", "[expression][gpu][implicit_workitemcount]")
+    TEST_CASE("address calculation test implicit workitemcount", "[expression][gpu]")
     {
         auto context = TestContext::ForTestDevice({}, "impl_workitemcnt");
 
@@ -882,9 +882,9 @@ namespace AddressCalculationTest
         kernel.test_implicit_workitemcount();
     }
 
-    TEST_CASE("address calculation test", "[expression][gpu][sanity_indices]")
+    TEST_CASE("address calculation test sanity check", "[expression][gpu]")
     {
-        auto context = TestContext::ForTestDevice({}, "sanity_indices");
+        auto context = TestContext::ForTestDevice({}, "128x128_sanity_indices");
 
         GEMMProblem                 problem{.m = 128, .n = 128};
         rocRollerTest::Graphs::GEMM gemm(DataType::Float);
diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp
index 2b123fca..1d0bf507 100644
--- a/test/common/common/TestValues.hpp
+++ b/test/common/common/TestValues.hpp
@@ -112,6 +112,17 @@ namespace TestValues
                                                12981.0,
                                                42e5};
 
+    // Portions of GEMMProblem
+    struct gemmProblemSize
+    {
+        int m;
+        int n;
+        int macM;
+        int macN;
+    };
+
+    inline std::vector<gemmProblemSize> gemmProblemSizes = {{128, 128, 64, 64}, {512, 512, 64, 64}};
+
     template <typename T>
     struct ByType
     {

From d3cec12605c1774774b2ca3a683886a7c79429be Mon Sep 17 00:00:00 2001
From: "yoonchoi@amd.com" <yoonchoi@amd.com>
Date: Mon, 10 Mar 2025 19:52:39 +0000
Subject: [PATCH 11/32] Using GEMM(DataType ..) in place of GEMM<typedef Type>

---
 test/catch/KernelGraphRemoveDuplicatesTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
index 9bb470c5..df871c0b 100644
--- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp
+++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
@@ -42,7 +42,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]")
     using namespace rocRoller::KernelGraph::ControlGraph;
 
     auto ctx     = TestContext::ForDefaultTarget().get();
-    auto example = rocRollerTest::Graphs::GEMM<float>();
+    auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
     example.setTileSize(128, 128, 32);
     example.setMFMA(32, 32, 16, 1);

From efa60dbceb89bce17a741624035c00ca0d8697a2 Mon Sep 17 00:00:00 2001
From: Lauren Wrubleski <Lauren.Wrubleski@amd.com>
Date: Mon, 10 Mar 2025 20:43:49 +0000
Subject: [PATCH 12/32] Allow configuration of mxDataGenerator source

---
 CMakeLists.txt           | 3 +++
 cmake/Dependencies.cmake | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c539139..066cff06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -121,6 +121,9 @@ option(BUILD_TESTING "Build rocRoller test clients" ON)
 option(BUILD_DOCS "Build rocRoller documentation" ON)
 option(BUILD_VERBOSE "Output additional build information" OFF)
 
+option(MXDATAGENERATOR_SSH "Fetch mxDataGenerator via SSH" OFF)
+set(MXDATAGENERATOR_GIT_URL "github.com" CACHE STRING "Base Git URL to fetch mxDataGenerator from.")
+
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index c47ee114..66d1c8dc 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -441,6 +441,11 @@ endfunction()
 
 function(_fetch_mxDataGenerator VERSION HASH)
     _determine_git_tag(v main)
+    if(MXDATAGENERATOR_SSH)
+        set(mxDataGenerator_url "git@${MXDATAGENERATOR_GIT_URL}:ROCm/mxDataGenerator.git")
+    else()
+        set(mxDataGenerator_url "https://${MXDATAGENERATOR_GIT_URL}/ROCm/mxDataGenerator.git")
+    endif()
     FetchContent_Declare(
         mxDataGenerator
         GIT_REPOSITORY git@github.com:ROCm/mxDataGenerator.git

From 729a26622c610730e867d0a87893e0f2475855d2 Mon Sep 17 00:00:00 2001
From: Lauren Wrubleski <Lauren.Wrubleski@amd.com>
Date: Tue, 11 Mar 2025 19:15:08 +0000
Subject: [PATCH 13/32] actually use computed mxDataGenerator URL

---
 cmake/Dependencies.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 66d1c8dc..1b161f86 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -448,7 +448,7 @@ function(_fetch_mxDataGenerator VERSION HASH)
     endif()
     FetchContent_Declare(
         mxDataGenerator
-        GIT_REPOSITORY git@github.com:ROCm/mxDataGenerator.git
+        GIT_REPOSITORY ${mxDataGenerator_url}
         GIT_TAG ${GIT_TAG}
     )
     FetchContent_MakeAvailable(mxDataGenerator)

From efba1dcd0a33555959f9ffabfa970d5667a893da Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Tue, 11 Mar 2025 21:45:39 -0400
Subject: [PATCH 14/32] LWP100-1465 Common GEMM graph: replacing templatized
 dataType with member variable

---
 test/catch/CommandTest.cpp                    |   8 +-
 test/catch/IdentifyParallelDimensionsTest.cpp |   4 +-
 .../catch/KernelGraphRemoveDuplicatesTest.cpp |  12 +-
 test/common/CommonGraphs.cpp                  | 239 ++++++++++++++++++
 test/common/common/CommonGraphs.hpp           |  24 +-
 test/common/common/CommonGraphs_impl.hpp      | 126 ---------
 test/unit/KernelGraphTest.cpp                 |  16 +-
 .../unit/KernelGraphTest/UpdateParameters.cpp |  12 +-
 8 files changed, 286 insertions(+), 155 deletions(-)

diff --git a/test/catch/CommandTest.cpp b/test/catch/CommandTest.cpp
index 85bce25a..2bfd8918 100644
--- a/test/catch/CommandTest.cpp
+++ b/test/catch/CommandTest.cpp
@@ -38,9 +38,9 @@ namespace CommandTest
     {
         SECTION("GEMM/TileAdd")
         {
-            auto example1 = *rocRollerTest::Graphs::GEMM<float>().getCommand();
-            auto example2 = *rocRollerTest::Graphs::GEMM<float>().getCommand();
-            auto example3 = *rocRollerTest::Graphs::GEMM<Half>().getCommand();
+            auto example1 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand();
+            auto example2 = *rocRollerTest::Graphs::GEMM(DataType::Float).getCommand();
+            auto example3 = *rocRollerTest::Graphs::GEMM(DataType::Half).getCommand();
             auto example4 = *rocRollerTest::Graphs::TileDoubleAdd<Half>().getCommand();
 
             CHECK(example1 == example2);
@@ -58,7 +58,7 @@ namespace CommandTest
     {
         SECTION("GEMM")
         {
-            auto example = rocRollerTest::Graphs::GEMM<float>();
+            auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
             auto command0 = example.getCommand();
             auto yaml     = Command::toYAML(*command0);
diff --git a/test/catch/IdentifyParallelDimensionsTest.cpp b/test/catch/IdentifyParallelDimensionsTest.cpp
index 7f46a0f1..62973404 100644
--- a/test/catch/IdentifyParallelDimensionsTest.cpp
+++ b/test/catch/IdentifyParallelDimensionsTest.cpp
@@ -74,7 +74,7 @@ TEST_CASE("identifyParallelDimensionSets works for GEMM", "[kernel-graph]")
     using namespace rocRoller;
     auto ctx = TestContext::ForDefaultTarget();
 
-    auto example = rocRollerTest::Graphs::GEMM<float>();
+    auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
     auto kgraph = KernelGraph::translate(example.getCommand());
 
@@ -175,7 +175,7 @@ SCENARIO("IdentifyParallelDimensions transformation works for GEMM", "[kernel-gr
     using namespace rocRoller;
     auto ctx = TestContext::ForDefaultTarget();
 
-    auto example = rocRollerTest::Graphs::GEMM<float>();
+    auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
     GIVEN("The initial kernel graph for a GEMM")
     {
diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
index 9bb470c5..01812d2a 100644
--- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp
+++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
@@ -42,7 +42,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]")
     using namespace rocRoller::KernelGraph::ControlGraph;
 
     auto ctx     = TestContext::ForDefaultTarget().get();
-    auto example = rocRollerTest::Graphs::GEMM<float>();
+    auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
     example.setTileSize(128, 128, 32);
     example.setMFMA(32, 32, 16, 1);
@@ -72,16 +72,16 @@ TEST_CASE("Remove duplicates", "[kernel-graph]")
 
     // LoadTiled: A A, B B, C C
     // After removing 2x1 jamming: A, B, C C
-    CHECK(graph0.control.getElements<LoadTiled>().to<std::vector>().size() == 6);
-    CHECK(graph1.control.getElements<LoadTiled>().to<std::vector>().size() == 4);
+    CHECK(graph0.control.getElements<LoadTiled>().to<std::vector>().size() == 3);
+    CHECK(graph1.control.getElements<LoadTiled>().to<std::vector>().size() == 3);
 
     // StoreLDSTile: A A, B B
     // After removing 2x1 jamming: A, B
-    CHECK(graph0.control.getElements<StoreLDSTile>().to<std::vector>().size() == 4);
+    CHECK(graph0.control.getElements<StoreLDSTile>().to<std::vector>().size() == 2);
     CHECK(graph1.control.getElements<StoreLDSTile>().to<std::vector>().size() == 2);
 
     // LoadLDSTile: A A A A, B B B B
     // After removing 2x1 jamming: A A A A, B B
-    CHECK(graph0.control.getElements<LoadLDSTile>().to<std::vector>().size() == 8);
-    CHECK(graph1.control.getElements<LoadLDSTile>().to<std::vector>().size() == 6);
+    CHECK(graph0.control.getElements<LoadLDSTile>().to<std::vector>().size() == 4);
+    CHECK(graph1.control.getElements<LoadLDSTile>().to<std::vector>().size() == 4);
 }
diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp
index c0271b56..5e1b4adf 100644
--- a/test/common/CommonGraphs.cpp
+++ b/test/common/CommonGraphs.cpp
@@ -193,4 +193,243 @@ namespace rocRollerTest::Graphs
         return params;
     }
 
+    GEMM::GEMM(DataType ta)
+        : GEMM(ta, ta)
+    {
+    }
+    GEMM::GEMM(DataType ta, DataType tb)
+        : GEMM(ta, tb, tb)
+    {
+    }
+    GEMM::GEMM(DataType ta, DataType tb, DataType tc)
+        : GEMM(ta, tb, tc, tc)
+    {
+    }
+    GEMM::GEMM(DataType ta, DataType tb, DataType tc, DataType td)
+        : mTa(ta)
+        , mTb(tb)
+        , mTc(tc)
+        , mTd(td)
+    {
+    }
+
+    void GEMM::createCommand()
+    {
+        m_command = std::make_shared<rocRoller::Command>();
+
+        std::vector<size_t> oneStridesN
+            = m_problem.literalStrides ? std::vector<size_t>({(size_t)1}) : std::vector<size_t>({});
+
+        std::vector<size_t> oneStridesT = m_problem.literalStrides
+                                              ? std::vector<size_t>({(size_t)0, (size_t)1})
+                                              : std::vector<size_t>({});
+
+        mTagTensorA = m_command->addOperation(rocRoller::Operations::Tensor(
+            2, mTa, m_problem.transA == "N" ? oneStridesN : oneStridesT)); // A
+
+        m_tagA = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorA));
+
+        mTagTensorB = m_command->addOperation(rocRoller::Operations::Tensor(
+            2, mTb, m_problem.transB == "N" ? oneStridesN : oneStridesT)); // B
+        m_tagB      = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorB));
+
+        mTagTensorC
+            = m_command->addOperation(rocRoller::Operations::Tensor(2, mTc, oneStridesN)); // C
+        m_tagC = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(mTagTensorC));
+
+        mTagScalarAlpha
+            = m_command->addOperation(rocRoller::Operations::Scalar(DataType::Float)); // alpha
+        auto tagLoadAlpha
+            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarAlpha));
+
+        mTagScalarBeta = m_command->addOperation(rocRoller::Operations::Scalar(mTc)); // beta
+        auto tagLoadBeta
+            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(mTagScalarBeta)); // beta
+
+        auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B
+
+        rocRoller::Operations::T_Execute execute(m_command->getNextTag());
+
+        auto tagBetaC
+            = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadBeta, m_tagC)); // beta * C
+
+        auto tagAlphaAB
+            = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadAlpha, tagAB)); // alpha * (A * B)
+
+        if(m_problem.betaInFma)
+        {
+            m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagBetaC, tagAlphaAB));
+            // alpha * (A * B) + beta * C
+        }
+        else
+        {
+            m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagAlphaAB, tagBetaC));
+            // alpha * (A * B) + beta * C
+        }
+        m_command->addOperation(std::move(execute));
+
+        mTagTensorD
+            = m_command->addOperation(rocRoller::Operations::Tensor(2, mTd, oneStridesN)); // D
+        m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, mTagTensorD)); // D
+
+        if(m_problem.streamK)
+        {
+            mTagNumWGs     = m_command->allocateTag();
+            auto numWGsArg = m_command->allocateArgument(DataType::UInt32,
+                                                         mTagNumWGs,
+                                                         ArgumentType::Value,
+                                                         DataDirection::ReadOnly,
+                                                         rocRoller::NUMWGS);
+        }
+
+        mTagScratch = m_command->allocateTag();
+        m_command->allocateArgument(VariableType(DataType::UInt32, PointerType::PointerGlobal),
+                                    mTagScratch,
+                                    ArgumentType::Value,
+                                    DataDirection::ReadWrite,
+                                    rocRoller::SCRATCH);
+    }
+
+    CommandPtr GEMM::getCommand()
+    {
+        if(!m_command)
+            createCommand();
+
+        return m_command;
+    }
+
+    KernelGraph GEMM::getKernelGraph()
+    {
+        return rocRoller::KernelGraph::translate(getCommand());
+    }
+
+    void GEMM::setTileSize(int m, int n, int k)
+    {
+        m_problem.macM = m;
+        m_problem.macN = n;
+        m_problem.macK = k;
+    }
+
+    void GEMM::setMFMA(int m, int n, int k, int b)
+    {
+        m_problem.waveM = m;
+        m_problem.waveN = n;
+        m_problem.waveK = k;
+        m_problem.waveB = b;
+    }
+
+    void GEMM::setUseLDS(bool a, bool b, bool d)
+    {
+        m_problem.loadLDSA  = a;
+        m_problem.loadLDSB  = b;
+        m_problem.storeLDSD = d;
+    }
+
+    void GEMM::setPrefetch(bool prefetch,
+                           int  prefetchInFlight,
+                           int  prefetchLDSFactor,
+                           bool prefetchMixMemOps)
+    {
+        m_problem.prefetch          = prefetch;
+        m_problem.prefetchInFlight  = prefetchInFlight;
+        m_problem.prefetchLDSFactor = prefetchLDSFactor;
+        m_problem.prefetchMixMemOps = prefetchMixMemOps;
+
+        m_problem.unrollK = prefetchInFlight;
+    }
+
+    void GEMM::setProblem(GEMMProblem const& problem)
+    {
+        m_problem = problem;
+    }
+
+    GEMMProblem const& GEMM::getProblem() const
+    {
+        return m_problem;
+    }
+
+    CommandParametersPtr GEMM::getCommandParameters() const
+    {
+        using namespace rocRoller::KernelGraph::CoordinateGraph;
+
+        auto params = std::make_shared<CommandParameters>();
+
+        params->setManualKernelDimension(2);
+
+        AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0,
+                    "Workgroup Size X must be multiply of wave front size");
+
+        uint wavetilePerWavefrontM
+            = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX;
+        uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY;
+
+        AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0,
+                    "WaveTile size mismatch (M)");
+        AssertFatal(m_problem.macN % (m_problem.waveN * wavetilePerWavefrontN) == 0,
+                    "WaveTile size mismatch (N)");
+
+        uint workgroupSizeX = m_problem.workgroupSizeX * m_problem.workgroupSizeY;
+        uint workgroupSizeY = 1;
+        params->setManualWorkgroupSize({workgroupSizeX, workgroupSizeY, 1});
+
+        auto macTileA
+            = MacroTile({m_problem.macM, m_problem.macK},
+                        LayoutType::MATRIX_A,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB},
+                        m_problem.loadLDSA ? MemoryType::LDS : MemoryType::WAVE);
+        auto macTileB
+            = MacroTile({m_problem.macK, m_problem.macN},
+                        LayoutType::MATRIX_B,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB},
+                        m_problem.loadLDSB ? MemoryType::LDS : MemoryType::WAVE);
+        auto macTileC
+            = MacroTile({m_problem.macM, m_problem.macN},
+                        LayoutType::MATRIX_ACCUMULATOR,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB});
+        auto macTileD
+            = MacroTile({m_problem.macM, m_problem.macN},
+                        LayoutType::MATRIX_ACCUMULATOR,
+                        {m_problem.waveM, m_problem.waveN, m_problem.waveK, m_problem.waveB},
+                        m_problem.storeLDSD ? MemoryType::LDS : MemoryType::WAVE);
+
+        params->setDimensionInfo(m_tagA, macTileA);
+        params->setDimensionInfo(m_tagB, macTileB);
+        params->setDimensionInfo(m_tagC, macTileC);
+        params->setDimensionInfo(m_tagD, macTileD);
+
+        // uint jammedM
+        //     = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX;
+        // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY;
+
+        Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1);
+        // Log::debug("GEMM jamming {} {}", jammedM, jammedN);
+        // params->setWaveTilesPerWavefront(jammedM, jammedN);
+
+        params->setManualWavefrontCount(
+            {static_cast<uint>(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM),
+             static_cast<uint>(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)});
+
+        params->fuseLoops                     = m_problem.fuseLoops;
+        params->tailLoops                     = m_problem.tailLoops;
+        params->allowAmbiguousMemoryNodes     = m_problem.allowAmbiguousMemoryNodes;
+        params->unrollK                       = m_problem.unrollK;
+        params->packMultipleElementsInto1VGPR = m_problem.packMultipleElementsInto1VGPR;
+        params->prefetch                      = m_problem.prefetch;
+        params->prefetchInFlight              = m_problem.prefetchInFlight;
+        params->prefetchLDSFactor             = m_problem.prefetchLDSFactor;
+        params->prefetchMixMemOps             = m_problem.prefetchMixMemOps;
+        params->transposeMemoryAccess[LayoutType::MATRIX_A] = m_problem.transA == "T";
+        params->transposeMemoryAccess[LayoutType::MATRIX_B] = m_problem.transB == "T";
+        params->transposeMemoryAccess[LayoutType::None]     = true;
+
+        if(m_problem.streamK)
+        {
+            params->loopOverOutputTilesDimensions = {0, 1};
+            params->streamK                       = true;
+            params->streamKTwoTile                = m_problem.streamKTwoTile;
+        }
+
+        return params;
+    }
+
 }
diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp
index 4ee7a3a4..03e902c8 100644
--- a/test/common/common/CommonGraphs.hpp
+++ b/test/common/common/CommonGraphs.hpp
@@ -40,6 +40,8 @@
 #include <rocRoller/Operations/Command_fwd.hpp>
 #include <rocRoller/Operations/OperationTag.hpp>
 
+#include <common/GEMMProblem.hpp>
+
 namespace rocRollerTest
 {
     namespace Graphs
@@ -52,6 +54,7 @@ namespace rocRollerTest
         using ContextPtr                 = rocRoller::ContextPtr;
         using KernelArguments            = rocRoller::KernelArguments;
         using KernelGraph                = rocRoller::KernelGraph::KernelGraph;
+        using DataType                   = rocRoller::DataType;
 
         /**
          * @brief Graph for linear: alpha x + beta y.
@@ -178,11 +181,13 @@ namespace rocRollerTest
          * - Assign(D = alpha * AB + beta * C)
          * - StoreTiled(D)
          */
-        template <typename T>
         class GEMM
         {
         public:
-            GEMM();
+            GEMM(DataType ta);
+            GEMM(DataType ta, DataType tb);
+            GEMM(DataType ta, DataType tb, DataType tc);
+            GEMM(DataType ta, DataType tb, DataType tc, DataType td);
 
             CommandPtr  getCommand();
             KernelGraph getKernelGraph();
@@ -190,9 +195,21 @@ namespace rocRollerTest
             void setTileSize(int m, int n, int k);
             void setMFMA(int m, int n, int k, int b);
             void setUseLDS(bool a, bool b, bool d);
+            void setPrefetch(bool prefetch,
+                             int  prefetchInFlight,
+                             int  prefetchLDSFactor,
+                             bool prefetchMixMemOps);
+            void setProblem(GEMMProblem const& problem);
 
+            GEMMProblem const&   getProblem() const;
             CommandParametersPtr getCommandParameters() const;
 
+            rocRoller::Operations::OperationTag mTagTensorA, mTagTensorB, mTagTensorC, mTagTensorD,
+                mTagScalarAlpha, mTagScalarBeta, mTagScalarSeed, mTagScratch;
+            rocRoller::Operations::OperationTag mTagNumWGs;
+
+            DataType mTa, mTb, mTc, mTd;
+
         private:
             void createCommand();
 
@@ -202,7 +219,8 @@ namespace rocRollerTest
 
             rocRoller::Operations::OperationTag m_tagA, m_tagB, m_tagC, m_tagD;
 
-            CommandPtr m_command;
+            CommandPtr  m_command;
+            GEMMProblem m_problem;
         };
 
         /**
diff --git a/test/common/common/CommonGraphs_impl.hpp b/test/common/common/CommonGraphs_impl.hpp
index ec097436..9a8d6201 100644
--- a/test/common/common/CommonGraphs_impl.hpp
+++ b/test/common/common/CommonGraphs_impl.hpp
@@ -256,132 +256,6 @@ namespace rocRollerTest::Graphs
         return rocRoller::KernelGraph::translate(m_command);
     }
 
-    /*
-     * GEMM
-     */
-
-    template <typename T>
-    GEMM<T>::GEMM()
-    {
-        createCommand();
-    }
-
-    template <typename T>
-    void GEMM<T>::createCommand()
-    {
-        m_command = std::make_shared<rocRoller::Command>();
-
-        auto dataType = TypeInfo<T>::Var.dataType;
-
-        auto tagTensorA = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // A
-        m_tagA          = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(tagTensorA));
-
-        auto tagTensorB = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // B
-        m_tagB          = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(tagTensorB));
-
-        auto tagTensorC = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // C
-        m_tagC          = m_command->addOperation(rocRoller::Operations::T_Load_Tiled(tagTensorC));
-
-        auto tagScalarAlpha
-            = m_command->addOperation(rocRoller::Operations::Scalar(dataType)); // alpha
-        auto tagLoadAlpha
-            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(tagScalarAlpha));
-
-        auto tagScalarBeta
-            = m_command->addOperation(rocRoller::Operations::Scalar(dataType)); // beta
-        auto tagLoadBeta
-            = m_command->addOperation(rocRoller::Operations::T_Load_Scalar(tagScalarBeta)); // beta
-
-        auto tagAB = m_command->addOperation(rocRoller::Operations::T_Mul(m_tagA, m_tagB)); // A * B
-
-        rocRoller::Operations::T_Execute execute(m_command->getNextTag());
-        auto                             tagAlphaAB
-            = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadAlpha, tagAB)); // alpha * (A * B)
-        auto tagBetaC
-            = execute.addXOp(rocRoller::Operations::E_Mul(tagLoadBeta, m_tagC)); // beta * C
-        m_tagD = execute.addXOp(rocRoller::Operations::E_Add(tagAlphaAB, tagBetaC));
-        // alpha * (A * B) + beta * C
-        m_command->addOperation(std::move(execute));
-
-        auto tagTensorD = m_command->addOperation(rocRoller::Operations::Tensor(2, dataType)); // D
-        m_command->addOperation(rocRoller::Operations::T_Store_Tiled(m_tagD, tagTensorD)); // D
-    }
-
-    template <typename T>
-    CommandPtr GEMM<T>::getCommand()
-    {
-        return m_command;
-    }
-
-    template <typename T>
-    KernelGraph GEMM<T>::getKernelGraph()
-    {
-        return rocRoller::KernelGraph::translate(m_command);
-    }
-
-    template <typename T>
-    void GEMM<T>::setTileSize(int m, int n, int k)
-    {
-        m_macM = m;
-        m_macN = n;
-        m_macK = k;
-    }
-
-    template <typename T>
-    void GEMM<T>::setMFMA(int m, int n, int k, int b)
-    {
-        m_waveM = m;
-        m_waveN = n;
-        m_waveK = k;
-        m_waveB = b;
-    }
-
-    template <typename T>
-    void GEMM<T>::setUseLDS(bool a, bool b, bool d)
-    {
-        m_useLDSA = a;
-        m_useLDSB = b;
-        m_useLDSD = d;
-    }
-
-    template <typename T>
-    CommandParametersPtr GEMM<T>::getCommandParameters() const
-    {
-        using namespace rocRoller::KernelGraph::CoordinateGraph;
-
-        auto params = std::make_shared<CommandParameters>();
-
-        auto macTileA = MacroTile({m_macM, m_macK},
-                                  LayoutType::MATRIX_A,
-                                  {m_waveM, m_waveN, m_waveK, m_waveB},
-                                  m_useLDSA ? MemoryType::LDS : MemoryType::WAVE);
-        auto macTileB = MacroTile({m_macK, m_macN},
-                                  LayoutType::MATRIX_B,
-                                  {m_waveM, m_waveN, m_waveK, m_waveB},
-                                  m_useLDSB ? MemoryType::LDS : MemoryType::WAVE);
-        auto macTileC = MacroTile(
-            {m_macM, m_macN}, LayoutType::MATRIX_ACCUMULATOR, {m_waveM, m_waveN, m_waveK, m_waveB});
-
-        params->setDimensionInfo(m_tagA, macTileA);
-        params->setDimensionInfo(m_tagB, macTileB);
-        params->setDimensionInfo(m_tagC, macTileC);
-
-        // Workgroup size
-        uint wavefrontSize  = 64;
-        uint workgroupSizeX = 2 * wavefrontSize;
-        uint workgroupSizeY = 4;
-
-        uint jammedM = wavefrontSize * m_macM / m_waveM / workgroupSizeX;
-        uint jammedN = m_macN / m_waveN / workgroupSizeY;
-
-        Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1);
-        Log::debug("GEMM jamming {} {}", jammedM, jammedN);
-
-        params->setWaveTilesPerWavefront(jammedM, jammedN);
-
-        return params;
-    }
-
     /*
      * TileDoubleAdd
      */
diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp
index 8533b907..edfd3e01 100644
--- a/test/unit/KernelGraphTest.cpp
+++ b/test/unit/KernelGraphTest.cpp
@@ -1085,7 +1085,7 @@ namespace KernelGraphTest
 
     TEST_F(KernelGraphTest, LowerTensor)
     {
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         int macK  = 16;
         int waveK = 8;
@@ -1135,7 +1135,7 @@ namespace KernelGraphTest
 
         // Verify that loops have been unrolled
         auto unrolledForLoops = kgraphUnrolled.control.getNodes<ForLoopOp>().to<std::vector>();
-        EXPECT_EQ(unrolledForLoops.size(), 10); // main: X (Y (K K)) (Y (K K)); epilogue:  X (Y Y)
+        EXPECT_EQ(unrolledForLoops.size(), 5); // main: X (Y (K K)) (Y (K K)); epilogue:  X (Y Y)
 
         auto kgraphFused = kgraphUnrolled.transform(fuseLoopsTransform);
         kgraphFused      = kgraphFused.transform(removeDuplicatesTransform);
@@ -1145,7 +1145,7 @@ namespace KernelGraphTest
         EXPECT_EQ(fusedForLoops.size(), 5);
 
         auto fusedLoads = kgraphFused.control.getNodes<LoadTiled>().to<std::vector>();
-        EXPECT_EQ(fusedLoads.size(), 9); // 1 for A, 4 for B, 4 for C
+        EXPECT_EQ(fusedLoads.size(), 4); // 1 for A, 4 for B, 4 for C
 
         // Verify that single iteration loops have been removed.
         auto kgraphClean     = kgraphFused.transform(cleanLoopsTransform);
@@ -1154,13 +1154,13 @@ namespace KernelGraphTest
 
         // Verify that there is only a single StoreLDSTile node per K loop
         auto unrolledStoreLDS = kgraphUnrolled.control.getNodes<StoreLDSTile>().to<std::vector>();
-        EXPECT_EQ(unrolledStoreLDS.size(), 4);
+        EXPECT_EQ(unrolledStoreLDS.size(), 1);
 
         // Verify number of ComputeIndexes: A loads; A LDS loads; B loads; C load; D
         // store: 3 + (2+2) + 3 + 3 + 3 = 12
         kgraph1             = kgraph1.transform(addComputeIndexTransform);
         auto computeIndexes = kgraph1.control.getNodes<ComputeIndex>().to<std::vector>();
-        EXPECT_EQ(computeIndexes.size(), 16);
+        EXPECT_EQ(computeIndexes.size(), 15);
 
         // Verify number of Deallocates
         auto addDeallocate  = std::make_shared<AddDeallocate>();
@@ -1169,7 +1169,7 @@ namespace KernelGraphTest
         EXPECT_EQ(addDeallocates.size(), 16);
 
         auto storeLDS = kgraphUnrolled.control.getNodes<StoreLDSTile>().to<std::vector>();
-        EXPECT_EQ(storeLDS.size(), 4);
+        EXPECT_EQ(storeLDS.size(), 1);
 
         auto fusedStoreLDS = kgraphFused.control.getNodes<StoreLDSTile>().to<std::vector>();
         EXPECT_EQ(fusedStoreLDS.size(), 1);
@@ -1177,7 +1177,7 @@ namespace KernelGraphTest
 
     TEST_F(KernelGraphTest, InlineIncrement)
     {
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         example.setTileSize(128, 256, 8);
         example.setMFMA(32, 32, 2, 1);
@@ -2717,7 +2717,7 @@ namespace KernelGraphTest
     {
         using GD = Graph::Direction;
 
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         example.setTileSize(128, 256, 8);
         example.setMFMA(32, 32, 2, 1);
diff --git a/test/unit/KernelGraphTest/UpdateParameters.cpp b/test/unit/KernelGraphTest/UpdateParameters.cpp
index dd101c5c..e72f9960 100644
--- a/test/unit/KernelGraphTest/UpdateParameters.cpp
+++ b/test/unit/KernelGraphTest/UpdateParameters.cpp
@@ -88,7 +88,7 @@ namespace KernelGraphTest
     {
         using namespace rocRoller::KernelGraph;
 
-        auto example = rocRollerTest::Graphs::GEMM<float>();
+        auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
         int macK  = 16;
         int waveK = 8;
@@ -118,16 +118,16 @@ namespace KernelGraphTest
         // Now apply SetWorkitemCount and try again
         kgraph = kgraph.transform(std::make_shared<SetWorkitemCount>(m_context));
 
-        CommandArgumentPtr tensorDsizeX;
+        CommandArgumentPtr tensorAsizeX;
         {
             auto arguments = command->getArguments();
             for(auto argument : arguments)
             {
-                if(argument->name() == "Tensor_4_size_0")
-                    tensorDsizeX = argument;
+                if(argument->name() == "Tensor_0_size_0")
+                    tensorAsizeX = argument;
             }
         }
-        ASSERT_NE(tensorDsizeX, nullptr) << "D size not found";
+        ASSERT_NE(tensorAsizeX, nullptr) << "A size not found";
 
         workitemCount = m_context->kernel()->workitemCount();
 
@@ -135,7 +135,7 @@ namespace KernelGraphTest
         auto workgroupSizeX = Expression::literal(128u);
 
         auto expected
-            = (((tensorDsizeX->expression() + workgroupSizeX) - one) / workgroupSizeX) * one;
+            = (((tensorAsizeX->expression() + workgroupSizeX) - one) / workgroupSizeX) * one;
 
         EXPECT_TRUE(Expression::identical(expected, workitemCount[0]));
     }

From fada98a070ec03799500b26c2c95d0c59daee827 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Wed, 12 Mar 2025 21:50:38 +0000
Subject: [PATCH 15/32] Adding jamm factors to CommonGraph

---
 .../catch/KernelGraphRemoveDuplicatesTest.cpp |  2 +-
 test/common/CommonGraphs.cpp                  | 22 ++++++++++++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
index df871c0b..595793f0 100644
--- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp
+++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
@@ -44,7 +44,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]")
     auto ctx     = TestContext::ForDefaultTarget().get();
     auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
-    example.setTileSize(128, 128, 32);
+    example.setTileSize(128, 64, 32);
     example.setMFMA(32, 32, 16, 1);
     example.setUseLDS(true, true, false);
 
diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp
index 5e1b4adf..248d98d9 100644
--- a/test/common/CommonGraphs.cpp
+++ b/test/common/CommonGraphs.cpp
@@ -359,9 +359,16 @@ namespace rocRollerTest::Graphs
         AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0,
                     "Workgroup Size X must be multiply of wave front size");
 
+        // i.e. jammedM
         uint wavetilePerWavefrontM
             = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX;
+        AssertFatal(wavetilePerWavefrontM != 0,
+                    "Wavetiles per wavefront in M should be positive integer");
+
+        // i.e. jammedN
         uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY;
+        AssertFatal(wavetilePerWavefrontN != 0,
+                    "Wavetiles per wavefront in N should be positive integer");
 
         AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0,
                     "WaveTile size mismatch (M)");
@@ -397,14 +404,19 @@ namespace rocRollerTest::Graphs
         params->setDimensionInfo(m_tagC, macTileC);
         params->setDimensionInfo(m_tagD, macTileD);
 
-        // uint jammedM
-        //     = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX;
-        // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY;
+#if 0
+        uint jammedM
+             = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX;
+        uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY;
 
         Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1);
-        // Log::debug("GEMM jamming {} {}", jammedM, jammedN);
-        // params->setWaveTilesPerWavefront(jammedM, jammedN);
+        Log::debug("GEMM jamming {} {}", jammedM, jammedN);
+        params->setWaveTilesPerWavefront(jammedM, jammedN);
+#endif
+        Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1);
+        Log::debug("GEMM jamming {} {}", wavetilePerWavefrontM, wavetilePerWavefrontN);
 
+        params->setWaveTilesPerWavefront(wavetilePerWavefrontM, wavetilePerWavefrontN);
         params->setManualWavefrontCount(
             {static_cast<uint>(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM),
              static_cast<uint>(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)});

From 86d0b729d3b766ba0a236d9d204e6b911b9f0517 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Wed, 12 Mar 2025 22:36:44 +0000
Subject: [PATCH 16/32] Chaniging macro tile size for a unit test to match
 jamming factors after changes in GEMM graph

---
 test/unit/KernelGraphTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp
index d14e95a4..aa31ef94 100644
--- a/test/unit/KernelGraphTest.cpp
+++ b/test/unit/KernelGraphTest.cpp
@@ -1090,7 +1090,7 @@ namespace KernelGraphTest
         int macK  = 16;
         int waveK = 8;
 
-        example.setTileSize(128, 256, macK);
+        example.setTileSize(128, 128, macK);
         example.setMFMA(32, 32, waveK, 1);
         example.setUseLDS(true, false, false);
 

From a6c9f49de30e548cb1d88623af500e7050220591 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Wed, 12 Mar 2025 23:27:03 +0000
Subject: [PATCH 17/32] Restore jamming factors in common GEMM graph and
 restore original tests

---
 .../catch/KernelGraphRemoveDuplicatesTest.cpp | 12 ++++----
 test/common/CommonGraphs.cpp                  | 29 +++++++++++++++----
 test/common/common/CommonGraphs.hpp           |  4 ---
 test/unit/KernelGraphTest.cpp                 | 12 ++++----
 4 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/test/catch/KernelGraphRemoveDuplicatesTest.cpp b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
index 01812d2a..595793f0 100644
--- a/test/catch/KernelGraphRemoveDuplicatesTest.cpp
+++ b/test/catch/KernelGraphRemoveDuplicatesTest.cpp
@@ -44,7 +44,7 @@ TEST_CASE("Remove duplicates", "[kernel-graph]")
     auto ctx     = TestContext::ForDefaultTarget().get();
     auto example = rocRollerTest::Graphs::GEMM(DataType::Float);
 
-    example.setTileSize(128, 128, 32);
+    example.setTileSize(128, 64, 32);
     example.setMFMA(32, 32, 16, 1);
     example.setUseLDS(true, true, false);
 
@@ -72,16 +72,16 @@ TEST_CASE("Remove duplicates", "[kernel-graph]")
 
     // LoadTiled: A A, B B, C C
     // After removing 2x1 jamming: A, B, C C
-    CHECK(graph0.control.getElements<LoadTiled>().to<std::vector>().size() == 3);
-    CHECK(graph1.control.getElements<LoadTiled>().to<std::vector>().size() == 3);
+    CHECK(graph0.control.getElements<LoadTiled>().to<std::vector>().size() == 6);
+    CHECK(graph1.control.getElements<LoadTiled>().to<std::vector>().size() == 4);
 
     // StoreLDSTile: A A, B B
     // After removing 2x1 jamming: A, B
-    CHECK(graph0.control.getElements<StoreLDSTile>().to<std::vector>().size() == 2);
+    CHECK(graph0.control.getElements<StoreLDSTile>().to<std::vector>().size() == 4);
     CHECK(graph1.control.getElements<StoreLDSTile>().to<std::vector>().size() == 2);
 
     // LoadLDSTile: A A A A, B B B B
     // After removing 2x1 jamming: A A A A, B B
-    CHECK(graph0.control.getElements<LoadLDSTile>().to<std::vector>().size() == 4);
-    CHECK(graph1.control.getElements<LoadLDSTile>().to<std::vector>().size() == 4);
+    CHECK(graph0.control.getElements<LoadLDSTile>().to<std::vector>().size() == 8);
+    CHECK(graph1.control.getElements<LoadLDSTile>().to<std::vector>().size() == 6);
 }
diff --git a/test/common/CommonGraphs.cpp b/test/common/CommonGraphs.cpp
index 5e1b4adf..644b62dd 100644
--- a/test/common/CommonGraphs.cpp
+++ b/test/common/CommonGraphs.cpp
@@ -356,12 +356,33 @@ namespace rocRollerTest::Graphs
 
         params->setManualKernelDimension(2);
 
+        AssertFatal(m_problem.m % m_problem.macM == 0,
+                    "MacroTile size mismatch (M)",
+                    ShowValue(m_problem.m),
+                    ShowValue(m_problem.macM));
+        AssertFatal(m_problem.n % m_problem.macN == 0,
+                    "MacroTile size mismatch (N)",
+                    ShowValue(m_problem.n),
+                    ShowValue(m_problem.macN));
+
         AssertFatal(m_problem.workgroupSizeX % m_problem.wavefrontSize == 0,
                     "Workgroup Size X must be multiply of wave front size");
 
+        AssertFatal(m_problem.macM % m_problem.waveM == 0,
+                    "Macrotile size must be a multiple of wavetile size");
+        AssertFatal(m_problem.macN % m_problem.waveN == 0,
+                    "Macrotile size must be a multiple of wavetile size");
+
+        // i.e. jammedM
         uint wavetilePerWavefrontM
             = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / m_problem.workgroupSizeX;
+        AssertFatal(wavetilePerWavefrontM > 0,
+                    "Wavetiles per wavefront in M should be positive integer");
+
+        // i.e. jammedN
         uint wavetilePerWavefrontN = m_problem.macN / m_problem.waveN / m_problem.workgroupSizeY;
+        AssertFatal(wavetilePerWavefrontN > 0,
+                    "Wavetiles per wavefront in N should be positive integer");
 
         AssertFatal(m_problem.macM % (m_problem.waveM * wavetilePerWavefrontM) == 0,
                     "WaveTile size mismatch (M)");
@@ -397,14 +418,10 @@ namespace rocRollerTest::Graphs
         params->setDimensionInfo(m_tagC, macTileC);
         params->setDimensionInfo(m_tagD, macTileD);
 
-        // uint jammedM
-        //     = m_problem.wavefrontSize * m_problem.macM / m_problem.waveM / workgroupSizeX;
-        // uint jammedN = m_problem.macN / m_problem.waveN / workgroupSizeY;
-
         Log::debug("GEMM workgroup sizes {} {} {}", workgroupSizeX, workgroupSizeY, 1);
-        // Log::debug("GEMM jamming {} {}", jammedM, jammedN);
-        // params->setWaveTilesPerWavefront(jammedM, jammedN);
+        Log::debug("GEMM jamming {} {}", wavetilePerWavefrontM, wavetilePerWavefrontN);
 
+        params->setWaveTilesPerWavefront(wavetilePerWavefrontM, wavetilePerWavefrontN);
         params->setManualWavefrontCount(
             {static_cast<uint>(m_problem.macM / m_problem.waveM / wavetilePerWavefrontM),
              static_cast<uint>(m_problem.macN / m_problem.waveN / wavetilePerWavefrontN)});
diff --git a/test/common/common/CommonGraphs.hpp b/test/common/common/CommonGraphs.hpp
index 03e902c8..ddb4b816 100644
--- a/test/common/common/CommonGraphs.hpp
+++ b/test/common/common/CommonGraphs.hpp
@@ -213,10 +213,6 @@ namespace rocRollerTest
         private:
             void createCommand();
 
-            int  m_macM, m_macN, m_macK;
-            int  m_waveM, m_waveN, m_waveK, m_waveB;
-            bool m_useLDSA = false, m_useLDSB = false, m_useLDSD = false;
-
             rocRoller::Operations::OperationTag m_tagA, m_tagB, m_tagC, m_tagD;
 
             CommandPtr  m_command;
diff --git a/test/unit/KernelGraphTest.cpp b/test/unit/KernelGraphTest.cpp
index edfd3e01..aa31ef94 100644
--- a/test/unit/KernelGraphTest.cpp
+++ b/test/unit/KernelGraphTest.cpp
@@ -1090,7 +1090,7 @@ namespace KernelGraphTest
         int macK  = 16;
         int waveK = 8;
 
-        example.setTileSize(128, 256, macK);
+        example.setTileSize(128, 128, macK);
         example.setMFMA(32, 32, waveK, 1);
         example.setUseLDS(true, false, false);
 
@@ -1135,7 +1135,7 @@ namespace KernelGraphTest
 
         // Verify that loops have been unrolled
         auto unrolledForLoops = kgraphUnrolled.control.getNodes<ForLoopOp>().to<std::vector>();
-        EXPECT_EQ(unrolledForLoops.size(), 5); // main: X (Y (K K)) (Y (K K)); epilogue:  X (Y Y)
+        EXPECT_EQ(unrolledForLoops.size(), 10); // main: X (Y (K K)) (Y (K K)); epilogue:  X (Y Y)
 
         auto kgraphFused = kgraphUnrolled.transform(fuseLoopsTransform);
         kgraphFused      = kgraphFused.transform(removeDuplicatesTransform);
@@ -1145,7 +1145,7 @@ namespace KernelGraphTest
         EXPECT_EQ(fusedForLoops.size(), 5);
 
         auto fusedLoads = kgraphFused.control.getNodes<LoadTiled>().to<std::vector>();
-        EXPECT_EQ(fusedLoads.size(), 4); // 1 for A, 4 for B, 4 for C
+        EXPECT_EQ(fusedLoads.size(), 9); // 1 for A, 4 for B, 4 for C
 
         // Verify that single iteration loops have been removed.
         auto kgraphClean     = kgraphFused.transform(cleanLoopsTransform);
@@ -1154,13 +1154,13 @@ namespace KernelGraphTest
 
         // Verify that there is only a single StoreLDSTile node per K loop
         auto unrolledStoreLDS = kgraphUnrolled.control.getNodes<StoreLDSTile>().to<std::vector>();
-        EXPECT_EQ(unrolledStoreLDS.size(), 1);
+        EXPECT_EQ(unrolledStoreLDS.size(), 4);
 
         // Verify number of ComputeIndexes: A loads; A LDS loads; B loads; C load; D
         // store: 3 + (2+2) + 3 + 3 + 3 = 12
         kgraph1             = kgraph1.transform(addComputeIndexTransform);
         auto computeIndexes = kgraph1.control.getNodes<ComputeIndex>().to<std::vector>();
-        EXPECT_EQ(computeIndexes.size(), 15);
+        EXPECT_EQ(computeIndexes.size(), 16);
 
         // Verify number of Deallocates
         auto addDeallocate  = std::make_shared<AddDeallocate>();
@@ -1169,7 +1169,7 @@ namespace KernelGraphTest
         EXPECT_EQ(addDeallocates.size(), 16);
 
         auto storeLDS = kgraphUnrolled.control.getNodes<StoreLDSTile>().to<std::vector>();
-        EXPECT_EQ(storeLDS.size(), 1);
+        EXPECT_EQ(storeLDS.size(), 4);
 
         auto fusedStoreLDS = kgraphFused.control.getNodes<StoreLDSTile>().to<std::vector>();
         EXPECT_EQ(fusedStoreLDS.size(), 1);

From 0f16b1f82f15e6e4a2100a231d87f8cc680a343f Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Thu, 13 Mar 2025 00:33:28 +0000
Subject: [PATCH 18/32] Adding filename in a correct CMakeLists.txt

---
 test/common/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/common/CMakeLists.txt b/test/common/CMakeLists.txt
index 0a2987f6..b25a2828 100644
--- a/test/common/CMakeLists.txt
+++ b/test/common/CMakeLists.txt
@@ -34,6 +34,7 @@ add_library(
         CommonGraphs.cpp
         mxDataGen.cpp
         Utilities.cpp
+        WidenTo64bit.cpp
 )
 
 target_compile_options(

From 6ab41fc0b641363f20fa017c09a2ffacb5054c2a Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Thu, 13 Mar 2025 03:03:49 +0000
Subject: [PATCH 19/32] For modulo/divide the second operand should be a leaf
 value

---
 test/common/WidenTo64bit.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/common/WidenTo64bit.cpp b/test/common/WidenTo64bit.cpp
index 7758628c..dfb21645 100644
--- a/test/common/WidenTo64bit.cpp
+++ b/test/common/WidenTo64bit.cpp
@@ -70,6 +70,7 @@ namespace rocRollerTest
         {
             // TODO: Add check that divisor is a CValue within (u)int32-bit
             Log::debug("Divisor: {} ", toString(expr.rhs));
+	    AssertFatal(isExpectedLeafType(expr.rhs));
 
             Expression::Divide cpy = expr;
             if(expr.lhs)
@@ -82,6 +83,7 @@ namespace rocRollerTest
         {
             // TODO: Add check that divisor is a CValue within (u)int32-bit
             Log::debug("Modulo: {} ", toString(expr.rhs));
+	    AssertFatal(isExpectedLeafType(expr.rhs));
 
             Expression::Modulo cpy = expr;
             if(expr.lhs)
@@ -243,6 +245,14 @@ namespace rocRollerTest
                         "workgroup/item indices ",
                         showValue);
         }
+
+	bool isExpectedLeafType(Expression::ExpressionPtr const& expr) const {
+            return std::holds_alternative<CommandArgumentValue>(*expr) ||
+                   std::holds_alternative<CommandArgumentPtr>(*expr) ||
+                   std::holds_alternative<Register::ValuePtr>(*expr) ||
+                   std::holds_alternative<AssemblyKernelArgumentPtr>(*expr);
+        }
+	
     };
 
     Expression::ExpressionPtr widenTo64bit(Expression::ExpressionPtr expr)

From b827c064e1ae58181e74505eb78f536297596fb7 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Thu, 13 Mar 2025 19:48:19 +0000
Subject: [PATCH 20/32] Adding license

---
 test/catch/AddressCalculationTest.cpp | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 49aa0fc1..4e415853 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -1,3 +1,28 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright 2025 AMD ROCm(TM) Software
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
 
 #include "CustomMatchers.hpp"
 #include "TestContext.hpp"

From bbb69a1cbb0175b2ef43bbe92485a6efae499e63 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Fri, 14 Mar 2025 20:52:48 +0000
Subject: [PATCH 21/32] Introducing GENERATE/DYNAMIC_SECTION and adding passing
 configurations

---
 .../InstructionValues/Register_impl.hpp       |   3 +
 .../FastDivision.cpp                          |   6 +
 test/catch/AddressCalculationTest.cpp         | 117 +++++++++++++-----
 test/common/common/TestValues.hpp             |  17 ++-
 4 files changed, 106 insertions(+), 37 deletions(-)

diff --git a/lib/include/rocRoller/InstructionValues/Register_impl.hpp b/lib/include/rocRoller/InstructionValues/Register_impl.hpp
index 1ecc5697..14b1c5f8 100644
--- a/lib/include/rocRoller/InstructionValues/Register_impl.hpp
+++ b/lib/include/rocRoller/InstructionValues/Register_impl.hpp
@@ -817,6 +817,9 @@ namespace rocRoller
             std::vector<int> coords;
             for(auto i : indices)
             {
+                if (i < 0 || (size_t)i >= m_allocationCoord.size()) {
+                    Log::debug("FA i {}", i);
+                }
                 AssertFatal(i >= 0 && (size_t)i < m_allocationCoord.size(),
                             "Register subset out of bounds.",
                             ShowValue(m_allocationCoord.size()),
diff --git a/lib/source/ExpressionTransformations/FastDivision.cpp b/lib/source/ExpressionTransformations/FastDivision.cpp
index c5af98e4..aeef0631 100644
--- a/lib/source/ExpressionTransformations/FastDivision.cpp
+++ b/lib/source/ExpressionTransformations/FastDivision.cpp
@@ -329,6 +329,9 @@ namespace rocRoller
         {
             unsigned int mask    = rhs - 1u;
             auto         new_rhs = literal(mask);
+#if 1
+            Log::debug("Yoonseo - umodulo - name {} toString {} mask {}", name(new_rhs), toString(new_rhs), mask);
+#endif
             return lhs & new_rhs;
         }
 
@@ -342,6 +345,9 @@ namespace rocRoller
             int          mask               = ~(rhs - 1);
 
             auto maskExpr               = literal(mask);
+#if 1
+            Log::debug("Yoonseo - smodulo - name {} toString {} mask {}", name(maskExpr), toString(maskExpr), mask);
+#endif
             auto signBitsExpr           = literal(signBits);
             auto reverseShiftAmountExpr = literal(reverseShiftAmount);
 
diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 4e415853..b7e57a07 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -25,15 +25,19 @@
  *******************************************************************************/
 
 #include "CustomMatchers.hpp"
+#include "CustomSections.hpp"
 #include "TestContext.hpp"
+
 #include <common/CommonGraphs.hpp>
 #include <common/GEMMProblem.hpp>
 #include <common/TestValues.hpp>
 #include <common/Utilities.hpp>
 #include <common/WidenTo64bit.hpp>
+
 #include <rocRoller/CodeGen/ArgumentLoader.hpp>
 #include <rocRoller/CodeGen/MemoryInstructions.hpp>
 #include <rocRoller/CodeGen/WaitCount.hpp>
+#include <rocRoller/ExpressionTransformations.hpp>
 #include <rocRoller/KernelGraph/CoordinateGraph/Transformer.hpp>
 #include <rocRoller/KernelGraph/KernelGraph.hpp>
 #include <rocRoller/KernelGraph/Utils.hpp>
@@ -423,6 +427,7 @@ namespace AddressCalculationTest
 
                 // Compute the value 2
                 Register::ValuePtr v_value_2 = nullptr;
+
                 co_yield Expression::generate(v_value_2, widenedInput, context);
 
                 co_yield context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8);
@@ -445,7 +450,11 @@ namespace AddressCalculationTest
 
             // Note that actually large matrix hipMalloc is not needed.
             // But at the same time, larger malloc may incurr larger base address, which can lead to overflow.
-            // ?? how to get float from m_dt? gemm.m_ta?
+            //
+            // TODO: How to get "float" from DataType of gemm.tA ? DataType::Float --> "float"
+            //       (An opposite way is TypeInfo<float>::Var.dataType).
+            //       Might be alright to use float everytime. In the end, we never uses the allocated values.
+            //
             auto deviceA = make_shared_device<float /* came from DataType::Float for now.*/>();
             auto deviceB = make_shared_device<float /* came from DataType::Float for now.*/>();
             auto deviceC = make_shared_device<float /* came from DataType::Float for now.*/>();
@@ -688,14 +697,16 @@ namespace AddressCalculationTest
             indexExprPtrs = AddressTrace(m_commandKernel->getKernelGraph(), m_context)
                                 .traceComputeIndexWithBuffer();
             std::vector<Expression::ExpressionPtr> widenedExprPtrs;
-            for(int i = 0, size = indexExprPtrs.size(); i < size; i++)
-            {
-                auto eptr = indexExprPtrs[i];
-                Log::debug("== Expr : {} ", toString(eptr));
 
-                widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr));
-                Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
-            }
+            // Test only one pair of expressions (the first pair.)
+            auto eptr = indexExprPtrs[0];
+            Log::debug("== Expr : {} ", toString(eptr));
+
+            widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr));
+            Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
+
+            auto fast = Expression::FastArithmetic(m_context);
+            Log::debug("** fast : {} ", toString(fast(widenedExprPtrs.back())));
 
             auto k = m_context->kernel();
             m_context->schedule(kb_equal_one(
@@ -766,9 +777,6 @@ namespace AddressCalculationTest
                     co_yield Expression::generate(
                         v_offset, compare_res_pointer + s_ptr->expression(), m_context);
 
-                    // may not be needed.
-                    //co_yield_(Instruction::Wait(WaitCount::LGKMCnt(0, "extra waitcnt for debug")));
-
                     // boolean_diff was allocated to s[0:1]
                     // diff should be computed per lane,
                     // but is_zero_diff is actually one-bit
@@ -799,7 +807,7 @@ namespace AddressCalculationTest
                                         toString(boolType));
                         }
 
-                        // boolean_true = boolean_true | is_zero_diff
+                        // boolean_true = boolean_true & is_zero_diff
                         auto accumRes
                             = std::make_shared<Expression::Expression>(Expression::BitwiseAnd{
                                 boolean_true->expression(), is_zero_diff, "accum"});
@@ -857,28 +865,73 @@ namespace AddressCalculationTest
 
     TEST_CASE("address calculation test generate and run", "[expression][gpu]")
     {
-        // Single here means applied to all three A, B, C matrices.
-        // TODO: Add more dataTypes
-        std::vector<DataType> singleDataTypes = {DataType::Float};
-
-        for(auto dataType : singleDataTypes)
+        // Noticed that for "float" type, all different combinations of
+        // problem sizes (m, n, k) by macro tile sizes (macM, macN)
+        // will generate the same kernel instructions.
+        // This is because address calculation expressions use the same arithmetic
+        // over the same command arguments and workgroup indices, workitem indices.
+        // Only the vales of the command arguments and workgroup, workitem indices change.
+
+        // Also noticed that current expressions obtained from generation of computeIndex's
+        // VGPR base part of buffer_ instructions uses only following command arguments.
+        // Tensor_15_stride_1_14 (D's m or n)
+        // Tensor_4_stride_1_13 (C's m or n)
+        // Tensor_0_stride_1_10 (A's m )
+        // Tensor_2_stride_0_11 (B's n)
+        // No problem size "k" is involved in the expression.
+        // This can be because current expressions are only base VGPR address of buffer_
+        // instructions. The "k" part might be applied as increment.
+        // Or, it could be simply a bug.
+
+        // Called single as the one data type is applied to all A, B, C and D matrices.
+        // TODO: Add more dataTypes. Also notice other TODO in the function "setTensorArguments()",
+        //       where device pointer for matrices are allocated. Currently, only "float" is used.
+        // TODO: Debug. With DataType::Double, the test fails.
+        //       The bug is from generating a widened expression.
+        //       When it was generated, fast modulo introduces BitwiseAnd operations.
+        //       Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa.
+        //       Bitwise expression generator doesn't promote operands' datatype whereas
+        //       other binary arithmetic operations, e.g. Add, do the regType/dataType promotion.
+        auto singleDataType = GENERATE(DataType::Float);
+        //CAPTURE(singleDataType);
+        //INFO("s" << singleDataType);
+        std::cout << "singleType: " << singleDataType << "\n";
+        DYNAMIC_SECTION(singleDataType)
         {
-            for(auto [m, n, macM, macN] : TestValues::gemmProblemSizes)
+            auto [m, n, k]
+                = GENERATE(values<TestValues::GemmProblemSize>(TestValues::gemmProblemSizes));
+
+            std::cout << "problemSize m: " << m << "\n";
+            std::cout << "problemSize n: " << n << "\n";
+            std::cout << "problemSize k: " << k << "\n";
+            DYNAMIC_SECTION("ps_" << m << "x" << n << "x" << k)
             {
-                // Come up with a string from problem_size and data type, to be given to ForTestDevice();
-                auto suffixForKernelName = std::to_string(m) + "x" + std::to_string(n) + "_"
-                                           + std::to_string(macM) + "_" + std::to_string(macN);
-                auto context = TestContext::ForTestDevice({}, suffixForKernelName);
-
-                GEMMProblem                 problem{.m = m, .n = n, .macM = macM, .macN = macN};
-                rocRollerTest::Graphs::GEMM gemm(dataType);
-                gemm.setProblem(problem);
-                CAPTURE(dataType, m, n, macM, macN);
-
-                AddressCalculationTest kernel(context.get(), problem, gemm);
-                // Generate a kernel for testing address calculation and run.
-                // Verification of the result is done.
-                kernel.test_equal();
+                auto [macM, macN] = GENERATE(values(TestValues::macroTileSizes));
+                DYNAMIC_SECTION("mc_" << macM << "x" << macN)
+                {
+                    //CAPTURE(problemSize);
+                    //INFO("p " << problemSize);
+                    std::cout << "macM: " << macM << "\n";
+                    std::cout << "macN: " << macN << "\n";
+
+                    // Come up with a string from problem_size and data type, to be given to ForTestDevice();
+                    auto probSizeString
+                        = std::to_string(m) + "x" + std::to_string(n) + "x" + std::to_string(k);
+                    auto macroTileString = std::to_string(macM) + "x" + std::to_string(macN);
+                    auto suffixForKernelName
+                        = toString(singleDataType) + "_" + probSizeString + "_" + macroTileString;
+                    auto context = TestContext::ForTestDevice({}, suffixForKernelName);
+
+                    GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN};
+                    rocRollerTest::Graphs::GEMM gemm(singleDataType);
+                    gemm.setProblem(problem);
+
+                    AddressCalculationTest kernel(context.get(), problem, gemm);
+
+                    // Generate a kernel for testing address calculation and run.
+                    // Verification of the result is done.
+                    kernel.test_equal();
+                }
             }
         }
     }
diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp
index f660e1a0..f92e8d3d 100644
--- a/test/common/common/TestValues.hpp
+++ b/test/common/common/TestValues.hpp
@@ -27,10 +27,12 @@
 #pragma once
 
 //
-// Value "suites" for arithemtic and expression tests
+// Value "suites" for arithmetic and expression tests
 //
 
 #include <cstdint>
+#include <initializer_list>
+#include <utility>
 #include <vector>
 
 #include <rocRoller/Expression.hpp>
@@ -139,15 +141,20 @@ namespace TestValues
                                                42e5};
 
     // Portions of GEMMProblem
-    struct gemmProblemSize
+    struct GemmProblemSize
     {
         int m;
         int n;
-        int macM;
-        int macN;
+        int k;
     };
 
-    inline std::vector<gemmProblemSize> gemmProblemSizes = {{128, 128, 64, 64}, {512, 512, 64, 64}};
+    inline std::initializer_list<GemmProblemSize> gemmProblemSizes
+        = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}};
+
+    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}};
+    // Looks like {128, 128} fails
+    // {{64, 64}, {128, 128}};
+    // Also with the current set up - datatype "double" fails.
 
     template <typename T>
     struct ByType

From 5819487368c74e1ee0a9fc00f489778a8b30cd44 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Mon, 17 Mar 2025 21:23:22 +0000
Subject: [PATCH 22/32] Changes with bitwiseand

---
 lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp | 38 ++++++++++++++++++--
 test/catch/AddressCalculationTest.cpp        |  9 ++++-
 test/common/common/TestValues.hpp            |  3 +-
 3 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp
index bfed647f..6ac110c6 100644
--- a/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp
+++ b/lib/source/CodeGen/Arithmetic/BitwiseAnd.cpp
@@ -52,9 +52,26 @@ namespace rocRoller
         AssertFatal(lhs != nullptr);
         AssertFatal(rhs != nullptr);
 
-        auto elementBits = std::max({DataTypeInfo::Get(dest->variableType()).elementBits,
-                                     DataTypeInfo::Get(lhs->variableType()).elementBits,
-                                     DataTypeInfo::Get(rhs->variableType()).elementBits});
+        auto destNumBits = DataTypeInfo::Get(dest->variableType()).elementBits;
+        auto lhsNumBits = DataTypeInfo::Get(lhs->variableType()).elementBits;
+        auto rhsNumBits = DataTypeInfo::Get(rhs->variableType()).elementBits;
+
+        auto elementBits = std::max({destNumBits, lhsNumBits, rhsNumBits});
+
+        // Assertions on number of bits.
+        if(elementBits <= 32u)
+        {
+            // We don't support BitwiseAnd of less than 32-bit, 
+            // gfx9 doesn't have s_and or v_and less than b32.
+            AssertFatal (lhsNumBits == rhsNumBits && lhsNumBits == 32, 
+                "For 32-bit BitwiseAnd, both operands should be 32-bit wide.");
+        } else if (elementBits == 64u) {
+            // I am not aware a path that can lead to destNumbBits being smaller than 64-bit when the control reached here.
+            AssertFatal (destNumBits == 64u);   
+        } else {
+            Throw<FatalError>("Unsupported elementBits for bitwiseAnd operation:: ",
+                ShowValue(elementBits));
+        }
 
         if(dest->regType() == Register::Type::Scalar)
         {
@@ -82,6 +99,20 @@ namespace rocRoller
             }
             else if(elementBits == 64u)
             {
+                Register::ValuePtr l0, l1, r0, r1;
+                if (lhs->regType() == Register::Type::Scalar) {
+                    co_yield get2DwordsScalar(l0, l1, lhs);
+                } else {
+                    co_yield get2DwordsVector(l0, l1, lhs);
+                }
+                co_yield get2DwordsVector(r0, r1, rhs);
+
+                co_yield_(Instruction(
+                    "v_and_b32", {dest->subset({0})}, {l0, r0}, {}, ""));
+                co_yield_(Instruction(
+                    "v_and_b32", {dest->subset({1})}, {l1, r1}, {}, ""));
+                
+#if 0
                 if(lhs->regType() == Register::Type::Literal)
                 {
                     Register::ValuePtr lsb;
@@ -107,6 +138,7 @@ namespace rocRoller
                                           {},
                                           ""));
                 }
+#endif
             }
             else
             {
diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index b7e57a07..5564ec6c 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -863,6 +863,7 @@ namespace AddressCalculationTest
         std::vector<uint64_t>                    m_hostBuffer2;
     };
 
+#if 0
     TEST_CASE("address calculation test generate and run", "[expression][gpu]")
     {
         // Noticed that for "float" type, all different combinations of
@@ -892,7 +893,11 @@ namespace AddressCalculationTest
         //       Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa.
         //       Bitwise expression generator doesn't promote operands' datatype whereas
         //       other binary arithmetic operations, e.g. Add, do the regType/dataType promotion.
+#if 0
         auto singleDataType = GENERATE(DataType::Float);
+#else
+        auto singleDataType = GENERATE(DataType::Float, DataType::Double);
+#endif
         //CAPTURE(singleDataType);
         //INFO("s" << singleDataType);
         std::cout << "singleType: " << singleDataType << "\n";
@@ -935,7 +940,8 @@ namespace AddressCalculationTest
             }
         }
     }
-
+#endif
+#if 1
     TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]")
     {
         auto context = TestContext::ForTestDevice({}, "128x128_one_pair");
@@ -971,4 +977,5 @@ namespace AddressCalculationTest
         AddressCalculationTest kernel(context.get(), problem, gemm);
         kernel.test_sanity_indices();
     }
+#endif
 }
diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp
index f92e8d3d..90f31835 100644
--- a/test/common/common/TestValues.hpp
+++ b/test/common/common/TestValues.hpp
@@ -151,7 +151,8 @@ namespace TestValues
     inline std::initializer_list<GemmProblemSize> gemmProblemSizes
         = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}};
 
-    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}};
+    // inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}};
+    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{128, 128}};
     // Looks like {128, 128} fails
     // {{64, 64}, {128, 128}};
     // Also with the current set up - datatype "double" fails.

From 334635898d82a07d50b7872214a111ec4bda54b7 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Mon, 17 Mar 2025 23:02:08 +0000
Subject: [PATCH 23/32] Fix compile-errors after merge from origin/main

---
 test/catch/AddressCalculationTest.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 5564ec6c..c8e72511 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -129,10 +129,8 @@ namespace AddressCalculationTest
             // fastArithmetic is being used eventually when the expressions are generated.
             auto identity_transducer = [&](auto expr) { return expr; };
             auto coords              = CoordG::Transformer(
-                std::make_shared<rocRoller::KernelGraph::CoordinateGraph::CoordinateGraph>(
-                    m_kGraph.coordinates),
-                m_context,
-                identity_transducer);
+
+                &(m_kGraph.coordinates), identity_transducer);
 
             auto fullStop  = [&](int tag) { return tag == increment; };
             auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream;
@@ -153,7 +151,7 @@ namespace AddressCalculationTest
             // Compute an offset address if we don't have an
             // associated base address to inherit from
             {
-                // base < 0 by the time control reacheds here.
+                // base < 0 by the time control reached here.
                 auto indexExpr
                     = ci.forward ? coords.forward({target})[0] : coords.reverse({target})[0];
 
@@ -802,9 +800,8 @@ namespace AddressCalculationTest
                         auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
                         {
                             auto boolType = resultVariableType(is_zero_diff).dataType;
-                            AssertFatal(boolType == DataType::Bool64,
-                                        "is_zero type {}",
-                                        toString(boolType));
+                            AssertFatal(
+                                boolType == DataType::Bool64, "is_zero type: ", toString(boolType));
                         }
 
                         // boolean_true = boolean_true & is_zero_diff
@@ -863,7 +860,7 @@ namespace AddressCalculationTest
         std::vector<uint64_t>                    m_hostBuffer2;
     };
 
-#if 0
+#if 1
     TEST_CASE("address calculation test generate and run", "[expression][gpu]")
     {
         // Noticed that for "float" type, all different combinations of
@@ -941,7 +938,7 @@ namespace AddressCalculationTest
         }
     }
 #endif
-#if 1
+#if 0
     TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]")
     {
         auto context = TestContext::ForTestDevice({}, "128x128_one_pair");

From 064d8e7db72e314d25d07f38b1e00b6afbbbe01d Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Tue, 18 Mar 2025 19:15:32 +0000
Subject: [PATCH 24/32] clean ups and for debugging

---
 test/catch/AddressCalculationTest.cpp | 9 +++++++--
 test/common/common/TestValues.hpp     | 3 +--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index c8e72511..34999f30 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -795,8 +795,13 @@ namespace AddressCalculationTest
                         // Compute the value 2
                         Register::ValuePtr v_value_2 = nullptr;
                         co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context);
-
-                        // Compute diff (value_1 - value_2)
+#if 1
+                        std::cout << "v_value_1 resultType: " << rocRoller::Expression::resultVariableType(v_value_1) << "\n";
+                        std::cout << (v_value_1->toString()) << " " << (v_value_1->description()) <<  "\n";
+                        std::cout << "v_value_2 resultType: " << rocRoller::Expression::resultVariableType(v_value_2) << "\n";
+                        std::cout << (v_value_2->toString()) << " " << (v_value_2->description()) <<  "\n";
+#endif
+                        // Compute diff (value_1 == value_2)
                         auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
                         {
                             auto boolType = resultVariableType(is_zero_diff).dataType;
diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp
index 90f31835..b73ba254 100644
--- a/test/common/common/TestValues.hpp
+++ b/test/common/common/TestValues.hpp
@@ -151,8 +151,7 @@ namespace TestValues
     inline std::initializer_list<GemmProblemSize> gemmProblemSizes
         = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}};
 
-    // inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}};
-    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{128, 128}};
+    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}, {128, 128}};
     // Looks like {128, 128} fails
     // {{64, 64}, {128, 128}};
     // Also with the current set up - datatype "double" fails.

From 8b2d8bf2efa86354768c3697bcdf75df369c62fd Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Wed, 19 Mar 2025 01:26:50 +0000
Subject: [PATCH 25/32] Fixing logical errors in AddressCalculationTest after
 merging Scott's big PR

---
 .../KernelGraph/RegisterTagManager_impl.hpp   |  5 +++++
 test/catch/AddressCalculationTest.cpp         | 19 +++++--------------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp
index dffd2fd2..7efbc999 100644
--- a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp
+++ b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp
@@ -122,8 +122,13 @@ namespace rocRoller
 
     inline void RegisterTagManager::addRegister(int tag, Register::ValuePtr value)
     {
+
         AssertFatal(!hasExpression(tag), "Tag already associated with an expression");
+#if 0
+        // Assertion fails on AddressCalculationTest - s2, s3 for workgroupIndex X and Y.
         AssertFatal(!hasRegister(tag), "Tag ", tag, " already in RegisterTagManager.");
+#endif
+
         if(auto existingTag = findRegister(value))
         {
             AssertFatal(value->readOnly(),
diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 34999f30..eda30178 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -131,6 +131,7 @@ namespace AddressCalculationTest
             auto coords              = CoordG::Transformer(
 
                 &(m_kGraph.coordinates), identity_transducer);
+            coords.fillExecutionCoordinates(m_context);
 
             auto fullStop  = [&](int tag) { return tag == increment; };
             auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream;
@@ -795,12 +796,7 @@ namespace AddressCalculationTest
                         // Compute the value 2
                         Register::ValuePtr v_value_2 = nullptr;
                         co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context);
-#if 1
-                        std::cout << "v_value_1 resultType: " << rocRoller::Expression::resultVariableType(v_value_1) << "\n";
-                        std::cout << (v_value_1->toString()) << " " << (v_value_1->description()) <<  "\n";
-                        std::cout << "v_value_2 resultType: " << rocRoller::Expression::resultVariableType(v_value_2) << "\n";
-                        std::cout << (v_value_2->toString()) << " " << (v_value_2->description()) <<  "\n";
-#endif
+
                         // Compute diff (value_1 == value_2)
                         auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
                         {
@@ -865,7 +861,6 @@ namespace AddressCalculationTest
         std::vector<uint64_t>                    m_hostBuffer2;
     };
 
-#if 1
     TEST_CASE("address calculation test generate and run", "[expression][gpu]")
     {
         // Noticed that for "float" type, all different combinations of
@@ -895,11 +890,9 @@ namespace AddressCalculationTest
         //       Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa.
         //       Bitwise expression generator doesn't promote operands' datatype whereas
         //       other binary arithmetic operations, e.g. Add, do the regType/dataType promotion.
-#if 0
-        auto singleDataType = GENERATE(DataType::Float);
-#else
+
         auto singleDataType = GENERATE(DataType::Float, DataType::Double);
-#endif
+
         //CAPTURE(singleDataType);
         //INFO("s" << singleDataType);
         std::cout << "singleType: " << singleDataType << "\n";
@@ -942,8 +935,7 @@ namespace AddressCalculationTest
             }
         }
     }
-#endif
-#if 0
+
     TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]")
     {
         auto context = TestContext::ForTestDevice({}, "128x128_one_pair");
@@ -979,5 +971,4 @@ namespace AddressCalculationTest
         AddressCalculationTest kernel(context.get(), problem, gemm);
         kernel.test_sanity_indices();
     }
-#endif
 }

From b0899d1675ba66e4c23fb402d515841f1c15cb48 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Wed, 19 Mar 2025 01:45:28 +0000
Subject: [PATCH 26/32] Remove std::cout and add CAPTURE

---
 test/catch/AddressCalculationTest.cpp | 153 +++++++++++---------------
 1 file changed, 66 insertions(+), 87 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index eda30178..0afbaa1f 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -622,16 +622,12 @@ namespace AddressCalculationTest
 
             for(int i = 0, size = m_hostBuffer.size(); i < m_totalWorkitemCount; i++)
             {
-                // For 128 by 128 output matrix, workgroupCount computed is {512, 2, 1}
-                if(toString(m_hostBuffer[i]) != host_x || toString(m_hostBuffer2[i]) != host_y)
-                {
-                    std::cout
-                        << "workitemCount.x and workitemCount.y in kernel for global workitem " << i
-                        << ": " << m_hostBuffer[i] << ", " << m_hostBuffer2[i] << "\n";
-                    std::cout << "workitemCount.x and workitemCount.y in host for global workitem "
-                              << i << ": " << host_x << ", " << host_y << "\n";
-                }
+                // workitemCount.x in device and host.
+                CAPTURE(toString(m_hostBuffer[i]), host_x, i);
                 CHECK(toString(m_hostBuffer[i]) == host_x);
+
+                // workitemCount.y in device and host.
+                CAPTURE(toString(m_hostBuffer2[i]), host_y, i);
                 CHECK(toString(m_hostBuffer2[i]) == host_y);
             }
         }
@@ -752,77 +748,74 @@ namespace AddressCalculationTest
             auto allone_uint64
                 = std::make_shared<Expression::Expression>(static_cast<uint64_t>(0xFFFFFFFF));
 
-            auto kb = [&]()
-                // return [context, input, widenedInput, workitemCount, workgroupSize, allone_uint64]()
-                -> Generator<Instruction> {
-                    // store base addr
-                    Register::ValuePtr s_ptr;
+            // context, input, widenedInput, workitemCount, workgroupSize, allone_uint64 are used
+            // inside the kb.
+            auto kb = [&]() -> Generator<Instruction> {
+                // store base addr
+                Register::ValuePtr s_ptr;
 
-                    co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr);
+                co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr);
 
-                    Register::ValuePtr s_ptr2;
+                Register::ValuePtr s_ptr2;
 
-                    co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2);
+                co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2);
 
-                    // 2-D
-                    auto compare_res_pointer = get64BitVectorOffset(
-                        m_context,
-                        m_context->kernel()->workitemCount(),
-                        (m_commandKernel->getCommandParameters()->getManualWorkgroupSize())
-                            .value());
-                    Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+                // 2-D
+                auto compare_res_pointer = get64BitVectorOffset(
+                    m_context,
+                    m_context->kernel()->workitemCount(),
+                    (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value());
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
 
-                    Register::ValuePtr v_offset = nullptr;
-                    co_yield Expression::generate(
-                        v_offset, compare_res_pointer + s_ptr->expression(), m_context);
+                Register::ValuePtr v_offset = nullptr;
+                co_yield Expression::generate(
+                    v_offset, compare_res_pointer + s_ptr->expression(), m_context);
 
-                    // boolean_diff was allocated to s[0:1]
-                    // diff should be computed per lane,
-                    // but is_zero_diff is actually one-bit
-                    auto boolean_true = Register::Value::WavefrontPlaceholder(m_context);
-                    Register::ValuePtr v_allone;
-                    co_yield Expression::generate(v_allone, allone_uint64, m_context);
+                // boolean_diff was allocated to s[0:1]
+                // diff should be computed per lane,
+                // but is_zero_diff is actually one-bit
+                auto               boolean_true = Register::Value::WavefrontPlaceholder(m_context);
+                Register::ValuePtr v_allone;
+                co_yield Expression::generate(v_allone, allone_uint64, m_context);
 
-                    co_yield m_context->copier()->copy(
-                        boolean_true, v_allone, "set to true for all lanes");
+                co_yield m_context->copier()->copy(
+                    boolean_true, v_allone, "set to true for all lanes");
 
-                    Register::ValuePtr temp_res;
-                    for(int i = 0, size = indexExprPtrs.size(); i < size; i++)
+                Register::ValuePtr temp_res;
+                for(int i = 0, size = indexExprPtrs.size(); i < size; i++)
+                {
+                    // Compute the value 1
+                    Register::ValuePtr v_value_1 = nullptr;
+                    co_yield Expression::generate(v_value_1, indexExprPtrs[i], m_context);
+
+                    // Compute the value 2
+                    Register::ValuePtr v_value_2 = nullptr;
+                    co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context);
+
+                    // Compute diff (value_1 == value_2)
+                    auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
                     {
-                        // Compute the value 1
-                        Register::ValuePtr v_value_1 = nullptr;
-                        co_yield Expression::generate(v_value_1, indexExprPtrs[i], m_context);
-
-                        // Compute the value 2
-                        Register::ValuePtr v_value_2 = nullptr;
-                        co_yield Expression::generate(v_value_2, widenedExprPtrs[i], m_context);
-
-                        // Compute diff (value_1 == value_2)
-                        auto is_zero_diff = v_value_1->expression() == v_value_2->expression();
-                        {
-                            auto boolType = resultVariableType(is_zero_diff).dataType;
-                            AssertFatal(
-                                boolType == DataType::Bool64, "is_zero type: ", toString(boolType));
-                        }
-
-                        // boolean_true = boolean_true & is_zero_diff
-                        auto accumRes
-                            = std::make_shared<Expression::Expression>(Expression::BitwiseAnd{
-                                boolean_true->expression(), is_zero_diff, "accum"});
-                        {
-                            auto accumType = resultVariableType(accumRes).dataType;
-                            AssertFatal(accumType == DataType::Bool64,
-                                        "accum type {}",
-                                        toString(accumType));
-                        }
-                        co_yield Expression::generate(boolean_true, accumRes, m_context);
+                        auto boolType = resultVariableType(is_zero_diff).dataType;
+                        AssertFatal(
+                            boolType == DataType::Bool64, "is_zero type: ", toString(boolType));
                     }
 
-                    auto v_value = Register::Value::Placeholder(
-                        m_context, Register::Type::Vector, DataType::UInt64, 1);
-                    co_yield m_context->copier()->copy(v_value, boolean_true, "Move value");
-                    co_yield m_context->mem()->storeGlobal(v_offset, v_value, 0, 8);
-                };
+                    // boolean_true = boolean_true & is_zero_diff
+                    auto accumRes = std::make_shared<Expression::Expression>(
+                        Expression::BitwiseAnd{boolean_true->expression(), is_zero_diff, "accum"});
+                    {
+                        auto accumType = resultVariableType(accumRes).dataType;
+                        AssertFatal(
+                            accumType == DataType::Bool64, "accum type {}", toString(accumType));
+                    }
+                    co_yield Expression::generate(boolean_true, accumRes, m_context);
+                }
+
+                auto v_value = Register::Value::Placeholder(
+                    m_context, Register::Type::Vector, DataType::UInt64, 1);
+                co_yield m_context->copier()->copy(v_value, boolean_true, "Move value");
+                co_yield m_context->mem()->storeGlobal(v_offset, v_value, 0, 8);
+            };
 
             m_context->schedule(kb());
 
@@ -834,11 +827,8 @@ namespace AddressCalculationTest
 
             for(int i = 0, size = m_hostBuffer.size(); i < size; i++)
             {
-                if(m_hostBuffer[i] != 0xFFFFFFFF)
-                {
-                    std::cout << "The addresses are not same at " << i << " " << m_hostBuffer[i]
-                              << "\n";
-                }
+                // The two addresses are not same at i.
+                CAPTURE(i, m_hostBuffer[i]);
                 CHECK(m_hostBuffer[i] == 0xFFFFFFFF);
             }
 
@@ -882,19 +872,10 @@ namespace AddressCalculationTest
         // Or, it could be simply a bug.
 
         // Called single as the one data type is applied to all A, B, C and D matrices.
-        // TODO: Add more dataTypes. Also notice other TODO in the function "setTensorArguments()",
-        //       where device pointer for matrices are allocated. Currently, only "float" is used.
-        // TODO: Debug. With DataType::Double, the test fails.
-        //       The bug is from generating a widened expression.
-        //       When it was generated, fast modulo introduces BitwiseAnd operations.
-        //       Those bitwiseand expressions may contain 64bit lhs, 32bit rhs vice versa.
-        //       Bitwise expression generator doesn't promote operands' datatype whereas
-        //       other binary arithmetic operations, e.g. Add, do the regType/dataType promotion.
+        // TODO: check transpose
 
         auto singleDataType = GENERATE(DataType::Float, DataType::Double);
 
-        //CAPTURE(singleDataType);
-        //INFO("s" << singleDataType);
         std::cout << "singleType: " << singleDataType << "\n";
         DYNAMIC_SECTION(singleDataType)
         {
@@ -909,8 +890,6 @@ namespace AddressCalculationTest
                 auto [macM, macN] = GENERATE(values(TestValues::macroTileSizes));
                 DYNAMIC_SECTION("mc_" << macM << "x" << macN)
                 {
-                    //CAPTURE(problemSize);
-                    //INFO("p " << problemSize);
                     std::cout << "macM: " << macM << "\n";
                     std::cout << "macN: " << macN << "\n";
 
@@ -971,4 +950,4 @@ namespace AddressCalculationTest
         AddressCalculationTest kernel(context.get(), problem, gemm);
         kernel.test_sanity_indices();
     }
-}
+}
\ No newline at end of file

From 421daf5d3f8e70b16a52908bc7b7f20302cd2363 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Wed, 19 Mar 2025 21:06:42 +0000
Subject: [PATCH 27/32] Changing/Remove codes after merging from origin/main

---
 test/catch/AddressCalculationTest.cpp |   6 +-
 test/common/WidenTo64bit.cpp          | 274 --------------------------
 test/common/common/WidenTo64bit.hpp   |  12 --
 3 files changed, 3 insertions(+), 289 deletions(-)
 delete mode 100644 test/common/WidenTo64bit.cpp
 delete mode 100644 test/common/common/WidenTo64bit.hpp

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 0afbaa1f..24f4bac5 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -32,7 +32,7 @@
 #include <common/GEMMProblem.hpp>
 #include <common/TestValues.hpp>
 #include <common/Utilities.hpp>
-#include <common/WidenTo64bit.hpp>
+#include <common/WidenAddrExprTo64bit.hpp>
 
 #include <rocRoller/CodeGen/ArgumentLoader.hpp>
 #include <rocRoller/CodeGen/MemoryInstructions.hpp>
@@ -697,7 +697,7 @@ namespace AddressCalculationTest
             auto eptr = indexExprPtrs[0];
             Log::debug("== Expr : {} ", toString(eptr));
 
-            widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr));
+            widenedExprPtrs.push_back(rocRollerTest::widenAddrExprTo64bit(eptr));
             Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
 
             auto fast = Expression::FastArithmetic(m_context);
@@ -741,7 +741,7 @@ namespace AddressCalculationTest
                 auto eptr = indexExprPtrs[i];
                 Log::debug("== Expr : {} ", toString(eptr));
 
-                widenedExprPtrs.push_back(rocRollerTest::widenTo64bit(eptr));
+                widenedExprPtrs.push_back(rocRollerTest::widenAddrExprTo64bit(eptr));
                 Log::debug("++ Widen : {} ", toString(widenedExprPtrs.back()));
             }
 
diff --git a/test/common/WidenTo64bit.cpp b/test/common/WidenTo64bit.cpp
deleted file mode 100644
index dfb21645..00000000
--- a/test/common/WidenTo64bit.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-#include <common/WidenTo64bit.hpp>
-#include <rocRoller/AssemblyKernelArgument.hpp>
-#include <rocRoller/Expression.hpp>
-
-template <typename T>
-constexpr auto cast_to_unsigned(T val)
-{
-    return static_cast<typename std::make_unsigned<T>::type>(val);
-}
-
-namespace rocRollerTest
-{
-
-    using namespace rocRoller;
-
-    struct WidenTo64BitVisitor
-    {
-
-        Expression::ExpressionPtr operator()(Expression::Convert const& expr) const
-        {
-            Expression::Convert cpy = expr;
-            if(expr.arg)
-            {
-                // Here is an assumption that call(cpy.arg) never goes above 64-bit and
-                // input convert's destination types are either int32, uint32, int64 or uint64.
-                // resultVaribleType(expr.arg) is not called intentionally as it will visit the
-                // subtree of expr.arg again.
-                // We can also import similar logic from ExpressionResultTypeVisitor and
-                // make the operator()(...) return a pair of Expression::ExpressionPtr and VariableType
-                // in order to avoid repeated visit of the subtree of cpy.arg.
-                cpy.arg = call(expr.arg);
-                if(expr.destinationType == DataType::UInt32)
-                    return convert(DataType::UInt64, cpy.arg);
-                else if(expr.destinationType == DataType::Int32)
-                    return convert(DataType::Int64, cpy.arg);
-                else if(expr.destinationType == DataType::UInt64
-                        || expr.destinationType == DataType::Int64)
-                    return convert(expr.destinationType, cpy.arg);
-                else
-                {
-                    AssertFatal(false,
-                                "Expected Destination type for a convert is either (u)int{32,64}");
-                    return nullptr;
-                }
-            }
-
-            return std::make_shared<Expression::Expression>(cpy);
-        }
-
-        Expression::ExpressionPtr operator()(Expression::Negate const& expr) const
-        {
-            Expression::Negate cpy = expr;
-            if(expr.arg)
-            {
-                cpy.arg = call(expr.arg);
-            }
-
-            return std::make_shared<Expression::Expression>(cpy);
-        }
-
-        template <Expression::CUnary Expr>
-        Expression::ExpressionPtr operator()(Expr const& expr) const
-        {
-            // Only Expression::Convert and perhaps negates are expected.
-            AssertFatal(false, "Unexpected Unary expression", ShowValue(expr));
-            return nullptr;
-        }
-
-        Expression::ExpressionPtr operator()(Expression::Divide const& expr) const
-        {
-            // TODO: Add check that divisor is a CValue within (u)int32-bit
-            Log::debug("Divisor: {} ", toString(expr.rhs));
-	    AssertFatal(isExpectedLeafType(expr.rhs));
-
-            Expression::Divide cpy = expr;
-            if(expr.lhs)
-                cpy.lhs = call(expr.lhs);
-
-            return std::make_shared<Expression::Expression>(cpy);
-        }
-
-        Expression::ExpressionPtr operator()(Expression::Modulo const& expr) const
-        {
-            // TODO: Add check that divisor is a CValue within (u)int32-bit
-            Log::debug("Modulo: {} ", toString(expr.rhs));
-	    AssertFatal(isExpectedLeafType(expr.rhs));
-
-            Expression::Modulo cpy = expr;
-            if(expr.lhs)
-                cpy.lhs = call(expr.lhs);
-
-            return std::make_shared<Expression::Expression>(cpy);
-        }
-
-        // TODO: for some shifts, logical, subtract operations these may not be correct.
-        //       Not sure yet if address calculation expressions with identity_transducer
-        //       those types of Expression or not.
-        template <Expression::CBinary Expr>
-        requires(Expression::CArithmetic<Expr>) Expression::ExpressionPtr
-            operator()(Expr const& expr) const
-        {
-            if constexpr(std::same_as<Expr, Expression::Subtract>)
-            {
-                AssertFatal(false, "Subtracts are not expected");
-                return nullptr;
-            }
-
-            if constexpr(Expression::CLogical<Expr>)
-            {
-                AssertFatal(false, "logicals are not expected");
-                return nullptr;
-            }
-
-            Expr cpy = expr;
-            if(expr.lhs)
-                cpy.lhs = call(expr.lhs);
-            if(expr.rhs)
-                cpy.rhs = call(expr.rhs);
-
-            return std::make_shared<Expression::Expression>(cpy);
-        }
-
-        // Even with catch-all operator()(Expression::ExpressionPtr) without following,
-        // compilation fails.
-        template <typename Expr>
-        requires(Expression::CBinary<Expr>) Expression::ExpressionPtr
-            operator()(Expr const& expr) const
-        {
-            AssertFatal(false, "Not expected expr : ", ShowValue(expr));
-            return nullptr;
-        }
-
-        template <Expression::CTernary Expr>
-        requires(Expression::CArithmetic<Expr>) Expression::ExpressionPtr
-            operator()(Expr const& expr) const
-        {
-            Expr cpy = expr;
-            if(expr.lhs)
-                cpy.lhs = call(expr.lhs);
-            if(expr.r1hs)
-                cpy.r1hs = call(expr.r1hs);
-            if(expr.r2hs)
-                cpy.r2hs = call(expr.r2hs);
-
-            return std::make_shared<Expression::Expression>(cpy);
-        }
-
-        template <typename Expr>
-        requires(Expression::CTernary<Expr>) Expression::ExpressionPtr
-            operator()(Expr const& expr) const
-        {
-            AssertFatal(false, "Not expected expr : ", ShowValue(expr));
-            return nullptr;
-        }
-
-        // leaves
-        Expression::ExpressionPtr operator()(CommandArgumentPtr const& expr) const
-        {
-            Log::debug("CommandArgumentPtr {}", Expression::toString(expr));
-
-            auto varType = expr->variableType();
-
-            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
-
-            CommandArgumentPtr cpy = expr;
-            return widenTo64(varType.dataType, cpy);
-        }
-
-        Expression::ExpressionPtr operator()(CommandArgumentValue const& expr) const
-        {
-            Log::debug("CommandArgumentValue {} Type {} ",
-                       Expression::toString(expr),
-                       toString(variableType(expr)));
-
-            auto varType = variableType(expr);
-
-            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
-
-            CommandArgumentValue cpy = expr;
-            return widenTo64(varType.dataType, cpy);
-        }
-
-        Expression::ExpressionPtr operator()(Register::ValuePtr const& expr) const
-        {
-            Log::debug("Register::ValuePtr {}", Expression::toString(expr));
-
-            auto varType = expr->variableType();
-
-            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
-
-            Register::ValuePtr cpy = expr;
-            return widenTo64(varType.dataType, cpy);
-        }
-
-        Expression::ExpressionPtr operator()(AssemblyKernelArgumentPtr const& expr) const
-        {
-            Log::debug("AssemblyKernelArgumentPtr {} its expression is {}",
-                       Expression::toString(expr),
-                       Expression::toString(expr->expression));
-
-            auto varType = expr->variableType;
-
-            assertIfNotExpectedType(varType.dataType, Expression::toString(expr));
-
-            AssemblyKernelArgumentPtr cpy = expr;
-            return widenTo64(varType.dataType, cpy);
-        }
-
-        // catch the rest CValue
-        template <Expression::CValue Value>
-        Expression::ExpressionPtr operator()(Value const& expr) const
-        {
-            AssertFatal(
-                false, "No expectation to meet WaveTilePtr or DataFlowTag : ", ShowValue(expr));
-            return nullptr;
-        }
-
-        Expression::ExpressionPtr operator()(Expression::Expression const& expr) const
-        {
-            AssertFatal(false, "No expectation to meet this type of Expression: ", ShowValue(expr));
-            return nullptr;
-        }
-
-        template <typename T>
-        Expression::ExpressionPtr widenTo64(DataType srcType, T const& expr) const
-        {
-            if(srcType == DataType::UInt32)
-                return convert(DataType::UInt64, std::make_shared<Expression::Expression>(expr));
-            else if(srcType == DataType::Int32)
-                return convert(DataType::Int64, std::make_shared<Expression::Expression>(expr));
-
-            return std::make_shared<Expression::Expression>(expr);
-        }
-
-        Expression::ExpressionPtr call(Expression::ExpressionPtr const& expr) const
-        {
-            return std::visit(*this, *expr);
-        }
-
-        void assertIfNotExpectedType(DataType dt, std::string const& showValue) const
-        {
-            AssertFatal(dt == DataType::Int32 || dt == DataType::UInt32 || dt == DataType::UInt64
-                            || dt == DataType::Int64,
-                        "Unexpected DataType for Command/Kernel arguments or "
-                        "workgroup/item indices ",
-                        showValue);
-        }
-
-	bool isExpectedLeafType(Expression::ExpressionPtr const& expr) const {
-            return std::holds_alternative<CommandArgumentValue>(*expr) ||
-                   std::holds_alternative<CommandArgumentPtr>(*expr) ||
-                   std::holds_alternative<Register::ValuePtr>(*expr) ||
-                   std::holds_alternative<AssemblyKernelArgumentPtr>(*expr);
-        }
-	
-    };
-
-    Expression::ExpressionPtr widenTo64bit(Expression::ExpressionPtr expr)
-    {
-        auto origVarType = resultVariableType(expr);
-
-        auto visitor = WidenTo64BitVisitor();
-        auto widened = visitor.call(expr);
-
-        auto finalVarType = resultVariableType(widened);
-
-        AssertFatal(origVarType.dataType == finalVarType.dataType,
-                    "Original and final data types should be the same",
-                    ShowValue(origVarType.dataType),
-                    ShowValue(finalVarType.dataType));
-
-        return widened;
-    }
-}
diff --git a/test/common/common/WidenTo64bit.hpp b/test/common/common/WidenTo64bit.hpp
deleted file mode 100644
index 33a84085..00000000
--- a/test/common/common/WidenTo64bit.hpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <rocRoller/Expression.hpp>
-namespace rocRollerTest {
-/**
-         * @brief Widen (u)int32 to (u)int64.
-         *
-         * Has many assumptions in input expr. See the implementation for details.
-         *
-         * @param expr Input expression
-         * @return ExpressionPtr Transformed expression
-         */
-	rocRoller::Expression::ExpressionPtr widenTo64bit(rocRoller::Expression::ExpressionPtr expr);
-}

From 3be55657305b82c423e7152810ff3f91d172af65 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Wed, 19 Mar 2025 22:39:10 +0000
Subject: [PATCH 28/32] Move kerner bodies into test

---
 test/catch/AddressCalculationTest.cpp | 284 ++++++++++++--------------
 1 file changed, 131 insertions(+), 153 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 24f4bac5..3515bc5d 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -305,135 +305,6 @@ namespace AddressCalculationTest
             return compare_res_pointer;
         }
 
-        // This is for printing out workgroup index and thread index.
-        auto kb_sanity_indices(ContextPtr                                      context,
-                               std::array<Expression::ExpressionPtr, 3> const& workitemCount,
-                               std::array<unsigned int, 3> const&              workgroupSize)
-        {
-            return [context, workitemCount, workgroupSize]() -> Generator<Instruction> {
-                // store base addr
-                Register::ValuePtr s_ptr;
-                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
-
-                Register::ValuePtr s_ptr2;
-                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
-
-                auto compare_res_pointer
-                    = get64BitVectorOffset(context, workitemCount, workgroupSize);
-                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
-
-                Register::ValuePtr v_offset_1 = nullptr;
-                co_yield Expression::generate(
-                    v_offset_1, compare_res_pointer + s_ptr->expression(), context);
-
-                Register::ValuePtr v_offset_2 = nullptr;
-                co_yield Expression::generate(
-                    v_offset_2, compare_res_pointer + s_ptr2->expression(), context);
-
-                // workgroupIndex x
-                auto v_wg_x = Register::Value::Placeholder(
-                    context, Register::Type::Vector, DataType::UInt32, 1);
-                co_yield context->copier()->copy(
-                    v_wg_x, context->kernel()->workgroupIndex()[0], "copy wgi.x to v");
-                co_yield context->mem()->storeGlobal(v_offset_1, v_wg_x, 0, 4);
-
-                co_yield context->mem()->storeGlobal(
-                    v_offset_2, (context->kernel()->workitemIndex())[0], 0, 4);
-            };
-        }
-
-        // Just print out passed expressions to device memory.
-        // If passed expressions are workitemCounts, the expectation is that
-        // the generated expressions' values are the same with the host-side computed values.
-        auto kb_implicit_workitemcount(ContextPtr                         context,
-                                       Expression::ExpressionPtr const&   workitemcount_X,
-                                       Expression::ExpressionPtr const&   workitemcount_Y,
-                                       std::array<unsigned int, 3> const& workgroupSize)
-        {
-            return [context,
-                    workitemcount_X,
-                    workitemcount_Y,
-                    workgroupSize]() -> Generator<Instruction> {
-                // store base addrs
-                Register::ValuePtr s_ptr;
-                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
-                Register::ValuePtr s_ptr2;
-                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
-                auto compare_res_pointer = get64BitVectorOffset(
-                    context, {workitemcount_X, workitemcount_Y}, workgroupSize);
-                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
-
-                Register::ValuePtr v_offset_1 = nullptr;
-                co_yield Expression::generate(
-                    v_offset_1, compare_res_pointer + s_ptr->expression(), context);
-
-                Register::ValuePtr v_offset_2 = nullptr;
-                co_yield Expression::generate(
-                    v_offset_2, compare_res_pointer + s_ptr2->expression(), context);
-
-                // Compute the value 1
-                Register::ValuePtr s_value_1 = nullptr;
-                co_yield Expression::generate(s_value_1, workitemcount_X, context);
-                auto v_value_11 = Register::Value::Placeholder(
-                    context, Register::Type::Vector, DataType::Int64, 1);
-                co_yield context->copier()->copy(v_value_11, s_value_1, "copy to v1");
-
-                // Compute the value 2
-                Register::ValuePtr s_value_2 = nullptr;
-                co_yield Expression::generate(s_value_2, workitemcount_Y, context);
-                auto v_value_22 = Register::Value::Placeholder(
-                    context, Register::Type::Vector, DataType::Int64, 1);
-                co_yield context->copier()->copy(v_value_22, s_value_2, "copy to v2");
-
-                co_yield context->mem()->storeGlobal(v_offset_1, v_value_11, 0, 8);
-                co_yield context->mem()->storeGlobal(v_offset_2, v_value_22, 0, 8);
-            };
-        }
-
-        // For now, mainly for debugging, directly copy the two results values.
-        // workitemCount is passed as argument. Since it is 64-bit
-        // different global_store_dwordx2 format should be used.
-        auto kb_equal_one(ContextPtr                                      context,
-                          Expression::ExpressionPtr const&                input,
-                          Expression::ExpressionPtr const&                widenedInput,
-                          std::array<Expression::ExpressionPtr, 3> const& workitemCount,
-                          std::array<unsigned int, 3> const&              workgroupSize)
-        {
-            return [context, input, widenedInput, workitemCount, workgroupSize]()
-                       -> Generator<Instruction> {
-                // store base addr
-                Register::ValuePtr s_ptr;
-                co_yield context->argLoader()->getValue("rv_ptr", s_ptr);
-
-                Register::ValuePtr s_ptr2;
-                co_yield context->argLoader()->getValue("rv_ptr2", s_ptr2);
-
-                auto compare_res_pointer
-                    = get64BitVectorOffset(context, workitemCount, workgroupSize);
-                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
-
-                Register::ValuePtr v_offset_1 = nullptr;
-                co_yield Expression::generate(
-                    v_offset_1, compare_res_pointer + s_ptr->expression(), context);
-
-                Register::ValuePtr v_offset_2 = nullptr;
-                co_yield Expression::generate(
-                    v_offset_2, compare_res_pointer + s_ptr2->expression(), context);
-
-                // Compute the value 1
-                Register::ValuePtr v_value_1 = nullptr;
-                co_yield Expression::generate(v_value_1, input, context);
-
-                // Compute the value 2
-                Register::ValuePtr v_value_2 = nullptr;
-
-                co_yield Expression::generate(v_value_2, widenedInput, context);
-
-                co_yield context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8);
-                co_yield context->mem()->storeGlobal(v_offset_2, v_value_2, 0, 8);
-            };
-        }
-
         void setTensorArguments(CommandArguments&                  commandArgs,
                                 GEMMProblem const&                 problem,
                                 rocRollerTest::Graphs::GEMM const& gemm,
@@ -600,11 +471,50 @@ namespace AddressCalculationTest
             generateOrigKernelByProlog();
 
             auto k = m_context->kernel();
-            m_context->schedule(kb_implicit_workitemcount(
-                m_context,
-                k->workitemCount()[0],
-                k->workitemCount()[1],
-                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+
+            // [context, workitemcount_X, workitemcount_Y, workgroupSize] are used.
+            // Just print out passed expressions to device memory.
+            // If passed expressions are workitemCounts, the expectation is that
+            // the generated expressions' values are the same with the host-side computed values.
+            auto kb = [&]() -> Generator<Instruction> {
+                // store base addrs
+                Register::ValuePtr s_ptr;
+                co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr);
+                Register::ValuePtr s_ptr2;
+                co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2);
+                auto compare_res_pointer = get64BitVectorOffset(
+                    m_context,
+                    k->workitemCount(),
+                    (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value());
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                Register::ValuePtr v_offset_1 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_1, compare_res_pointer + s_ptr->expression(), m_context);
+
+                Register::ValuePtr v_offset_2 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_2, compare_res_pointer + s_ptr2->expression(), m_context);
+
+                // Compute the value 1
+                Register::ValuePtr s_value_1 = nullptr;
+                co_yield Expression::generate(s_value_1, k->workitemCount()[0], m_context);
+                auto v_value_11 = Register::Value::Placeholder(
+                    m_context, Register::Type::Vector, DataType::Int64, 1);
+                co_yield m_context->copier()->copy(v_value_11, s_value_1, "copy to v1");
+
+                // Compute the value 2
+                Register::ValuePtr s_value_2 = nullptr;
+                co_yield Expression::generate(s_value_2, k->workitemCount()[1], m_context);
+                auto v_value_22 = Register::Value::Placeholder(
+                    m_context, Register::Type::Vector, DataType::Int64, 1);
+                co_yield m_context->copier()->copy(v_value_22, s_value_2, "copy to v2");
+
+                co_yield m_context->mem()->storeGlobal(v_offset_1, v_value_11, 0, 8);
+                co_yield m_context->mem()->storeGlobal(v_offset_2, v_value_22, 0, 8);
+            };
+
+            m_context->schedule(kb());
 
             m_context->schedule(k->postamble());
             m_context->schedule(k->amdgpu_metadata());
@@ -632,16 +542,47 @@ namespace AddressCalculationTest
             }
         }
 
-        void test_sanity_indices()
+        void test_wg_thr_indices()
         {
             generateOrigKernelByProlog();
 
             auto k = m_context->kernel();
 
-            m_context->schedule(kb_sanity_indices(
-                m_context,
-                k->workitemCount(),
-                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+            // This is for printing out workgroup index and thread index
+            // [context, workitemCount, workgroupSize] are used.
+            auto kb = [&]() -> Generator<Instruction> {
+                // store base addr
+                Register::ValuePtr s_ptr;
+                co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr);
+
+                Register::ValuePtr s_ptr2;
+                co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2);
+
+                auto compare_res_pointer = get64BitVectorOffset(
+                    m_context,
+                    k->workitemCount(),
+                    (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value());
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                Register::ValuePtr v_offset_1 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_1, compare_res_pointer + s_ptr->expression(), m_context);
+
+                Register::ValuePtr v_offset_2 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_2, compare_res_pointer + s_ptr2->expression(), m_context);
+
+                // workgroupIndex x
+                auto v_wg_x = Register::Value::Placeholder(
+                    m_context, Register::Type::Vector, DataType::UInt32, 1);
+                co_yield m_context->copier()->copy(
+                    v_wg_x, k->workgroupIndex()[0], "copy wgi.x to v");
+                co_yield m_context->mem()->storeGlobal(v_offset_1, v_wg_x, 0, 4);
+
+                co_yield m_context->mem()->storeGlobal(v_offset_2, (k->workitemIndex())[0], 0, 4);
+            };
+
+            m_context->schedule(kb());
 
             m_context->schedule(k->postamble());
             m_context->schedule(k->amdgpu_metadata());
@@ -683,7 +624,7 @@ namespace AddressCalculationTest
             }
         }
 
-        void test_equal_one()
+        void test_equal_one_pair()
         {
             generateOrigKernelByProlog();
 
@@ -704,12 +645,45 @@ namespace AddressCalculationTest
             Log::debug("** fast : {} ", toString(fast(widenedExprPtrs.back())));
 
             auto k = m_context->kernel();
-            m_context->schedule(kb_equal_one(
-                m_context,
-                indexExprPtrs[0],
-                widenedExprPtrs[0],
-                k->workitemCount(),
-                (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value())());
+
+            // context, input, widenedInput, workitemCount, workgroupSize, are used
+            // inside the kb.
+            auto kb = [&]() -> Generator<Instruction> {
+                // store base addr
+                Register::ValuePtr s_ptr;
+                co_yield m_context->argLoader()->getValue("rv_ptr", s_ptr);
+
+                Register::ValuePtr s_ptr2;
+                co_yield m_context->argLoader()->getValue("rv_ptr2", s_ptr2);
+
+                auto compare_res_pointer = get64BitVectorOffset(
+                    m_context,
+                    k->workitemCount(),
+                    (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value());
+                Log::debug("Offset in kb: {}", toString(compare_res_pointer));
+
+                Register::ValuePtr v_offset_1 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_1, compare_res_pointer + s_ptr->expression(), m_context);
+
+                Register::ValuePtr v_offset_2 = nullptr;
+                co_yield Expression::generate(
+                    v_offset_2, compare_res_pointer + s_ptr2->expression(), m_context);
+
+                // Compute the value 1
+                Register::ValuePtr v_value_1 = nullptr;
+                co_yield Expression::generate(v_value_1, indexExprPtrs[0], m_context);
+
+                // Compute the value 2
+                Register::ValuePtr v_value_2 = nullptr;
+
+                co_yield Expression::generate(v_value_2, widenedExprPtrs[0], m_context);
+
+                co_yield m_context->mem()->storeGlobal(v_offset_1, v_value_1, 0, 8);
+                co_yield m_context->mem()->storeGlobal(v_offset_2, v_value_2, 0, 8);
+            };
+
+            m_context->schedule(kb());
 
             m_context->schedule(k->postamble());
             m_context->schedule(k->amdgpu_metadata());
@@ -727,7 +701,7 @@ namespace AddressCalculationTest
             }
         }
 
-        void test_equal()
+        void test_equal_all_pairs()
         {
             generateOrigKernelByProlog();
 
@@ -748,6 +722,8 @@ namespace AddressCalculationTest
             auto allone_uint64
                 = std::make_shared<Expression::Expression>(static_cast<uint64_t>(0xFFFFFFFF));
 
+            auto k = m_context->kernel();
+
             // context, input, widenedInput, workitemCount, workgroupSize, allone_uint64 are used
             // inside the kb.
             auto kb = [&]() -> Generator<Instruction> {
@@ -763,7 +739,7 @@ namespace AddressCalculationTest
                 // 2-D
                 auto compare_res_pointer = get64BitVectorOffset(
                     m_context,
-                    m_context->kernel()->workitemCount(),
+                    k->workitemCount(),
                     (m_commandKernel->getCommandParameters()->getManualWorkgroupSize()).value());
                 Log::debug("Offset in kb: {}", toString(compare_res_pointer));
 
@@ -819,7 +795,6 @@ namespace AddressCalculationTest
 
             m_context->schedule(kb());
 
-            auto k = m_context->kernel();
             m_context->schedule(k->postamble());
             m_context->schedule(k->amdgpu_metadata());
 
@@ -909,12 +884,13 @@ namespace AddressCalculationTest
 
                     // Generate a kernel for testing address calculation and run.
                     // Verification of the result is done.
-                    kernel.test_equal();
+                    kernel.test_equal_all_pairs();
                 }
             }
         }
     }
 
+    // Following test checks only one-pair, simpler version of above.
     TEST_CASE("address calculation test generate and run one pair", "[expression][gpu]")
     {
         auto context = TestContext::ForTestDevice({}, "128x128_one_pair");
@@ -924,9 +900,10 @@ namespace AddressCalculationTest
         gemm.setProblem(problem);
 
         AddressCalculationTest kernel(context.get(), problem, gemm);
-        kernel.test_equal_one();
+        kernel.test_equal_one_pair();
     }
 
+    // Sanity check 1
     TEST_CASE("address calculation test implicit workitemcount", "[expression][gpu]")
     {
         auto context = TestContext::ForTestDevice({}, "impl_workitemcnt");
@@ -939,7 +916,8 @@ namespace AddressCalculationTest
         kernel.test_implicit_workitemcount();
     }
 
-    TEST_CASE("address calculation test sanity check", "[expression][gpu]")
+    // Sanity check 2
+    TEST_CASE("address calculation test workgroup thread index", "[expression][gpu]")
     {
         auto context = TestContext::ForTestDevice({}, "128x128_sanity_indices");
 
@@ -948,6 +926,6 @@ namespace AddressCalculationTest
         gemm.setProblem(problem);
 
         AddressCalculationTest kernel(context.get(), problem, gemm);
-        kernel.test_sanity_indices();
+        kernel.test_wg_thr_indices();
     }
 }
\ No newline at end of file

From b542e9370089fac5c867fb4040c844d24c2d241d Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Thu, 20 Mar 2025 16:13:19 +0000
Subject: [PATCH 29/32] Fix an assertion failure by hoisting coordinate
 transformer's fillExecutionCoordinates() out of a loop body

---
 .../InstructionValues/Register_impl.hpp         |  3 ---
 .../KernelGraph/RegisterTagManager_impl.hpp     |  5 -----
 test/catch/AddressCalculationTest.cpp           | 17 ++++++++---------
 3 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/lib/include/rocRoller/InstructionValues/Register_impl.hpp b/lib/include/rocRoller/InstructionValues/Register_impl.hpp
index 083d24c1..e4dc7e51 100644
--- a/lib/include/rocRoller/InstructionValues/Register_impl.hpp
+++ b/lib/include/rocRoller/InstructionValues/Register_impl.hpp
@@ -840,9 +840,6 @@ namespace rocRoller
             std::vector<int> coords;
             for(auto i : indices)
             {
-                if (i < 0 || (size_t)i >= m_allocationCoord.size()) {
-                    Log::debug("FA i {}", i);
-                }
                 AssertFatal(i >= 0 && (size_t)i < m_allocationCoord.size(),
                             "Register subset out of bounds.",
                             ShowValue(m_allocationCoord.size()),
diff --git a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp
index 7efbc999..dffd2fd2 100644
--- a/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp
+++ b/lib/include/rocRoller/KernelGraph/RegisterTagManager_impl.hpp
@@ -122,13 +122,8 @@ namespace rocRoller
 
     inline void RegisterTagManager::addRegister(int tag, Register::ValuePtr value)
     {
-
         AssertFatal(!hasExpression(tag), "Tag already associated with an expression");
-#if 0
-        // Assertion fails on AddressCalculationTest - s2, s3 for workgroupIndex X and Y.
         AssertFatal(!hasRegister(tag), "Tag ", tag, " already in RegisterTagManager.");
-#endif
-
         if(auto existingTag = findRegister(value))
         {
             AssertFatal(value->readOnly(),
diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 3515bc5d..f3b829ff 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -76,6 +76,14 @@ namespace AddressCalculationTest
         std::vector<Expression::ExpressionPtr> rv;
         auto                                   root  = m_kGraph.control.roots().only();
         int                                    count = 0;
+
+        // Note that identity_transduer is intentionally being used here in place of FastArithmetic.
+        // It is alright to use FastArithmetic here but
+        // fastArithmetic is being used eventually when the expressions are generated.
+        auto identity_transducer = [&](auto expr) { return expr; };
+        auto coords = CoordG::Transformer(&(m_kGraph.coordinates), identity_transducer);
+        coords.fillExecutionCoordinates(m_context);
+
         for(auto ciTag : filter(isComputeIndex, m_kGraph.control.depthFirstVisit(root.value())))
         {
 
@@ -124,15 +132,6 @@ namespace AddressCalculationTest
                                       KernelG::Connections::ComputeIndex{
                                           KernelG::Connections::ComputeIndexArgument::INCREMENT});
 
-            // Note that identity_transduer is intentionally being used here in place of FastArithmetic.
-            // It is alright to use FastArithmetic here but
-            // fastArithmetic is being used eventually when the expressions are generated.
-            auto identity_transducer = [&](auto expr) { return expr; };
-            auto coords              = CoordG::Transformer(
-
-                &(m_kGraph.coordinates), identity_transducer);
-            coords.fillExecutionCoordinates(m_context);
-
             auto fullStop  = [&](int tag) { return tag == increment; };
             auto direction = ci.forward ? Graph::Direction::Upstream : Graph::Direction::Downstream;
             auto [required, path] = findRequiredCoordinates(target, direction, fullStop, m_kGraph);

From a529b85edbe9f5a3cb7452ac16369b8e4ce970b9 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Thu, 20 Mar 2025 16:19:49 -0400
Subject: [PATCH 30/32] Small changes

---
 lib/source/ExpressionTransformations/FastDivision.cpp | 6 ------
 test/catch/AddressCalculationTest.cpp                 | 6 +++---
 test/common/common/TestValues.hpp                     | 3 ---
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/lib/source/ExpressionTransformations/FastDivision.cpp b/lib/source/ExpressionTransformations/FastDivision.cpp
index aeef0631..c5af98e4 100644
--- a/lib/source/ExpressionTransformations/FastDivision.cpp
+++ b/lib/source/ExpressionTransformations/FastDivision.cpp
@@ -329,9 +329,6 @@ namespace rocRoller
         {
             unsigned int mask    = rhs - 1u;
             auto         new_rhs = literal(mask);
-#if 1
-            Log::debug("Yoonseo - umodulo - name {} toString {} mask {}", name(new_rhs), toString(new_rhs), mask);
-#endif
             return lhs & new_rhs;
         }
 
@@ -345,9 +342,6 @@ namespace rocRoller
             int          mask               = ~(rhs - 1);
 
             auto maskExpr               = literal(mask);
-#if 1
-            Log::debug("Yoonseo - smodulo - name {} toString {} mask {}", name(maskExpr), toString(maskExpr), mask);
-#endif
             auto signBitsExpr           = literal(signBits);
             auto reverseShiftAmountExpr = literal(reverseShiftAmount);
 
diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index f3b829ff..cb4de599 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -894,7 +894,7 @@ namespace AddressCalculationTest
     {
         auto context = TestContext::ForTestDevice({}, "128x128_one_pair");
 
-        GEMMProblem                 problem{.m = 128, .n = 128};
+        GEMMProblem                 problem{.m = 128, .n = 128, .macM = 64, .macN = 64, .macK = 64};
         rocRollerTest::Graphs::GEMM gemm(DataType::Float);
         gemm.setProblem(problem);
 
@@ -907,7 +907,7 @@ namespace AddressCalculationTest
     {
         auto context = TestContext::ForTestDevice({}, "impl_workitemcnt");
 
-        GEMMProblem                 problem{.m = 128, .n = 128};
+        GEMMProblem                 problem{.m = 128, .n = 128, .macM = 64, .macN = 64, .macK = 64};
         rocRollerTest::Graphs::GEMM gemm(DataType::Float);
         gemm.setProblem(problem);
 
@@ -920,7 +920,7 @@ namespace AddressCalculationTest
     {
         auto context = TestContext::ForTestDevice({}, "128x128_sanity_indices");
 
-        GEMMProblem                 problem{.m = 128, .n = 128};
+        GEMMProblem                 problem{.m = 128, .n = 128, .macM = 64, .macN = 64, .macK = 64};
         rocRollerTest::Graphs::GEMM gemm(DataType::Float);
         gemm.setProblem(problem);
 
diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp
index b73ba254..77db2372 100644
--- a/test/common/common/TestValues.hpp
+++ b/test/common/common/TestValues.hpp
@@ -152,9 +152,6 @@ namespace TestValues
         = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}};
 
     inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}, {128, 128}};
-    // Looks like {128, 128} fails
-    // {{64, 64}, {128, 128}};
-    // Also with the current set up - datatype "double" fails.
 
     template <typename T>
     struct ByType

From 8a8439dc0356bdf010bc6edec8ad95a896b5b785 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Tue, 25 Mar 2025 20:08:20 +0000
Subject: [PATCH 31/32] Adding transpose

---
 test/catch/AddressCalculationTest.cpp | 29 ++++++++++++++++++---------
 test/common/common/TestValues.hpp     |  2 +-
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index cb4de599..07b3d562 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -59,7 +59,7 @@ namespace AddressCalculationTest
     public:
         AddressTrace(KernelGraphType const& graph, ContextPtr ctx)
             : m_kGraph(graph)
-            , m_context(ctx){};
+            , m_context(ctx) {};
         std::vector<Expression::ExpressionPtr> traceComputeIndexWithBuffer();
 
     private:
@@ -873,17 +873,28 @@ namespace AddressCalculationTest
                     auto macroTileString = std::to_string(macM) + "x" + std::to_string(macN);
                     auto suffixForKernelName
                         = toString(singleDataType) + "_" + probSizeString + "_" + macroTileString;
-                    auto context = TestContext::ForTestDevice({}, suffixForKernelName);
 
-                    GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN};
-                    rocRollerTest::Graphs::GEMM gemm(singleDataType);
-                    gemm.setProblem(problem);
+                    auto [transA, transB]
+                        = GENERATE(values<std::pair<std::string, std::string>>({{"N", "T"}, {"N", "N"}, {"T", "N"}, {"T", "T"}}));
+                    DYNAMIC_SECTION(transA << transB)
+                    {
+                        std::cout << "transA: " << transA << "\n";
+                        std::cout << "transB: " << transB << "\n";
+
+                        suffixForKernelName += "_" + transA + transB;
+                        auto context = TestContext::ForTestDevice({}, suffixForKernelName);
 
-                    AddressCalculationTest kernel(context.get(), problem, gemm);
+                        GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN, 
+                                            .transA = transA, .transB = transB};
+                        rocRollerTest::Graphs::GEMM gemm(singleDataType);
+                        gemm.setProblem(problem);
 
-                    // Generate a kernel for testing address calculation and run.
-                    // Verification of the result is done.
-                    kernel.test_equal_all_pairs();
+                        AddressCalculationTest kernel(context.get(), problem, gemm);
+
+                        // Generate a kernel for testing address calculation and run.
+                        // Verification of the result is done.
+                        kernel.test_equal_all_pairs();
+                    }
                 }
             }
         }
diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp
index 77db2372..d0357a8a 100644
--- a/test/common/common/TestValues.hpp
+++ b/test/common/common/TestValues.hpp
@@ -151,7 +151,7 @@ namespace TestValues
     inline std::initializer_list<GemmProblemSize> gemmProblemSizes
         = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}};
 
-    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}, {128, 128}};
+    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}, {128, 128}, {256, 256}};
 
     template <typename T>
     struct ByType

From f72ba8fa3ead1cbde5e5d245c4f5e48c58080e26 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi@amd.com>
Date: Thu, 27 Mar 2025 18:51:00 +0000
Subject: [PATCH 32/32] Clean up some old comments

---
 test/catch/AddressCalculationTest.cpp | 17 +++++++++++------
 test/common/common/TestValues.hpp     |  4 +++-
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/test/catch/AddressCalculationTest.cpp b/test/catch/AddressCalculationTest.cpp
index 07b3d562..c5e677f7 100644
--- a/test/catch/AddressCalculationTest.cpp
+++ b/test/catch/AddressCalculationTest.cpp
@@ -59,7 +59,7 @@ namespace AddressCalculationTest
     public:
         AddressTrace(KernelGraphType const& graph, ContextPtr ctx)
             : m_kGraph(graph)
-            , m_context(ctx) {};
+            , m_context(ctx){};
         std::vector<Expression::ExpressionPtr> traceComputeIndexWithBuffer();
 
     private:
@@ -846,8 +846,8 @@ namespace AddressCalculationTest
         // Or, it could be simply a bug.
 
         // Called single as the one data type is applied to all A, B, C and D matrices.
-        // TODO: check transpose
 
+        // To cut down execution time further, consider running only large matrices.
         auto singleDataType = GENERATE(DataType::Float, DataType::Double);
 
         std::cout << "singleType: " << singleDataType << "\n";
@@ -874,8 +874,8 @@ namespace AddressCalculationTest
                     auto suffixForKernelName
                         = toString(singleDataType) + "_" + probSizeString + "_" + macroTileString;
 
-                    auto [transA, transB]
-                        = GENERATE(values<std::pair<std::string, std::string>>({{"N", "T"}, {"N", "N"}, {"T", "N"}, {"T", "T"}}));
+                    auto [transA, transB] = GENERATE(values<std::pair<std::string, std::string>>(
+                        {{"N", "T"}, {"N", "N"}, {"T", "N"}, {"T", "T"}}));
                     DYNAMIC_SECTION(transA << transB)
                     {
                         std::cout << "transA: " << transA << "\n";
@@ -884,8 +884,13 @@ namespace AddressCalculationTest
                         suffixForKernelName += "_" + transA + transB;
                         auto context = TestContext::ForTestDevice({}, suffixForKernelName);
 
-                        GEMMProblem problem{.m = m, .n = n, .k = k, .macM = macM, .macN = macN, 
-                                            .transA = transA, .transB = transB};
+                        GEMMProblem                 problem{.m      = m,
+                                            .n      = n,
+                                            .k      = k,
+                                            .macM   = macM,
+                                            .macN   = macN,
+                                            .transA = transA,
+                                            .transB = transB};
                         rocRollerTest::Graphs::GEMM gemm(singleDataType);
                         gemm.setProblem(problem);
 
diff --git a/test/common/common/TestValues.hpp b/test/common/common/TestValues.hpp
index d0357a8a..9fc4116d 100644
--- a/test/common/common/TestValues.hpp
+++ b/test/common/common/TestValues.hpp
@@ -151,7 +151,9 @@ namespace TestValues
     inline std::initializer_list<GemmProblemSize> gemmProblemSizes
         = {{128, 128, 128}, {512, 512, 128}, {1024, 1024, 256}};
 
-    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}, {128, 128}, {256, 256}};
+    // Notice that {256, 256} was intentionally avoided due to extremely prolonged time of
+    // code generation.
+    inline std::initializer_list<std::pair<int, int>> macroTileSizes = {{64, 64}, {128, 128}};
 
     template <typename T>
     struct ByType