hw-native-sys · Hzfengsy · Apr 2, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 6, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -104,6 +104,7 @@ set(PYPTO_SOURCES
     src/ir/op/tile_ops/transform.cpp
     src/ir/op/tile_ops/unary.cpp
     src/ir/op/tile_ops/cross_core.cpp
+    src/ir/op/tile_ops/utility.cpp
     src/ir/op/sync_ops/sync.cpp
     src/ir/op/sync_ops/cross_core.cpp
     src/ir/op/tensor_ops/broadcast.cpp
@@ -114,6 +115,7 @@ set(PYPTO_SOURCES
     src/ir/op/tensor_ops/reduction.cpp
     src/ir/op/tensor_ops/transform.cpp
     src/ir/op/tensor_ops/unary.cpp
+    src/ir/op/tensor_ops/utility.cpp
     src/ir/op/testing.cpp
     src/ir/op/type_inference.cpp
 

diff --git a/python/pypto/backend/pto_backend.py b/python/pypto/backend/pto_backend.py
@@ -330,6 +330,21 @@ def _generate_kernel_wrapper(func: _ir_core.Function, ptoas_code: str) -> str:
     3. ``kernel_entry`` wrapper with arg unpacking and forward call
     """
     header = _KERNEL_HEADER.format(func_name=func.name)
+    # TPRINT is guarded by #ifdef _DEBUG in pto-isa headers.  Defining
+    # _DEBUG globally is too broad (it enables cce::printf calls that don't
+    # compile on simulation).  Instead, provide a no-op fallback so the
+    # generated code compiles in all environments.
+    if "TPRINT" in ptoas_code:
+        header = header.replace(
+            "using namespace pto;",
+            "using namespace pto;\n\n"
+            "#ifndef _DEBUG\n"
+            "namespace pto {\n"
+            "template <typename T>\n"
+            "PTO_INST void TPRINT(T& /*src*/) {}\n"
+            "} // namespace pto\n"
+            "#endif",
+        )
     ptoas_body = _preprocess_ptoas_output(ptoas_code)
     unpacking_code, var_names = _generate_arg_unpacking(func)
     call_args = ", ".join(var_names)

diff --git a/python/pypto/ir/op/tensor_ops.py b/python/pypto/ir/op/tensor_ops.py
@@ -940,3 +940,19 @@ def scatter_update(
     op_args: list[Expr] = [input, index, src]
     kwargs: dict[str, Any] = {"dim": dim_val}
     return _ir_core.create_op_call("tensor.scatter_update", op_args, kwargs, actual_span)
+
+
+def runtime_print(tensor: Expr, span: Span | None = None) -> Call:
+    """Build a ``tensor.runtime_print`` call that prints a tensor at runtime.
+
+    Debugging aid; the backend lowers this op to a ``pto.tprint`` instruction.
+
+    Args:
+        tensor: Expression of the tensor to print (must be a TensorType)
+        span: Optional source span for debugging (auto-captured if not provided)
+
+    Returns:
+        Call expression whose type is the input's TensorType (pass-through)
+    """
+    actual_span = _get_span_or_capture(span)
+    return _ir_core.create_op_call("tensor.runtime_print", [tensor], {}, actual_span)
diff --git a/python/pypto/ir/op/tile_ops.py b/python/pypto/ir/op/tile_ops.py
@@ -1950,3 +1950,19 @@ def tpop_from_aiv(
         op = _ir_core.get_op("tile.tpop_from_aiv")
         return _ir_core.Call(op, [], {"split": split}, resolved_type, actual_span)
     return _ir_core.create_op_call("tile.tpop_from_aiv", [], {"split": split}, actual_span)
+
+
+def runtime_print(tile: Expr, span: Span | None = None) -> Call:
+    """Build a ``tile.runtime_print`` call that prints a tile at runtime.
+
+    Debugging aid; the backend lowers this op to a ``pto.tprint`` instruction.
+
+    Args:
+        tile: Expression of the tile to print (must be a TileType)
+        span: Optional source span for debugging (auto-captured if not provided)
+
+    Returns:
+        Call expression whose type is the input's TileType (pass-through)
+    """
+    actual_span = _get_span_or_capture(span)
+    return _ir_core.create_op_call("tile.runtime_print", [tile], {}, actual_span)
diff --git a/python/pypto/language/__init__.py b/python/pypto/language/__init__.py
@@ -159,6 +159,7 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]:
     row_min,
     row_sum,
     rsqrt,
+    runtime_print,
     slice,
     sqrt,
     sub,
@@ -264,6 +265,7 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]:
     "recip",
     "read",
     "write",
+    "runtime_print",
     # Promoted tile-only
     "create_tile",
     "fillpad",

diff --git a/python/pypto/language/op/tensor_ops.py b/python/pypto/language/op/tensor_ops.py
@@ -59,6 +59,7 @@
     "reshape",
     "transpose",
     "scatter_update",
+    "runtime_print",
 ]
 
 from pypto.ir.op import tensor_ops as _ir_ops
@@ -779,3 +780,15 @@ def scatter_update(
     """
     call_expr = _ir_ops.scatter_update(input.unwrap(), dim, index.unwrap(), src.unwrap())
     return Tensor(expr=call_expr)
+
+
+def runtime_print(tensor: Tensor) -> None:
+    """Print tensor contents at runtime for debugging.
+
+    Generates a pto.tprint instruction in the compiled output.
+    Statement-only: the IR Call built here is discarded and None is returned.
+
+    Args:
+        tensor: Input tensor to print; unwrapped to its IR expression
+    """
+    _ir_ops.runtime_print(tensor.unwrap())
diff --git a/python/pypto/language/op/tile_ops.py b/python/pypto/language/op/tile_ops.py
@@ -108,6 +108,7 @@
     "tpush_to_aic",
     "tpop_from_aic",
     "tpop_from_aiv",
+    "runtime_print",
 ]
 
 from pypto.ir.op import tile_ops as _ir_ops
@@ -1547,3 +1548,15 @@ def sels(lhs: Tile, rhs: Tile, select_mode: int | float | Expr | Scalar) -> Tile
     select_mode_expr = select_mode.unwrap() if isinstance(select_mode, Scalar) else select_mode
     call_expr = _ir_ops.sels(lhs.unwrap(), rhs.unwrap(), select_mode_expr)
     return Tile(expr=call_expr)
+
+
+def runtime_print(tile: Tile) -> None:
+    """Print tile contents at runtime for debugging.
+
+    Generates a pto.tprint instruction in the compiled output.
+    Statement-only: the IR Call built here is discarded and None is returned.
+
+    Args:
+        tile: Input tile to print; unwrapped to its IR expression
+    """
+    _ir_ops.runtime_print(tile.unwrap())
diff --git a/python/pypto/language/op/unified_ops.py b/python/pypto/language/op/unified_ops.py
@@ -53,6 +53,7 @@
     "create_tile",
     "read",
     "write",
+    "runtime_print",
 ]
 
 from pypto.ir.utils import resolve_cast_mode
@@ -552,3 +553,19 @@ def write(dst: Tensor | Tile, offset: IntLike | Sequence[IntLike], value: Scalar
     if isinstance(dst, Tile):
         return _tile.write(dst, offset, value)
     raise TypeError(f"write: expected Tensor or Tile, got {type(dst).__name__}")
+
+
+def runtime_print(src: Tensor | Tile) -> None:
+    """Print a tensor or tile at runtime for debugging (emits pto.tprint); returns nothing.
+
+    Args:
+        src: Tensor or tile to print; the wrapper type selects the tensor or tile op
+
+    Raises:
+        TypeError: If ``src`` is neither a Tensor nor a Tile
+    """
+    if isinstance(src, Tensor):
+        return _tensor.runtime_print(src)
+    if isinstance(src, Tile):
+        return _tile.runtime_print(src)
+    raise TypeError(f"runtime_print: expected Tensor or Tile, got {type(src).__name__}")
diff --git a/src/backend/common/pto_ops_common.cpp b/src/backend/common/pto_ops_common.cpp
@@ -364,7 +364,13 @@ static std::string MakePrintCodegenPTO(const std::string& pto_op_name, const Cal
   CHECK(op->args_.size() == 1) << "Operation:" << pto_op_name << "] requires 1 argument, but got "
                                << op->args_.size();
   std::string src = codegen.GetExprAsCode(op->args_[0]);
-  codegen.Emit(pto_op_name + " ins(" + src + " | !pto.partition_tensor_view<MxNxdtype>)");
+  std::string src_type = codegen.GetExprTypeAnnotation(op->args_[0]);
+  std::string line = pto_op_name + " ins(" + src;
+  if (!src_type.empty()) {
+    line += " : " + src_type;
+  }
+  line += ")";
+  codegen.Emit(line);
   return "";
 }
 
@@ -1234,9 +1240,11 @@ void RegisterPTOOps(Backend& backend, const std::unordered_set<std::string>& exc
   reg("tile.mrgsort", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) {
     return MakeMrgSortCodegenPTO("pto.tmrgsort", op, codegen);
   });
-  reg("tile.print", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) {
+  auto make_tprint = [](const ir::CallPtr& op, codegen::CodegenBase& codegen) {
     return MakePrintCodegenPTO("pto.tprint", op, codegen);
-  });
+  };
+  reg("tile.runtime_print", make_tprint);
+  reg("tensor.runtime_print", make_tprint);
 
   // In-place accumulation ops (matmul_acc, gemv_acc): ptoas expects the
   // accumulator in ins() to be the same SSA value as outs().  InitMemRef

diff --git a/src/ir/op/tensor_ops/utility.cpp b/src/ir/op/tensor_ops/utility.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file utility.cpp
+ * @brief Utility tensor operations (tensor.runtime_print)
+ *
+ * This file implements utility/debugging operations for tensor-level programming.
+ */
+
+#include <any>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "pypto/core/logging.h"
+#include "pypto/ir/kind_traits.h"
+#include "pypto/ir/op_registry.h"
+#include "pypto/ir/type.h"
+
+namespace pypto {
+namespace ir {
+// Deduces the type of a print op: requires exactly one TensorType argument and returns that same type.
+TypePtr DeduceTensorPrintType(const std::vector<ExprPtr>& args,
+                              const std::vector<std::pair<std::string, std::any>>& kwargs,  // unused
+                              const std::string& op_name) {
+  CHECK(args.size() == 1) << "The operator " << op_name << " requires 1 argument (tensor), but got "
+                          << args.size();
+  auto tensor_type = As<TensorType>(args[0]->GetType());
+  CHECK(tensor_type) << "The operator " << op_name << " requires argument to be a TensorType, but got "
+                     << args[0]->GetType()->TypeName();
+  // Pass-through: the op exists for its print side effect, so the input TensorType is returned as-is
+  return tensor_type;
+}
+
+REGISTER_OP("tensor.runtime_print")
+    .set_op_category("TensorOp")
+    .set_description("Print tensor contents for debugging (generates pto.tprint)")
+    .add_argument("tensor", "Input tensor to print (TensorType)")
+    .f_deduce_type([](const std::vector<ExprPtr>& args,
+                      const std::vector<std::pair<std::string, std::any>>& kwargs) {
+      return DeduceTensorPrintType(args, kwargs, "tensor.runtime_print");
+    });
+
+}  // namespace ir
+}  // namespace pypto
diff --git a/src/ir/op/tile_ops/utility.cpp b/src/ir/op/tile_ops/utility.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file utility.cpp
+ * @brief Utility tile operations (tile.runtime_print)
+ *
+ * This file implements utility/debugging operations for tile-level programming.
+ */
+
+#include <any>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "pypto/core/logging.h"
+#include "pypto/ir/kind_traits.h"
+#include "pypto/ir/op_registry.h"
+#include "pypto/ir/type.h"
+
+namespace pypto {
+namespace ir {
+// Deduces the type of a print op: requires exactly one TileType argument and returns that same type.
+TypePtr DeduceTilePrintType(const std::vector<ExprPtr>& args,
+                            const std::vector<std::pair<std::string, std::any>>& kwargs,  // unused
+                            const std::string& op_name) {
+  CHECK(args.size() == 1) << "The operator " << op_name << " requires 1 argument (tile), but got "
+                          << args.size();
+  auto tile_type = As<TileType>(args[0]->GetType());
+  CHECK(tile_type) << "The operator " << op_name << " requires argument to be a TileType, but got "
+                   << args[0]->GetType()->TypeName();
+  // Pass-through: the op exists for its print side effect, so the input TileType is returned as-is
+  return tile_type;
+}
+
+REGISTER_OP("tile.runtime_print")
+    .set_op_category("TileOp")
+    .set_description("Print tile contents for debugging (generates pto.tprint)")
+    .add_argument("tile", "Input tile to print (TileType)")
+    .no_memory_spec()
+    .f_deduce_type([](const std::vector<ExprPtr>& args,
+                      const std::vector<std::pair<std::string, std::any>>& kwargs) {
+      return DeduceTilePrintType(args, kwargs, "tile.runtime_print");
+    });
+
+}  // namespace ir
+}  // namespace pypto
diff --git a/src/ir/transforms/op_conversion_registry.cpp b/src/ir/transforms/op_conversion_registry.cpp
@@ -158,6 +158,42 @@ OpConversionRegistry::OpConversionRegistry() {
   // Memory creation ops
   RegisterSimple("tensor.full", "tile.full");
 
+  // Utility ops — runtime_print needs a custom converter because the
+  // argument may still be a TensorType (e.g. printing a function parameter
+  // before any explicit tile.load).  In that case we insert a tile.load
+  // prologue to materialise the tile, matching the tensor.fillpad pattern.
+  RegisterCustom(
+      "tensor.runtime_print",
+      [](const std::vector<ExprPtr>& args, const std::vector<std::pair<std::string, std::any>>& kwargs,
+         const Span& span) -> ConversionResult {
+        CHECK(args.size() == 1) << "tensor.runtime_print conversion expects 1 arg (input)";
+        auto& op_reg = OpRegistry::GetInstance();
+        const auto& input = args[0];
+
+        // Input already has TileType: emit tile.runtime_print on it directly, no load needed.
+        if (As<TileType>(input->GetType())) {
+          return ConversionResult{op_reg.Create("tile.runtime_print", {input}, span)};
+        }
+
+        auto tensor_type = As<TensorType>(input->GetType());
+        CHECK(tensor_type) << "tensor.runtime_print conversion: input must be TensorType or TileType, got "
+                           << input->GetType()->TypeName();
+        // Presumably a whole-tensor load (zero offsets, full shape) — confirm against the helpers.
+        auto offsets = MakeZeroOffsetsTuple(tensor_type->shape_.size(), span);
+        auto shapes = MakeShapesTuple(tensor_type->shape_, span);
+        // NOTE(review): tile.load gets `shapes` twice (presumably shape + valid shape) — confirm.
+        std::vector<std::pair<std::string, std::any>> load_kwargs = {{"target_memory", MemorySpace::Vec},
+                                                                     {"transpose", false}};
+        auto load_call = op_reg.Create("tile.load", {input, offsets, shapes, shapes}, load_kwargs, span);
+        auto load_var = std::make_shared<Var>("runtime_print_src", load_call->GetType(), span);
+        // NOTE(review): the Var name is fixed per conversion — confirm duplicate names are harmless.
+        std::vector<StmtPtr> prologue;
+        prologue.push_back(std::make_shared<AssignStmt>(load_var, load_call, span));
+
+        auto print_call = op_reg.Create("tile.runtime_print", {load_var}, span);
+        return ConversionResult{std::move(prologue), print_call};
+      });
+
   // ────────────────────────────────────────────────────────────────────────
   // Broadcast-aware elementwise binary ops
   //