From c4946853846283b791097300b93b55b712741805 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 5 May 2026 21:07:19 -0700
Subject: [PATCH 01/59] mlir init

---
 CLAUDE.md                            | 110 ++++++++++
 CMakeLists.txt                       |  35 +++-
 mlir/CMakeLists.txt                  |  28 +++
 mlir/Tools/kun-opt/CMakeLists.txt    |  29 +++
 mlir/Tools/kun-opt/kun-opt.cpp       |  25 +++
 mlir/include/CMakeLists.txt          |   2 +
 mlir/include/KunGpu/CMakeLists.txt   |   1 +
 mlir/include/KunGpu/KunGpuDialect.h  |   6 +
 mlir/include/KunGpu/KunGpuDialect.td |  19 ++
 mlir/include/KunGpu/KunGpuOps.h      |  12 ++
 mlir/include/KunGpu/KunGpuOps.td     |  31 +++
 mlir/include/KunIr/CMakeLists.txt    |   1 +
 mlir/include/KunIr/KunIrDialect.h    |   7 +
 mlir/include/KunIr/KunIrDialect.td   |  21 ++
 mlir/include/KunIr/KunIrOps.h        |  55 +++++
 mlir/include/KunIr/KunIrOps.td       | 215 ++++++++++++++++++++
 mlir/include/KunIr/KunIrTypes.h      |   9 +
 mlir/include/KunIr/KunIrTypes.td     |  35 ++++
 mlir/lib/CMakeLists.txt              |   2 +
 mlir/lib/KunGpu/CMakeLists.txt       |  15 ++
 mlir/lib/KunGpu/KunGpuDialect.cpp    |  19 ++
 mlir/lib/KunGpu/KunGpuOps.cpp        |  11 +
 mlir/lib/KunIr/CMakeLists.txt        |  15 ++
 mlir/lib/KunIr/KunIrDialect.cpp      |  20 ++
 mlir/lib/KunIr/KunIrOps.cpp          | 289 +++++++++++++++++++++++++++
 mlir/lib/KunIr/KunIrTypes.cpp        |  52 +++++
 mlir/test/CMakeLists.txt             |  30 +++
 mlir/test/kungpu/basic.mlir          |  16 ++
 mlir/test/kunir/basic.mlir           | 131 ++++++++++++
 mlir/test/lit.cfg.py                 |  18 ++
 mlir/test/lit.site.cfg.py.in         |  10 +
 31 files changed, 1267 insertions(+), 2 deletions(-)
 create mode 100644 CLAUDE.md
 create mode 100644 mlir/CMakeLists.txt
 create mode 100644 mlir/Tools/kun-opt/CMakeLists.txt
 create mode 100644 mlir/Tools/kun-opt/kun-opt.cpp
 create mode 100644 mlir/include/CMakeLists.txt
 create mode 100644 mlir/include/KunGpu/CMakeLists.txt
 create mode 100644 mlir/include/KunGpu/KunGpuDialect.h
 create mode 100644 mlir/include/KunGpu/KunGpuDialect.td
 create mode 100644 mlir/include/KunGpu/KunGpuOps.h
 create mode 100644 mlir/include/KunGpu/KunGpuOps.td
 create mode 100644 mlir/include/KunIr/CMakeLists.txt
 create mode 100644 mlir/include/KunIr/KunIrDialect.h
 create mode 100644 mlir/include/KunIr/KunIrDialect.td
 create mode 100644 mlir/include/KunIr/KunIrOps.h
 create mode 100644 mlir/include/KunIr/KunIrOps.td
 create mode 100644 mlir/include/KunIr/KunIrTypes.h
 create mode 100644 mlir/include/KunIr/KunIrTypes.td
 create mode 100644 mlir/lib/CMakeLists.txt
 create mode 100644 mlir/lib/KunGpu/CMakeLists.txt
 create mode 100644 mlir/lib/KunGpu/KunGpuDialect.cpp
 create mode 100644 mlir/lib/KunGpu/KunGpuOps.cpp
 create mode 100644 mlir/lib/KunIr/CMakeLists.txt
 create mode 100644 mlir/lib/KunIr/KunIrDialect.cpp
 create mode 100644 mlir/lib/KunIr/KunIrOps.cpp
 create mode 100644 mlir/lib/KunIr/KunIrTypes.cpp
 create mode 100644 mlir/test/CMakeLists.txt
 create mode 100644 mlir/test/kungpu/basic.mlir
 create mode 100644 mlir/test/kunir/basic.mlir
 create mode 100644 mlir/test/lit.cfg.py
 create mode 100644 mlir/test/lit.site.cfg.py.in

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..f2693f4
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,110 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## What This Project Is
+
+KunQuant is a compiler, optimizer, and code generator for financial factor expressions (e.g., WorldQuant Alpha101, Qlib Alpha158). It takes Python-defined financial expressions, applies optimization passes, generates C++ code with SIMD and parallelism, and executes it via a Python binding (KunRunner). It achieves a 170x+ speedup over a naive Pandas implementation.
+
+## Build & Install
+
+```bash
+# Standard install
+pip install .
+
+# Editable install with C++ tests
+KUN_BUILD_TESTS=1 pip install -e .
+```
+
+Build environment variables:
+- `KUN_BUILD_TYPE=Debug|Release` — default: Release
+- `KUN_BUILD_TESTS=1` — enables `KunTest` and `KunCApiTest` targets
+- `KUN_NO_AVX2=1` — disable AVX2/FMA (for older CPUs)
+- `KUN_DEBUG=1` — print internal compiler pass output
+- `KUN_DEBUG_JIT=1` — print JIT C++ compilation details
+
+## Running Tests
+
+```bash
+# Python IR transformation tests (no build required)
+python tests/test.py
+python tests/test2.py
+
+# Streaming mode tests
+python tests/test_stream.py
+
+# C++ runtime tests (requires KUN_BUILD_TESTS=1)
+python tests/test_runtime.py
+
+# Alpha101 correctness (random data)
+python tests/test_alpha101.py
+
+# Integration tests
+bash tests/tests.sh
+```
+
+## Architecture
+
+The pipeline: **Python expression graph → optimization passes → C++ code generation → JIT compile → shared library → KunRunner execution**
+
+### Python Compiler Layer (`KunQuant/`)
+
+- **`Op.py`** — Core IR. All operations inherit from `OpBase`. `Builder` is a thread-local context manager that records ops as they're constructed. Key traits: `WindowedOp`, `ReductionOp`, `SinkOpTrait`, `CrossSectionalOp`.
+- **`Stage.py`** — `Function` holds the op graph; `OpInfo` tracks use counts. Provides topological sort and dead-op elimination.
+- **`Driver.py`** — Orchestrates compilation. `KunCompilerConfig` holds config (dtype, layout, streaming). `compileit()` is the main entry point; `optimize()` runs the pass pipeline.
+- **`ops/`** — Concrete op types: `ElewiseOp.py` (Add, Mul, etc.), `ReduceOp.py` (ReduceAdd, ReduceRank, etc.), `CompOp.py` (Greater, Less, etc.), `MiscOp.py`.
+- **`passes/`** — All optimization passes and code generation:
+  - `InferWindow.py` — Infers time-window sizes
+  - `SpecialOpt.py` — Domain-specific rewrites (stddev, rank)
+  - `Decompose.py` — Expands windowed ops into `ForeachBackWindow` loops + reductions
+  - `ExprFold.py` — Constant folding and algebraic simplification
+  - `TempWindowElim.py` — Eliminates intermediate window buffers
+  - `MergeLoops.py` — Fuses compatible loops
+  - `Partitioner.py` — Partitions ops into parallel execution blocks
+  - `CodegenCpp.py` — Emits C++ source from the final IR
+- **`jit/cfake.py`** — Invokes MSVC/GCC/Clang to compile generated C++ to a shared library.
+- **`predefined/`** — Ready-to-use factor libraries: `Alpha101.py` (101 factors), `Alpha158.py`.
+- **`runner/`** — Python bindings via pybind11; `KunRunner` loads shared libs, creates executors, runs graphs.
+
+### Optimization Pass Order (in `Driver.optimize()`)
+
+1. InferWindow → SpecialOpt → Decompose → ExprFold → SpecialOpt → ExprFold → DecomposeRank → MoveDupRankOutput → TempWindowElim
+
+Post-compile (`post_optimize()`): TempWindowElim → InferInputWindow → MergeLoops
+
+### C++ Runtime (`cpp/`)
+
+- **`cpp/Kun/`** — Core runtime: `Runtime.cpp` (execution engine), `Executor.cpp`, `Module.cpp`, `CApi.cpp`, `Ops.hpp` (operator implementations), `Rank.hpp`, `Scale.hpp`, `SkipList.cpp` (sorted stream state).
+- **`cpp/KunSIMD/`** — SIMD vector ops for x86 (AVX2/AVX512) and ARM (NEON).
+- **`cpp/Python/`** — pybind11 bindings.
+
+### Memory Layouts
+
+- `TS` (Time-Stock): time is outer dimension — default for batch mode
+- `STs` (Stock-Time-blocked): stocks are outer, time is inner with blocking — better for streaming
+
+Recommended blocking: 8 stocks (float + AVX2), 4 stocks (double + AVX2).
+
+## Typical Usage Pattern
+
+```python
+from KunQuant.Op import Builder, Input, Output
+from KunQuant.ops import *
+from KunQuant.Stage import Function
+from KunQuant.jit import cfake
+from KunQuant.Driver import KunCompilerConfig
+
+with Builder() as b:
+    close = Input("close")
+    # ... define factor expressions ...
+    Output(some_expr, "factor_name")
+
+f = Function(b.ops)
+lib = cfake.compileit([("mylib", f, KunCompilerConfig())], "out_lib", cfake.CppCompilerConfig())
+modu = lib.getModule("mylib")
+
+# Execute
+from KunQuant.runner import KunRunner as kr
+executor = kr.createMultiThreadExecutor(num_threads)
+result = kr.runGraph(executor, modu, input_dict, start_time, num_time)
+```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 25d0dbd..e574887 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_policy(SET CMP0048 NEW)
 project(KunRunner VERSION 0.0.1)
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.20)
 add_subdirectory(3rdparty/pybind11)
 include_directories(${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/cpp)
 
@@ -85,4 +85,35 @@ endif()
 
 message(STATUS "PYTHON_EXECUTABLE = ${PYTHON_EXECUTABLE}")
 
-add_custom_target(TestingTargets DEPENDS KunCApiTest KunTest KunRunner)
\ No newline at end of file
+add_custom_target(TestingTargets DEPENDS KunCApiTest KunTest KunRunner)
+
+#===------------------------------------------------------------------------===#
+# Optional MLIR backend (kun-opt + kunir/kungpu dialects)
+#===------------------------------------------------------------------------===#
+option(KUN_BUILD_MLIR "Build MLIR backend with kunir/kungpu dialects" OFF)
+if(KUN_BUILD_MLIR)
+  find_package(MLIR REQUIRED CONFIG)
+  message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
+  message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
+
+  set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/bin)
+  set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/lib)
+  set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
+
+  list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
+  list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
+
+  include(TableGen)
+  include(AddLLVM)
+  include(AddMLIR)
+  include(HandleLLVMOptions)
+
+  # MLIR requires C++17; replace the existing -std=c++11 flag so all
+  # targets in this build (including the existing runtime) compile as C++17,
+  # which is backwards-compatible with C++11 code.
+  string(REPLACE "-std=c++11" "-std=c++17" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+  add_subdirectory(mlir)
+endif()
\ No newline at end of file
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
new file mode 100644
index 0000000..5d7c7ea
--- /dev/null
+++ b/mlir/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(KUN_MLIR_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(KUN_MLIR_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# Include paths: MLIR/LLVM installed headers, our source headers,
+# and the build dir (for generated .h.inc files from tablegen)
+include_directories(${LLVM_INCLUDE_DIRS})
+include_directories(${MLIR_INCLUDE_DIRS})
+include_directories(${KUN_MLIR_SOURCE_DIR}/include)
+include_directories(${KUN_MLIR_BINARY_DIR}/include)
+
+add_definitions(${LLVM_DEFINITIONS})
+
+# Enable gc-sections so kun-opt pulls in only the code it actually uses
+# from MLIR static libraries, keeping the binary small.
+if(NOT MSVC)
+  add_compile_options(-ffunction-sections -fdata-sections)
+  if(APPLE)
+    string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-dead_strip")
+  else()
+    string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,--gc-sections")
+    string(APPEND CMAKE_SHARED_LINKER_FLAGS " -Wl,--gc-sections")
+  endif()
+endif()
+
+add_subdirectory(include)
+add_subdirectory(lib)
+add_subdirectory(Tools/kun-opt)
+add_subdirectory(test)
diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
new file mode 100644
index 0000000..c0dddc3
--- /dev/null
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -0,0 +1,29 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_llvm_executable(kun-opt kun-opt.cpp)
+
+llvm_update_compile_flags(kun-opt)
+
+target_link_libraries(kun-opt PRIVATE
+  # KunQuant dialects
+  MLIRKunIrDialect
+  MLIRKunGpuDialect
+
+  # MLIR opt infrastructure
+  MLIROptLib
+
+  # Standard dialects used inside kunir/kungpu IR
+  MLIRFuncDialect
+  MLIRArithDialect
+
+  # Core MLIR libraries
+  MLIRIR
+  MLIRParser
+  MLIRPass
+  MLIRTransforms
+  MLIRSupport
+  MLIRSideEffectInterfaces
+)
+
+# Warn if kun-opt or its link libraries mix libLLVM with LLVM component libs
+mlir_check_all_link_libraries(kun-opt)
diff --git a/mlir/Tools/kun-opt/kun-opt.cpp b/mlir/Tools/kun-opt/kun-opt.cpp
new file mode 100644
index 0000000..063fefc
--- /dev/null
+++ b/mlir/Tools/kun-opt/kun-opt.cpp
@@ -0,0 +1,25 @@
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunGpu/KunGpuOps.h"
+#include "KunIr/KunIrDialect.h"
+#include "KunIr/KunIrOps.h"
+
+int main(int argc, char **argv) {
+  mlir::DialectRegistry registry;
+
+  // Upstream dialects (func, arith) made available alongside kunir/kungpu
+  registry.insert<mlir::func::FuncDialect>();
+  registry.insert<mlir::arith::ArithDialect>();
+
+  // KunQuant dialects declared in KunIr/KunIrDialect.h and KunGpu/KunGpuDialect.h
+  registry.insert<kunir::KunIrDialect>();
+  registry.insert<kungpu::KunGpuDialect>();
+  // NOTE(review): no passes are registered; mlir/Transforms/Passes.h is unused.
+  return mlir::asMainReturnCode(
+      mlir::MlirOptMain(argc, argv, "KunQuant MLIR optimizer\n", registry));
+}
diff --git a/mlir/include/CMakeLists.txt b/mlir/include/CMakeLists.txt
new file mode 100644
index 0000000..66b4ff9
--- /dev/null
+++ b/mlir/include/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(KunIr)
+add_subdirectory(KunGpu)
diff --git a/mlir/include/KunGpu/CMakeLists.txt b/mlir/include/KunGpu/CMakeLists.txt
new file mode 100644
index 0000000..e23e6e5
--- /dev/null
+++ b/mlir/include/KunGpu/CMakeLists.txt
@@ -0,0 +1 @@
+add_mlir_dialect(KunGpuOps kungpu)
diff --git a/mlir/include/KunGpu/KunGpuDialect.h b/mlir/include/KunGpu/KunGpuDialect.h
new file mode 100644
index 0000000..390ea7b
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuDialect.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "mlir/IR/Dialect.h"
+
+// Generated by TableGen
+#include "KunGpu/KunGpuOpsDialect.h.inc"
diff --git a/mlir/include/KunGpu/KunGpuDialect.td b/mlir/include/KunGpu/KunGpuDialect.td
new file mode 100644
index 0000000..5095858
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuDialect.td
@@ -0,0 +1,19 @@
+#ifndef KUNGPU_DIALECT_TD
+#define KUNGPU_DIALECT_TD
+
+include "mlir/IR/OpBase.td"
+
+def KunGpu_Dialect : Dialect {
+  let name = "kungpu";
+  let summary = "KunQuant GPU dialect for NVGPU-targeted computation";
+  let description = [{
+    The kungpu dialect lowers kunir dataflow operations to explicit GPU
+    control flow with an NVIDIA GPU thread model. It retains !kunir.ts
+    types and introduces explicit time loops, stock-thread mapping, and
+    shared memory operations. Raw pointers appear only when lowering to
+    llvm+nvvm dialects.
+  }];
+  let cppNamespace = "::kungpu";
+}
+
+#endif // KUNGPU_DIALECT_TD
diff --git a/mlir/include/KunGpu/KunGpuOps.h b/mlir/include/KunGpu/KunGpuOps.h
new file mode 100644
index 0000000..43f4aea
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuOps.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+#include "KunGpu/KunGpuDialect.h"
+
+// Generated by TableGen
+#define GET_OP_CLASSES
+#include "KunGpu/KunGpuOps.h.inc"
diff --git a/mlir/include/KunGpu/KunGpuOps.td b/mlir/include/KunGpu/KunGpuOps.td
new file mode 100644
index 0000000..312fbaa
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuOps.td
@@ -0,0 +1,31 @@
+#ifndef KUNGPU_OPS_TD
+#define KUNGPU_OPS_TD
+
+include "KunGpu/KunGpuDialect.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/OpBase.td"
+
+class KunGpu_Op<string mnemonic, list<Trait> traits = []>
+    : Op<KunGpu_Dialect, mnemonic, traits>;
+
+//===----------------------------------------------------------------------===//
+// Thread/block indexing
+//===----------------------------------------------------------------------===//
+
+def KunGpu_StockIdOp : KunGpu_Op<"stock_id", [Pure]> {
+  let summary = "Get the stock index assigned to this GPU thread";
+  let description = [{
+    Returns the logical stock index for the current GPU thread.
+    Lowers to: blockIdx.x * blockDim.x + threadIdx.x
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+def KunGpu_BlockStockCountOp : KunGpu_Op<"block_stock_count", [Pure]> {
+  let summary = "Number of stocks handled per GPU block (blockDim.x)";
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+#endif // KUNGPU_OPS_TD
diff --git a/mlir/include/KunIr/CMakeLists.txt b/mlir/include/KunIr/CMakeLists.txt
new file mode 100644
index 0000000..dd36893
--- /dev/null
+++ b/mlir/include/KunIr/CMakeLists.txt
@@ -0,0 +1 @@
+add_mlir_dialect(KunIrOps kunir)
diff --git a/mlir/include/KunIr/KunIrDialect.h b/mlir/include/KunIr/KunIrDialect.h
new file mode 100644
index 0000000..d8e7aa5
--- /dev/null
+++ b/mlir/include/KunIr/KunIrDialect.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/Dialect.h"
+
+// Generated by TableGen
+#include "KunIr/KunIrOpsDialect.h.inc"
diff --git a/mlir/include/KunIr/KunIrDialect.td b/mlir/include/KunIr/KunIrDialect.td
new file mode 100644
index 0000000..def6242
--- /dev/null
+++ b/mlir/include/KunIr/KunIrDialect.td
@@ -0,0 +1,21 @@
+#ifndef KUNIR_DIALECT_TD
+#define KUNIR_DIALECT_TD
+
+include "mlir/IR/OpBase.td"
+
+def KunIr_Dialect : Dialect {
+  let name = "kunir";
+  let summary = "KunQuant IR dialect for dataflow financial factor computation";
+  let description = [{
+    The kunir dialect represents financial factor computations as a pure
+    dataflow graph. Operations consume and produce !kunir.ts values; there
+    is no explicit time-dimension iteration or memory access at this level.
+  }];
+  let cppNamespace = "::kunir";
+  let useDefaultTypePrinterParser = 1;
+  let extraClassDeclaration = [{
+    void registerTypes();
+  }];
+}
+
+#endif // KUNIR_DIALECT_TD
diff --git a/mlir/include/KunIr/KunIrOps.h b/mlir/include/KunIr/KunIrOps.h
new file mode 100644
index 0000000..d17203c
--- /dev/null
+++ b/mlir/include/KunIr/KunIrOps.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+#include "KunIr/KunIrDialect.h"
+#include "KunIr/KunIrTypes.h"
+
+// NativeOpTrait<"Name"> expands to ::mlir::OpTrait::Name in generated code,
+// so the trait class must live in that namespace.  We prefix it with KunIr
+// to avoid collisions with MLIR builtins.
+namespace mlir {
+namespace OpTrait {
+
+/// Trait for element-wise kunir ts ops.  Ops that carry InferTypeOpInterface
+/// and this trait require no per-op inferReturnTypes definition.
+///
+/// The shared inferReturnTypes yields !kunir.ts<firstOperand.elemType, 1>.
+/// Builders call it before verification, so bad operands yield an error.
+template <typename ConcreteType>
+class KunIrElemwiseTsResultType
+    : public TraitBase<ConcreteType, KunIrElemwiseTsResultType> {
+public:
+  static mlir::LogicalResult inferReturnTypes(
+      mlir::MLIRContext *ctx, std::optional<mlir::Location> loc,
+      mlir::ValueRange operands, mlir::DictionaryAttr, mlir::PropertyRef,
+      mlir::RegionRange, llvm::SmallVectorImpl<mlir::Type> &inferred) {
+    if (operands.empty() || !llvm::isa<::kunir::TsType>(operands[0].getType()))
+      return mlir::emitOptionalError(loc, "expected a !kunir.ts first operand");
+    auto inputTy = llvm::cast<::kunir::TsType>(operands[0].getType());
+    inferred.push_back(::kunir::TsType::get(ctx, inputTy.getElementType(), 1));
+    return mlir::success();
+  }
+};
+
+} // namespace OpTrait
+} // namespace mlir
+
+// Convenient alias in the kunir namespace.
+namespace kunir {
+namespace OpTrait {
+template <typename ConcreteType>
+using ElemwiseTsResultType =
+    ::mlir::OpTrait::KunIrElemwiseTsResultType<ConcreteType>;
+} // namespace OpTrait
+} // namespace kunir
+
+// Generated by TableGen
+#define GET_OP_CLASSES
+#include "KunIr/KunIrOps.h.inc"
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
new file mode 100644
index 0000000..540724b
--- /dev/null
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -0,0 +1,215 @@
+#ifndef KUNIR_OPS_TD
+#define KUNIR_OPS_TD
+
+include "KunIr/KunIrDialect.td"
+include "KunIr/KunIrTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/IR/OpBase.td"
+
+class KunIr_Op<string mnemonic, list<Trait> traits = []>
+    : Op<KunIr_Dialect, mnemonic, traits>;
+
+//===----------------------------------------------------------------------===//
+// Element-wise binary ops
+//
+// Inputs may have different maxLookback values but must share the same element
+// type. The result always has maxLookback = 1 (only the current value).
+//===----------------------------------------------------------------------===//
+
+class KunIr_BinaryElemwiseOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        InferTypeOpInterface,
+        NativeOpTrait<"KunIrElemwiseTsResultType">
+    ])> {
+  let arguments = (ins KunIr_AnyTs:$lhs, KunIr_AnyTs:$rhs);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$lhs `,` $rhs `:` type($lhs) `,` type($rhs) attr-dict";
+}
+
+def KunIr_AddOp : KunIr_BinaryElemwiseOp<"add"> {
+  let summary = "Element-wise addition of two time series";
+}
+def KunIr_SubOp : KunIr_BinaryElemwiseOp<"sub"> {
+  let summary = "Element-wise subtraction of two time series";
+}
+def KunIr_MulOp : KunIr_BinaryElemwiseOp<"mul"> {
+  let summary = "Element-wise multiplication of two time series";
+}
+def KunIr_DivOp : KunIr_BinaryElemwiseOp<"div"> {
+  let summary = "Element-wise division of two time series";
+}
+def KunIr_MaxOp : KunIr_BinaryElemwiseOp<"max"> {
+  let summary = "Element-wise maximum of two time series";
+}
+def KunIr_MinOp : KunIr_BinaryElemwiseOp<"min"> {
+  let summary = "Element-wise minimum of two time series";
+}
+
+//===----------------------------------------------------------------------===//
+// Element-wise unary ops
+//
+// The input may have any maxLookback. The result always has maxLookback = 1.
+//===----------------------------------------------------------------------===//
+
+class KunIr_UnaryElemwiseOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        InferTypeOpInterface,
+        NativeOpTrait<"KunIrElemwiseTsResultType">
+    ])> {
+  let arguments = (ins KunIr_AnyTs:$input);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$input `:` type($input) attr-dict";
+}
+
+def KunIr_AbsOp  : KunIr_UnaryElemwiseOp<"abs">  {
+  let summary = "Element-wise absolute value";
+}
+def KunIr_LogOp  : KunIr_UnaryElemwiseOp<"log">  {
+  let summary = "Element-wise natural logarithm";
+}
+def KunIr_SignOp : KunIr_UnaryElemwiseOp<"sign"> {
+  let summary = "Element-wise sign (-1, 0, 1)";
+}
+
+//===----------------------------------------------------------------------===//
+// Cross-sectional ops
+//
+// Operates time-step-by-time-step across all stocks. Result has maxLookback=1.
+//===----------------------------------------------------------------------===//
+
+def KunIr_CsRankOp : KunIr_Op<"cs_rank", [
+    Pure,
+    InferTypeOpInterface,
+    NativeOpTrait<"KunIrElemwiseTsResultType">
+]> {
+  let summary = "Cross-sectional rank of a time series";
+  let description = [{
+    For each time step t, computes the rank of each stock's value among
+    all stocks at time t. Output values are in the range [0, 1].
+    The input may have any maxLookback; the result always has maxLookback = 1.
+  }];
+  let arguments = (ins KunIr_AnyTs:$input);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$input `:` type($input) attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// WindowedOutput op
+//
+// Materialises a lookback window: takes any-length ts and produces a ts
+// with a specific maxLookback.  The element types must match.
+//===----------------------------------------------------------------------===//
+
+def KunIr_WindowedOutputOp : KunIr_Op<"windowed_output", [Pure]> {
+  let summary = "Store a segment of a time-series stream with fixed lookback";
+  let description = [{
+    Takes an input ts with any maxLookback and produces a ts whose
+    maxLookback equals the `length` attribute.  This op is used to
+    explicitly bound how far back subsequent windowed ops may look.
+
+    Constraints:
+      - result element type == input element type
+      - result.maxLookback == length
+  }];
+  let arguments = (ins KunIr_AnyTs:$input, I64Attr:$length);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$input `[` `length` `=` $length `]` `:` type($input) `->` type($result) attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Region terminator
+//===----------------------------------------------------------------------===//
+
+def KunIr_YieldOp : KunIr_Op<"yield", [Pure, Terminator, ReturnLike]> {
+  let summary = "Yield zero or more scalar values from a kunir region body";
+  let arguments = (ins Variadic<AnyType>:$values);
+  let assemblyFormat = "($values^ `:` type($values))? attr-dict";
+  // Zero-argument builder for ensureTerminator (empty yield).
+  let builders = [OpBuilder<(ins), [{}]>];
+}
+
+//===----------------------------------------------------------------------===//
+// ForEachBackWindow op
+//
+// Accepts N input ts values, iterates over their shared back window, and
+// produces M ts<elemType, 1> results via kunir.reduce_* ops and kunir.yield.
+//
+// Body block:
+//   - Has exactly N arguments, one per input.
+//   - Block arg i has type !kunir.ts<input_i.elemType, 1>  (the current slice).
+//   - May contain kunir.reduce_* ops that accumulate the block arguments.
+//   - Must terminate with kunir.yield returning M ts<elemType, 1> values,
+//     one per result of this op.
+//
+// Constraints:
+//   - window > 0
+//   - For each input i: input_i.maxLookback >= window (verified)
+//   - body block arg count == inputs count
+//   - results count == yield operands count
+//   - Each result has maxLookback == 1
+//===----------------------------------------------------------------------===//
+
+def KunIr_ForEachBackWindowOp : KunIr_Op<"for_each_back_window", [
+    Pure,
+    SingleBlockImplicitTerminator<"::kunir::YieldOp">
+]> {
+  let summary = "Apply reductions over a sliding back window of N time series";
+  let description = [{
+    For each time step t, iterates over the window [t-window+1 .. t] of every
+    input ts and applies the reductions described in the region body.
+    Each input ts contributes one block argument of type ts<elemType_i, 1>
+    representing the current window slice.
+    kunir.reduce_* ops accumulate those values; kunir.yield collects the
+    results which become the M outputs of this op, each of type ts<elemType, 1>.
+
+    Every input's maxLookback must be >= window (or inf).
+  }];
+  let arguments = (ins Variadic<KunIr_AnyTs>:$inputs, I64Attr:$window);
+  let results = (outs Variadic<KunIr_AnyTs>:$results);
+  let regions = (region SizedRegion<1>:$body);
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Reduction ops (must appear inside a ForEachBackWindow body)
+//
+// Each op takes a !kunir.ts<elemType, 1> value (the current window slice
+// from the enclosing ForEachBackWindow block argument) and returns the
+// accumulated !kunir.ts<elemType, 1> result after the window is traversed.
+// SameOperandsAndResultType enforces that input and result types match.
+//===----------------------------------------------------------------------===//
+
+class KunIr_ReduceOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [Pure, SameOperandsAndResultType])> {
+  let arguments = (ins KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$value `:` type($value) attr-dict";
+}
+
+def KunIr_ReduceAddOp : KunIr_ReduceOp<"reduce_add"> {
+  let summary = "Sum reduction over the back window";
+}
+def KunIr_ReduceMulOp : KunIr_ReduceOp<"reduce_mul"> {
+  let summary = "Product reduction over the back window";
+}
+def KunIr_ReduceMaxOp : KunIr_ReduceOp<"reduce_max"> {
+  let summary = "Maximum reduction over the back window";
+}
+def KunIr_ReduceMinOp : KunIr_ReduceOp<"reduce_min"> {
+  let summary = "Minimum reduction over the back window";
+}
+
+#endif // KUNIR_OPS_TD
diff --git a/mlir/include/KunIr/KunIrTypes.h b/mlir/include/KunIr/KunIrTypes.h
new file mode 100644
index 0000000..eaa424a
--- /dev/null
+++ b/mlir/include/KunIr/KunIrTypes.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
+
+// Emit class declarations (storage struct is only forward-declared here;
+// the full definition lives in KunIrTypes.cpp via KunIrOpsTypes.cpp.inc).
+#define GET_TYPEDEF_CLASSES
+#include "KunIr/KunIrOpsTypes.h.inc"
diff --git a/mlir/include/KunIr/KunIrTypes.td b/mlir/include/KunIr/KunIrTypes.td
new file mode 100644
index 0000000..b885bb3
--- /dev/null
+++ b/mlir/include/KunIr/KunIrTypes.td
@@ -0,0 +1,35 @@
+#ifndef KUNIR_TYPES_TD
+#define KUNIR_TYPES_TD
+
+include "KunIr/KunIrDialect.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+
+class KunIr_Type<string name, string typeMnemonic, list<Trait> traits = []>
+    : TypeDef<KunIr_Dialect, name, traits> {
+  let mnemonic = typeMnemonic;
+}
+
+def KunIr_TsType : KunIr_Type<"Ts", "ts"> {
+  let summary = "Time series type for financial data";
+  let description = [{
+    Represents a time series of financial data across all stocks.
+    Logically a [num_stocks x num_time] array; both dimensions are
+    determined at runtime. The ts type is a pure value flowing between
+    kunir ops — it carries no load/store semantics at this level.
+
+    The maxLookback parameter (uint64) controls how many past time steps
+    this stream retains:
+      1        — only the current value (no history)
+      N        — up to N past values available for windowed ops
+      UINT64_MAX — unlimited history (printed/parsed as "inf"), used for
+                   function input parameters
+  }];
+  let parameters = (ins "::mlir::Type":$elementType, "uint64_t":$maxLookback);
+  let hasCustomAssemblyFormat = 1;
+}
+
+// Type constraint for use in op definitions
+def KunIr_AnyTs : Type<CPred<"::llvm::isa<::kunir::TsType>($_self)">,
+                        "kunir time series type", "::kunir::TsType">;
+
+#endif // KUNIR_TYPES_TD
diff --git a/mlir/lib/CMakeLists.txt b/mlir/lib/CMakeLists.txt
new file mode 100644
index 0000000..66b4ff9
--- /dev/null
+++ b/mlir/lib/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(KunIr)
+add_subdirectory(KunGpu)
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
new file mode 100644
index 0000000..f0831c5
--- /dev/null
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_mlir_dialect_library(MLIRKunGpuDialect
+  KunGpuDialect.cpp
+  KunGpuOps.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${PROJECT_SOURCE_DIR}/mlir/include
+
+  DEPENDS
+  MLIRKunGpuOpsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRSideEffectInterfaces
+  MLIRKunIrDialect
+)
diff --git a/mlir/lib/KunGpu/KunGpuDialect.cpp b/mlir/lib/KunGpu/KunGpuDialect.cpp
new file mode 100644
index 0000000..9b2f94d
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuDialect.cpp
@@ -0,0 +1,19 @@
+#include "KunGpu/KunGpuDialect.h"
+#include "KunGpu/KunGpuOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+
+using namespace mlir;
+using namespace kungpu;
+
+//===----------------------------------------------------------------------===//
+// KunGpu dialect
+//===----------------------------------------------------------------------===//
+
+#include "KunGpu/KunGpuOpsDialect.cpp.inc"
+
+void KunGpuDialect::initialize() {
+  addOperations<
+#define GET_OP_LIST
+#include "KunGpu/KunGpuOps.cpp.inc"
+  >();
+}
diff --git a/mlir/lib/KunGpu/KunGpuOps.cpp b/mlir/lib/KunGpu/KunGpuOps.cpp
new file mode 100644
index 0000000..75f1bb2
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuOps.cpp
@@ -0,0 +1,11 @@
+#include "KunGpu/KunGpuOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpImplementation.h"
+
+using namespace mlir;
+using namespace kungpu;
+
+// Emits op class method implementations (verifyInvariantsImpl, print, parse, etc.)
+#define GET_OP_CLASSES
+#include "KunGpu/KunGpuOps.cpp.inc"
diff --git a/mlir/lib/KunIr/CMakeLists.txt b/mlir/lib/KunIr/CMakeLists.txt
new file mode 100644
index 0000000..31b5813
--- /dev/null
+++ b/mlir/lib/KunIr/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_mlir_dialect_library(MLIRKunIrDialect
+  KunIrDialect.cpp
+  KunIrTypes.cpp
+  KunIrOps.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${PROJECT_SOURCE_DIR}/mlir/include
+
+  DEPENDS
+  MLIRKunIrOpsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRSideEffectInterfaces
+)
diff --git a/mlir/lib/KunIr/KunIrDialect.cpp b/mlir/lib/KunIr/KunIrDialect.cpp
new file mode 100644
index 0000000..9eb1a62
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrDialect.cpp
@@ -0,0 +1,20 @@
+#include "KunIr/KunIrDialect.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+using namespace mlir;
+using namespace kunir;
+
+//===----------------------------------------------------------------------===//
+// KunIr dialect
+//===----------------------------------------------------------------------===//
+
+#include "KunIr/KunIrOpsDialect.cpp.inc"
+
+void KunIrDialect::initialize() {
+  addOperations<
+#define GET_OP_LIST
+#include "KunIr/KunIrOps.cpp.inc"
+  >();
+  registerTypes();
+}
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
new file mode 100644
index 0000000..fa58ddb
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -0,0 +1,289 @@
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpImplementation.h"
+#include <limits>
+
+using namespace mlir;
+using namespace kunir;
+
+static constexpr uint64_t kInfLookback = std::numeric_limits<uint64_t>::max();
+
+//===----------------------------------------------------------------------===//
+// Generated op definitions
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "KunIr/KunIrOps.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// YieldOp — manual zero-arg build (declared by OpBuilder<(ins), [{}]>)
+//===----------------------------------------------------------------------===//
+
+void kunir::YieldOp::build(mlir::OpBuilder &, mlir::OperationState &) {
+  // Empty build: produces a zero-operand yield for ensureTerminator.
+}
+
+//===----------------------------------------------------------------------===//
+// Binary elemwise ops — verify only (inferReturnTypes is in ElemwiseTsResultType)
+//===----------------------------------------------------------------------===//
+
+// Shared verifier: both inputs must share the same element type.
+static LogicalResult verifyBinaryElemwise(Operation *op,
+                                          Value lhs, Value rhs) {
+  // Lookbacks may differ between operands; only the element types must agree.
+  Type lhsElem = llvm::cast<TsType>(lhs.getType()).getElementType();
+  Type rhsElem = llvm::cast<TsType>(rhs.getType()).getElementType();
+  if (lhsElem == rhsElem)
+    return success();
+  return op->emitOpError("lhs element type '")
+         << lhsElem << "' must match rhs element type '" << rhsElem << "'";
+}
+
+LogicalResult AddOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult SubOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult MulOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult DivOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult MaxOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult MinOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+
+//===----------------------------------------------------------------------===//
+// Unary elemwise ops + CsRankOp — verify only
+//===----------------------------------------------------------------------===//
+
+LogicalResult AbsOp::verify()    { return success(); }
+LogicalResult LogOp::verify()    { return success(); }
+LogicalResult SignOp::verify()   { return success(); }
+LogicalResult CsRankOp::verify() { return success(); }
+
+//===----------------------------------------------------------------------===//
+// WindowedOutputOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult WindowedOutputOp::verify() {
+  auto inputTy  = llvm::cast<TsType>(getInput().getType());
+  auto resultTy = llvm::cast<TsType>(getResult().getType());
+  // The element type must carry over unchanged from input to result.
+  if (inputTy.getElementType() != resultTy.getElementType())
+    return emitOpError("result element type '")
+           << resultTy.getElementType()
+           << "' must match input element type '"
+           << inputTy.getElementType() << "'";
+
+  int64_t len = getLength();
+  if (len <= 0)
+    return emitOpError("length must be positive, got ") << len;
+  // len > 0 from here on, so the unsigned cast below cannot wrap.
+  if (resultTy.getMaxLookback() != static_cast<uint64_t>(len))
+    return emitOpError("result maxLookback (")
+           << resultTy.getMaxLookback()
+           << ") must equal length attribute (" << len << ")";
+  // NOTE(review): the input's maxLookback is never compared with length.
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Reduce ops — verify they are inside a ForEachBackWindow body
+//
+// Uses Operation* directly (no template); SameOperandsAndResultType already
+// enforces input == result type, so only the parent check is needed.
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verifyInsideForEachBackWindow(Operation *op) {
+  if (llvm::isa_and_nonnull<ForEachBackWindowOp>(op->getParentOp()))
+    return success(); // the immediate parent is a for_each_back_window
+  return op->emitOpError(
+      "must be directly inside a 'kunir.for_each_back_window' region");
+}
+
+LogicalResult ReduceAddOp::verify() { return verifyInsideForEachBackWindow(*this); }
+LogicalResult ReduceMulOp::verify() { return verifyInsideForEachBackWindow(*this); }
+LogicalResult ReduceMaxOp::verify() { return verifyInsideForEachBackWindow(*this); }
+LogicalResult ReduceMinOp::verify() { return verifyInsideForEachBackWindow(*this); }
+
+//===----------------------------------------------------------------------===//
+// ForEachBackWindowOp — verifier + custom assembly format
+//
+// Format:
+//   %r = kunir.for_each_back_window
+//       (%in0 : !kunir.ts<f32, 10>, %in1 : !kunir.ts<f32, 10>)
+//       [window = 5]
+//       (%cur0 : !kunir.ts<f32, 1>, %cur1 : !kunir.ts<f32, 1>)
+//       -> (!kunir.ts<f32, 1>) {
+//     %s = kunir.reduce_add %cur0 : !kunir.ts<f32, 1>
+//     kunir.yield %s : !kunir.ts<f32, 1>
+//   }
+//===----------------------------------------------------------------------===//
+
+LogicalResult ForEachBackWindowOp::verify() {
+  int64_t win = getWindow();
+  if (win <= 0)
+    return emitOpError("window must be positive, got ") << win;
+
+  auto inputs = getInputs();
+  Block &bodyBlock = getBody().front(); // assumes a non-empty body — confirm
+
+  // Each input's maxLookback must be >= window; an `inf` lookback always passes.
+  for (auto [idx, input] : llvm::enumerate(inputs)) {
+    auto inputTy = llvm::cast<TsType>(input.getType());
+    uint64_t lookback = inputTy.getMaxLookback();
+    if (lookback != kInfLookback && lookback < static_cast<uint64_t>(win))
+      return emitOpError("input #")
+             << idx << " maxLookback (" << lookback
+             << ") must be >= window (" << win << ")";
+  }
+
+  // Block must have exactly one arg per input, typed ts<elemType_i, 1>.
+  if (bodyBlock.getNumArguments() != inputs.size())
+    return emitOpError("body block has ")
+           << bodyBlock.getNumArguments()
+           << " argument(s) but op has " << inputs.size() << " input(s)";
+
+  for (auto [idx, input] : llvm::enumerate(inputs)) {
+    auto inputTy = llvm::cast<TsType>(input.getType());
+    Type expectedArgTy =
+        TsType::get(getContext(), inputTy.getElementType(), 1);
+    Type actualArgTy = bodyBlock.getArgument(idx).getType();
+    if (actualArgTy != expectedArgTy)
+      return emitOpError("body block argument #")
+             << idx << " must have type '" << expectedArgTy
+             << "', got '" << actualArgTy << "'";
+  }
+
+  // Body must terminate with YieldOp.
+  auto yieldOp = llvm::dyn_cast<YieldOp>(bodyBlock.getTerminator());
+  if (!yieldOp)
+    return emitOpError("body must terminate with 'kunir.yield'");
+
+  // results count == yield operands count (makes getResult(idx) below safe).
+  unsigned numResults = getNumResults();
+  if (yieldOp.getValues().size() != numResults)
+    return emitOpError("yield operands count (")
+           << yieldOp.getValues().size()
+           << ") must match op results count (" << numResults << ")";
+
+  // Every result and yield operand must be ts<elemType, 1>.
+  for (auto [idx, res] : llvm::enumerate(getResults())) {
+    auto resTy = llvm::dyn_cast<TsType>(res.getType());
+    if (!resTy)
+      return emitOpError("result #") << idx << " must be a kunir ts type";
+    if (resTy.getMaxLookback() != 1)
+      return emitOpError("result #") << idx << " maxLookback must be 1, got "
+             << resTy.getMaxLookback();
+  }
+
+  for (auto [idx, val] : llvm::enumerate(yieldOp.getValues())) {
+    auto valTy = llvm::dyn_cast<TsType>(val.getType());
+    if (!valTy)
+      return emitOpError("yield operand #") << idx << " must be a kunir ts type";
+    if (valTy.getMaxLookback() != 1)
+      return emitOpError("yield operand #") << idx
+             << " maxLookback must be 1, got " << valTy.getMaxLookback();
+    if (val.getType() != getResult(idx).getType()) // positional type match
+      return emitOpError("yield operand #")
+             << idx << " type '" << val.getType()
+             << "' must match result type '" << getResult(idx).getType() << "'";
+  }
+
+  return success();
+}
+
+ParseResult ForEachBackWindowOp::parse(OpAsmParser &parser,
+                                       OperationState &result) {
+  Builder &builder = parser.getBuilder();
+
+  // (%in0 : type0, %in1 : type1, ...) — the list may be empty: ()
+  SmallVector<OpAsmParser::UnresolvedOperand> inputOperands;
+  SmallVector<Type> inputTypes;
+  if (parser.parseLParen())
+    return failure();
+  if (parser.parseOptionalRParen().failed()) {
+    do {
+      OpAsmParser::UnresolvedOperand operand;
+      Type type;
+      if (parser.parseOperand(operand) || parser.parseColonType(type))
+        return failure();
+      inputOperands.push_back(operand);
+      inputTypes.push_back(type);
+    } while (parser.parseOptionalComma().succeeded());
+    if (parser.parseRParen())
+      return failure();
+  }
+  if (parser.resolveOperands(inputOperands, inputTypes,
+                             parser.getCurrentLocation(), result.operands))
+    return failure();
+
+  // [window = <integer>]
+  int64_t window;
+  if (parser.parseLSquare() || parser.parseKeyword("window") ||
+      parser.parseEqual() || parser.parseInteger(window) ||
+      parser.parseRSquare())
+    return failure();
+  result.addAttribute("window", builder.getI64IntegerAttr(window));
+
+  // (%cur0 : ts0, %cur1 : ts1, ...) — become the body's entry block arguments
+  SmallVector<OpAsmParser::Argument> blockArgs;
+  if (parser.parseArgumentList(blockArgs, OpAsmParser::Delimiter::Paren,
+                               /*allowType=*/true, /*allowAttrs=*/false))
+    return failure();
+
+  // -> (types), -> () or -> type; `()` is what print() emits for no results.
+  SmallVector<Type> resultTypes;
+  if (parser.parseArrow())
+    return failure();
+  if (parser.parseOptionalLParen().succeeded()) {
+    if (parser.parseOptionalRParen().failed() &&
+        (parser.parseTypeList(resultTypes) || parser.parseRParen()))
+      return failure();
+  } else {
+    Type singleTy;
+    if (parser.parseType(singleTy))
+      return failure();
+    resultTypes.push_back(singleTy);
+  }
+  result.addTypes(resultTypes);
+
+  // { body } — a missing terminator is filled in with a zero-operand yield
+  Region *body = result.addRegion();
+  if (parser.parseRegion(*body, blockArgs))
+    return failure();
+  ForEachBackWindowOp::ensureTerminator(*body, builder, result.location);
+  return success();
+}
+
+void ForEachBackWindowOp::print(OpAsmPrinter &printer) {
+  Block &bodyBlock = getBody().front();
+
+  // (%in0 : type0, %in1 : type1, ...)
+  printer << " (";
+  llvm::interleaveComma(getInputs(), printer, [&](Value input) {
+    printer << input << " : " << input.getType();
+  });
+  printer << ")";
+
+  printer << " [window = " << getWindow() << "]"; // no attr-dict is printed
+
+  // (%cur0 : ts0, %cur1 : ts1, ...)
+  printer << " (";
+  llvm::interleaveComma(bodyBlock.getArguments(), printer,
+                        [&](BlockArgument arg) {
+                          printer.printRegionArgument(arg);
+                        });
+  printer << ")";
+
+  // -> type for exactly one result; otherwise -> (types), including -> ()
+  auto resultTypes = getResultTypes();
+  if (resultTypes.size() == 1) {
+    printer << " -> " << resultTypes[0];
+  } else {
+    printer << " -> (";
+    llvm::interleaveComma(resultTypes, printer);
+    printer << ")";
+  }
+
+  // Body (block args already printed above).
+  printer << " ";
+  printer.printRegion(getBody(), /*printEntryBlockArgs=*/false,
+                      /*printBlockTerminators=*/true);
+}
diff --git a/mlir/lib/KunIr/KunIrTypes.cpp b/mlir/lib/KunIr/KunIrTypes.cpp
new file mode 100644
index 0000000..d9f52e9
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrTypes.cpp
@@ -0,0 +1,52 @@
+#include "KunIr/KunIrTypes.h"
+#include "KunIr/KunIrDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include <limits>
+
+using namespace mlir;
+using namespace kunir;
+
+// Emits full TsTypeStorage definition + TypeBase method implementations.
+#define GET_TYPEDEF_CLASSES
+#include "KunIr/KunIrOpsTypes.cpp.inc"
+
+static constexpr uint64_t kInfLookback = std::numeric_limits<uint64_t>::max();
+
+// Custom assembly format: !kunir.ts<elemType, N>  or  !kunir.ts<elemType, inf>
+mlir::Type TsType::parse(mlir::AsmParser &parser) {
+  // Grammar: `<` element-type `,` (`inf` | integer) `>`.
+  // A null Type is returned on any parse failure.
+  mlir::Type elemType;
+  if (parser.parseLess() || parser.parseType(elemType) || parser.parseComma())
+    return {};
+
+  // The `inf` keyword maps to the kInfLookback sentinel; anything else must
+  // be an integer literal.
+  uint64_t maxLookback = kInfLookback;
+  if (parser.parseOptionalKeyword("inf").failed() &&
+      parser.parseInteger(maxLookback))
+    return {};
+
+  if (parser.parseGreater())
+    return {};
+
+  return TsType::get(parser.getContext(), elemType, maxLookback);
+}
+
+void TsType::print(mlir::AsmPrinter &printer) const {
+  printer << "<" << getElementType() << ", ";
+  if (const uint64_t lookback = getMaxLookback(); lookback != kInfLookback)
+    printer << lookback;
+  else
+    printer << "inf"; // the kInfLookback sentinel is spelled `inf`
+  printer << ">";
+}
+
+void KunIrDialect::registerTypes() {
+  addTypes<
+#define GET_TYPEDEF_LIST
+#include "KunIr/KunIrOpsTypes.cpp.inc"
+  >();
+}
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
new file mode 100644
index 0000000..212d030
--- /dev/null
+++ b/mlir/test/CMakeLists.txt
@@ -0,0 +1,30 @@
+# kun-opt is placed in LLVM_RUNTIME_OUTPUT_INTDIR (= ${CMAKE_BINARY_DIR}/bin)
+set(KUN_OPT_BINARY "${LLVM_RUNTIME_OUTPUT_INTDIR}/kun-opt")
+
+# MLIR_TOOLS_DIR may not be set when building out-of-tree; fall back to
+# the LLVM tools directory which contains FileCheck, mlir-opt, etc.
+if(NOT MLIR_TOOLS_DIR)
+  set(MLIR_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}")
+endif()
+
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  MAIN_CONFIG
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+)
+
+set(KUN_MLIR_TEST_DEPENDS
+  kun-opt
+  FileCheck
+)
+
+add_lit_testsuite(check-kun-mlir "Running KunQuant MLIR regression tests"
+  ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${KUN_MLIR_TEST_DEPENDS}
+)
+set_target_properties(check-kun-mlir PROPERTIES FOLDER "Tests")
+
+add_lit_testsuites(KUN_MLIR ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS ${KUN_MLIR_TEST_DEPENDS}
+)
diff --git a/mlir/test/kungpu/basic.mlir b/mlir/test/kungpu/basic.mlir
new file mode 100644
index 0000000..4e3a15d
--- /dev/null
+++ b/mlir/test/kungpu/basic.mlir
@@ -0,0 +1,16 @@
+// RUN: %kun-opt %s | %FileCheck %s
+// RUN: %kun-opt %s | %kun-opt | %FileCheck %s
+
+// CHECK-LABEL: func.func @test_stock_id
+func.func @test_stock_id() -> index {
+  // CHECK: kungpu.stock_id
+  %id = kungpu.stock_id
+  return %id : index
+}
+
+// CHECK-LABEL: func.func @test_block_stock_count
+func.func @test_block_stock_count() -> index {
+  // CHECK: kungpu.block_stock_count
+  %n = kungpu.block_stock_count
+  return %n : index
+}
diff --git a/mlir/test/kunir/basic.mlir b/mlir/test/kunir/basic.mlir
new file mode 100644
index 0000000..0c4d336
--- /dev/null
+++ b/mlir/test/kunir/basic.mlir
@@ -0,0 +1,131 @@
+// RUN: %kun-opt %s | %FileCheck %s
+// RUN: %kun-opt %s | %kun-opt | %FileCheck %s
+
+// Verify the kunir dialect type and ops parse and round-trip.
+
+// CHECK-LABEL: func.func @test_ts_lookback_type
+func.func @test_ts_lookback_type(
+    // CHECK-SAME: !kunir.ts<f32, inf>
+    %a: !kunir.ts<f32, inf>,
+    // CHECK-SAME: !kunir.ts<f32, 1>
+    %b: !kunir.ts<f32, 1>,
+    // CHECK-SAME: !kunir.ts<f64, 10>
+    %c: !kunir.ts<f64, 10>
+) -> !kunir.ts<f32, 1> {
+  return %b : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_binary_mismatched_lookbacks
+func.func @test_binary_mismatched_lookbacks(
+    %a: !kunir.ts<f32, 5>,
+    %b: !kunir.ts<f32, 10>
+) -> !kunir.ts<f32, 1> {
+  // CHECK: kunir.add
+  // CHECK-SAME: <f32, 5>, <f32, 10>
+  %sum = kunir.add %a, %b : !kunir.ts<f32, 5>, !kunir.ts<f32, 10>
+  // CHECK: kunir.sub
+  %diff = kunir.sub %a, %b : !kunir.ts<f32, 5>, !kunir.ts<f32, 10>
+  // CHECK: kunir.mul
+  %prod = kunir.mul %sum, %diff : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  return %prod : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_unary
+func.func @test_unary(%x: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+  // CHECK: kunir.abs
+  %a = kunir.abs %x : !kunir.ts<f32, inf>
+  // CHECK: kunir.sign
+  %s = kunir.sign %a : !kunir.ts<f32, 1>
+  return %s : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_windowed_output
+func.func @test_windowed_output(%input: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 10> {
+  // CHECK: kunir.windowed_output
+  // CHECK-SAME: length = 10
+  %out = kunir.windowed_output %input [length = 10] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 10>
+  return %out : !kunir.ts<f32, 10>
+}
+
+// CHECK-LABEL: func.func @test_cs_rank
+func.func @test_cs_rank(%input: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+  // CHECK: kunir.cs_rank
+  %ranked = kunir.cs_rank %input : !kunir.ts<f32, inf>
+  return %ranked : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_for_each_back_window_single
+// Single input, single result.
+func.func @test_for_each_back_window_single(%close: !kunir.ts<f32, 10>)
+    -> !kunir.ts<f32, 1> {
+  // CHECK: kunir.for_each_back_window
+  // CHECK-SAME: [window = 5]
+  %ts_sum = kunir.for_each_back_window
+      (%close : !kunir.ts<f32, 10>) [window = 5]
+      (%close_cur : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>) {
+    // CHECK: kunir.reduce_add
+    %s = kunir.reduce_add %close_cur : !kunir.ts<f32, 1>
+    kunir.yield %s : !kunir.ts<f32, 1>
+  }
+  return %ts_sum : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_for_each_back_window_multi_input
+// Two inputs, two results (one reduce per input).
+func.func @test_for_each_back_window_multi_input(
+    %close: !kunir.ts<f32, 20>,
+    %vol:   !kunir.ts<f32, 20>)
+    -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+  // CHECK: kunir.for_each_back_window
+  %sum_c, %sum_v = kunir.for_each_back_window
+      (%close : !kunir.ts<f32, 20>, %vol : !kunir.ts<f32, 20>) [window = 10]
+      (%cc : !kunir.ts<f32, 1>, %vc : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+    // CHECK: kunir.reduce_add
+    %sc = kunir.reduce_add %cc : !kunir.ts<f32, 1>
+    // CHECK: kunir.reduce_add
+    %sv = kunir.reduce_add %vc : !kunir.ts<f32, 1>
+    kunir.yield %sc, %sv : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  }
+  return %sum_c, %sum_v : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_for_each_back_window_multi_reduce
+// Single input, multiple reductions → multiple results.
+func.func @test_for_each_back_window_multi_reduce(%input: !kunir.ts<f32, 20>)
+    -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+  %sum_ts, %max_ts = kunir.for_each_back_window
+      (%input : !kunir.ts<f32, 20>) [window = 10]
+      (%val : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+    // CHECK: kunir.reduce_add
+    %s = kunir.reduce_add %val : !kunir.ts<f32, 1>
+    // CHECK: kunir.reduce_max
+    %m = kunir.reduce_max %val : !kunir.ts<f32, 1>
+    kunir.yield %s, %m : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  }
+  return %sum_ts, %max_ts : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_for_each_back_window_inf
+// inf lookback satisfies any window size.
+func.func @test_for_each_back_window_inf(%input: !kunir.ts<f64, inf>)
+    -> !kunir.ts<f64, 1> {
+  %result = kunir.for_each_back_window
+      (%input : !kunir.ts<f64, inf>) [window = 100]
+      (%val : !kunir.ts<f64, 1>)
+      -> (!kunir.ts<f64, 1>) {
+    %s = kunir.reduce_add %val : !kunir.ts<f64, 1>
+    kunir.yield %s : !kunir.ts<f64, 1>
+  }
+  return %result : !kunir.ts<f64, 1>
+}
+
+// CHECK-LABEL: func.func @test_f64_binary
+func.func @test_f64_binary(%a: !kunir.ts<f64, inf>, %b: !kunir.ts<f64, inf>)
+    -> !kunir.ts<f64, 1> {
+  // CHECK: !kunir.ts<f64
+  %result = kunir.max %a, %b : !kunir.ts<f64, inf>, !kunir.ts<f64, inf>
+  return %result : !kunir.ts<f64, 1>
+}
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
new file mode 100644
index 0000000..df3c65b
--- /dev/null
+++ b/mlir/test/lit.cfg.py
@@ -0,0 +1,18 @@
+import os
+import lit.formats
+
+config.name = "KunQuant MLIR Tests"
+config.test_format = lit.formats.ShTest(True)
+config.suffixes = [".mlir"]
+
+config.test_source_root = os.path.dirname(__file__)
+config.test_exec_root = config.obj_root
+
+# Tool substitutions
+config.substitutions.append(("%kun-opt", config.kun_opt))
+config.substitutions.append(
+    ("%FileCheck", os.path.join(config.llvm_tools_dir, "FileCheck"))
+)
+
+# Exclude non-test files from discovery
+config.excludes = ["CMakeLists.txt", "lit.cfg.py", "lit.site.cfg.py.in"]
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
new file mode 100644
index 0000000..af0d5be
--- /dev/null
+++ b/mlir/test/lit.site.cfg.py.in
@@ -0,0 +1,10 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import sys
+
+config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_BINARY_DIR@")
+config.mlir_tools_dir = lit_config.substitute("@MLIR_TOOLS_DIR@")
+config.kun_opt = lit_config.substitute("@KUN_OPT_BINARY@")
+config.obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
+
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")

From 56ad94890983756d5eacd29757a72c0ae52d9e0a Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 6 May 2026 00:08:44 -0700
Subject: [PATCH 02/59] kunir -> kungpu

---
 mlir/Tools/kun-opt/CMakeLists.txt     |   5 +-
 mlir/Tools/kun-opt/kun-opt.cpp        |   8 +
 mlir/include/KunGpu/KunGpuOps.h       |   1 +
 mlir/include/KunGpu/KunGpuOps.td      |  65 +++++
 mlir/include/KunIr/CMakeLists.txt     |   1 +
 mlir/include/KunIr/KunIrInterfaces.h  |   9 +
 mlir/include/KunIr/KunIrInterfaces.td |  70 +++++
 mlir/include/KunIr/KunIrOps.h         |   1 +
 mlir/include/KunIr/KunIrOps.td        |  13 +-
 mlir/include/KunIr/Passes.h           |   5 +
 mlir/lib/KunGpu/CMakeLists.txt        |   1 +
 mlir/lib/KunGpu/KunGpuOps.cpp         |  43 ++++
 mlir/lib/KunIr/CMakeLists.txt         |  28 ++
 mlir/lib/KunIr/KunIrInterfaces.cpp    |   6 +
 mlir/lib/KunIr/KunIrOps.cpp           |  82 ++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp      | 355 ++++++++++++++++++++++++++
 mlir/test/kungpu/basic.mlir           |  28 ++
 mlir/test/kunir/lower_to_kungpu.mlir  | 106 ++++++++
 18 files changed, 823 insertions(+), 4 deletions(-)
 create mode 100644 mlir/include/KunIr/KunIrInterfaces.h
 create mode 100644 mlir/include/KunIr/KunIrInterfaces.td
 create mode 100644 mlir/include/KunIr/Passes.h
 create mode 100644 mlir/lib/KunIr/KunIrInterfaces.cpp
 create mode 100644 mlir/lib/KunIr/KunIrToKunGpu.cpp
 create mode 100644 mlir/test/kunir/lower_to_kungpu.mlir

diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
index c0dddc3..ed343b5 100644
--- a/mlir/Tools/kun-opt/CMakeLists.txt
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -5,9 +5,10 @@ add_llvm_executable(kun-opt kun-opt.cpp)
 llvm_update_compile_flags(kun-opt)
 
 target_link_libraries(kun-opt PRIVATE
-  # KunQuant dialects
+  # KunQuant dialects + passes
   MLIRKunIrDialect
   MLIRKunGpuDialect
+  MLIRKunIrToKunGpu
 
   # MLIR opt infrastructure
   MLIROptLib
@@ -15,6 +16,8 @@ target_link_libraries(kun-opt PRIVATE
   # Standard dialects used inside kunir/kungpu IR
   MLIRFuncDialect
   MLIRArithDialect
+  MLIRMathDialect
+  MLIRSCFDialect
 
   # Core MLIR libraries
   MLIRIR
diff --git a/mlir/Tools/kun-opt/kun-opt.cpp b/mlir/Tools/kun-opt/kun-opt.cpp
index 063fefc..52975bf 100644
--- a/mlir/Tools/kun-opt/kun-opt.cpp
+++ b/mlir/Tools/kun-opt/kun-opt.cpp
@@ -1,5 +1,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/DialectRegistry.h"
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "mlir/Transforms/Passes.h"
@@ -8,6 +10,7 @@
 #include "KunGpu/KunGpuOps.h"
 #include "KunIr/KunIrDialect.h"
 #include "KunIr/KunIrOps.h"
+#include "KunIr/Passes.h"
 
 int main(int argc, char **argv) {
   mlir::DialectRegistry registry;
@@ -15,11 +18,16 @@ int main(int argc, char **argv) {
   // Core dialects used by kunir/kungpu
   registry.insert<mlir::func::FuncDialect>();
   registry.insert<mlir::arith::ArithDialect>();
+  registry.insert<mlir::math::MathDialect>();
+  registry.insert<mlir::scf::SCFDialect>();
 
   // KunQuant dialects
   registry.insert<kunir::KunIrDialect>();
   registry.insert<kungpu::KunGpuDialect>();
 
+  // KunQuant passes
+  kunir::registerKunIrToKunGpuPass();
+
   return mlir::asMainReturnCode(
       mlir::MlirOptMain(argc, argv, "KunQuant MLIR optimizer\n", registry));
 }
diff --git a/mlir/include/KunGpu/KunGpuOps.h b/mlir/include/KunGpu/KunGpuOps.h
index 43f4aea..9d4893d 100644
--- a/mlir/include/KunGpu/KunGpuOps.h
+++ b/mlir/include/KunGpu/KunGpuOps.h
@@ -6,6 +6,7 @@
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
 #include "KunGpu/KunGpuDialect.h"
+#include "KunIr/KunIrTypes.h"
 
 // Generated by TableGen
 #define GET_OP_CLASSES
diff --git a/mlir/include/KunGpu/KunGpuOps.td b/mlir/include/KunGpu/KunGpuOps.td
index 312fbaa..82fb2e6 100644
--- a/mlir/include/KunGpu/KunGpuOps.td
+++ b/mlir/include/KunGpu/KunGpuOps.td
@@ -2,6 +2,7 @@
 #define KUNGPU_OPS_TD
 
 include "KunGpu/KunGpuDialect.td"
+include "KunIr/KunIrTypes.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
 
@@ -28,4 +29,68 @@ def KunGpu_BlockStockCountOp : KunGpu_Op<"block_stock_count", [Pure]> {
   let assemblyFormat = "attr-dict";
 }
 
+def KunGpu_TimeLengthOp : KunGpu_Op<"time_length", [Pure]> {
+  let summary = "Number of time steps this GPU kernel must process";
+  let description = [{
+    Returns the length of the time dimension the current kernel invocation
+    is responsible for. Each thread iterates over [0, time_length).
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Time-series memory ops
+//
+// kungpu.windowed_temp allocates a per-thread circular (ring) buffer.
+// kungpu.ts.get / kungpu.ts.put provide the bridge between the abstract
+// !kunir.ts<elemType, N> handle and actual scalar element loads/stores.
+// The `ts` operand of get/put must be a function argument or a windowed_temp.
+// The `time` operand is an index into the time dimension; the `result`
+// (or `value` for put) must match the ts element type.
+//===----------------------------------------------------------------------===//
+
+def KunGpu_WindowedTempOp : KunGpu_Op<"windowed_temp"> {
+  let summary = "Allocate a per-thread windowed (circular) temporary buffer";
+  let description = [{
+    Allocates a thread-local circular buffer whose element type and window
+    length are encoded in the result type `!kunir.ts<elemType, N>`. It backs
+    a windowed reduction and must be the `ts` operand of `ts.get`/`ts.put`.
+    Each instance is a distinct allocation, so it is MemAlloc rather than Pure.
+  }];
+  let results = (outs Res<KunIr_AnyTs, "", [MemAlloc]>:$result);
+  let assemblyFormat = "`:` type($result) attr-dict";
+}
+
+def KunGpu_TsGetOp : KunGpu_Op<"ts.get"> { // not Pure: must observe ts.put
+  let summary = "Load a scalar element from a time series at a given time index";
+  let description = [{
+    Loads the per-stock element from time series `ts` at time index `time`.
+    The result type must equal the element type of `ts`.
+
+    Example:
+      %v = kungpu.ts.get %close[%t] : !kunir.ts<f32, inf> -> f32
+  }];
+  let arguments = (ins Arg<KunIr_AnyTs, "", [MemRead]>:$ts, Index:$time);
+  let results = (outs AnyFloat:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$ts `[` $time `]` `:` type($ts) `->` type($result) attr-dict";
+}
+
+def KunGpu_TsPutOp : KunGpu_Op<"ts.put"> {
+  let summary = "Store a scalar value into a time series at a given time index";
+  let description = [{
+    Stores scalar `value` into time series `ts` at time index `time`.
+    `value` must have the same type as the element type of `ts`.
+
+    Example:
+      kungpu.ts.put %out[%t], %v : !kunir.ts<f32, 1>, f32
+  }];
+  let arguments = (ins KunIr_AnyTs:$ts, Index:$time, AnyFloat:$value);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$ts `[` $time `]` `,` $value `:` type($ts) `,` type($value) attr-dict";
+}
+
 #endif // KUNGPU_OPS_TD
diff --git a/mlir/include/KunIr/CMakeLists.txt b/mlir/include/KunIr/CMakeLists.txt
index dd36893..a3c44d7 100644
--- a/mlir/include/KunIr/CMakeLists.txt
+++ b/mlir/include/KunIr/CMakeLists.txt
@@ -1 +1,2 @@
 add_mlir_dialect(KunIrOps kunir)
+add_mlir_interface(KunIrInterfaces)
diff --git a/mlir/include/KunIr/KunIrInterfaces.h b/mlir/include/KunIr/KunIrInterfaces.h
new file mode 100644
index 0000000..ef673cc
--- /dev/null
+++ b/mlir/include/KunIr/KunIrInterfaces.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+#include "llvm/ADT/StringRef.h"
+
+#define GET_OP_INTERFACE_CLASSES
+#include "KunIr/KunIrInterfaces.h.inc"
diff --git a/mlir/include/KunIr/KunIrInterfaces.td b/mlir/include/KunIr/KunIrInterfaces.td
new file mode 100644
index 0000000..e680468
--- /dev/null
+++ b/mlir/include/KunIr/KunIrInterfaces.td
@@ -0,0 +1,70 @@
+#ifndef KUNIR_INTERFACES_TD
+#define KUNIR_INTERFACES_TD
+
+include "mlir/IR/OpBase.td"
+
+//===----------------------------------------------------------------------===//
+// BinaryArithInterface
+// Implemented by binary elemwise ops (add, sub, mul, div, max, min).
+//===----------------------------------------------------------------------===//
+
+def KunIr_BinaryArithInterface : OpInterface<"BinaryArithInterface"> {
+  let cppNamespace = "::kunir";
+  let description = [{
+    Factory interface for binary kunir ops.  Each implementing op knows how to
+    lower itself to its corresponding scalar arith/math op.
+  }];
+  let methods = [
+    InterfaceMethod<
+      "Build the scalar arith/math op corresponding to this binary kunir op.",
+      "::mlir::Value", "buildScalarOp",
+      (ins "::mlir::OpBuilder &":$b, "::mlir::Location":$loc,
+           "::mlir::Value":$lhs, "::mlir::Value":$rhs)>
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// UnaryArithInterface
+// Implemented by unary elemwise ops (abs, log, sign).
+//===----------------------------------------------------------------------===//
+
+def KunIr_UnaryArithInterface : OpInterface<"UnaryArithInterface"> {
+  let cppNamespace = "::kunir";
+  let description = [{
+    Factory interface for unary kunir ops.  Each implementing op knows how to
+    lower itself to its corresponding scalar arith/math op.
+  }];
+  let methods = [
+    InterfaceMethod<
+      "Build the scalar arith/math op corresponding to this unary kunir op.",
+      "::mlir::Value", "buildScalarOp",
+      (ins "::mlir::OpBuilder &":$b, "::mlir::Location":$loc,
+           "::mlir::Value":$operand)>
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// ReduceArithInterface
+// Implemented by reduce_* ops (reduce_add, reduce_mul, reduce_max, reduce_min).
+//===----------------------------------------------------------------------===//
+
+def KunIr_ReduceArithInterface : OpInterface<"ReduceArithInterface"> {
+  let cppNamespace = "::kunir";
+  let description = [{
+    Factory interface for reduction kunir ops.  Provides the identity element
+    for iter_args initialisation and a factory that emits one accumulation step.
+  }];
+  let methods = [
+    InterfaceMethod<
+      "Return the identity (init) value for this reduction as a typed float attribute.",
+      "::mlir::TypedAttr", "getInitValue",
+      (ins "::mlir::FloatType":$elemType)>,
+    InterfaceMethod<
+      "Build one accumulation step: newAcc = reduce_op(oldAcc, element).",
+      "::mlir::Value", "buildAccumOp",
+      (ins "::mlir::OpBuilder &":$b, "::mlir::Location":$loc,
+           "::mlir::Value":$acc, "::mlir::Value":$elem)>
+  ];
+}
+
+#endif // KUNIR_INTERFACES_TD
diff --git a/mlir/include/KunIr/KunIrOps.h b/mlir/include/KunIr/KunIrOps.h
index d17203c..99b7206 100644
--- a/mlir/include/KunIr/KunIrOps.h
+++ b/mlir/include/KunIr/KunIrOps.h
@@ -9,6 +9,7 @@
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
 #include "KunIr/KunIrDialect.h"
+#include "KunIr/KunIrInterfaces.h"
 #include "KunIr/KunIrTypes.h"
 
 // NativeOpTrait<"Name"> expands to ::mlir::OpTrait::Name in generated code,
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 540724b..bce2df2 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -3,6 +3,7 @@
 
 include "KunIr/KunIrDialect.td"
 include "KunIr/KunIrTypes.td"
+include "KunIr/KunIrInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
@@ -22,7 +23,8 @@ class KunIr_BinaryElemwiseOp<string mnemonic, list<Trait> traits = []>
     : KunIr_Op<mnemonic, !listconcat(traits, [
         Pure,
         InferTypeOpInterface,
-        NativeOpTrait<"KunIrElemwiseTsResultType">
+        NativeOpTrait<"KunIrElemwiseTsResultType">,
+        DeclareOpInterfaceMethods<KunIr_BinaryArithInterface>
     ])> {
   let arguments = (ins KunIr_AnyTs:$lhs, KunIr_AnyTs:$rhs);
   let results = (outs KunIr_AnyTs:$result);
@@ -60,7 +62,8 @@ class KunIr_UnaryElemwiseOp<string mnemonic, list<Trait> traits = []>
     : KunIr_Op<mnemonic, !listconcat(traits, [
         Pure,
         InferTypeOpInterface,
-        NativeOpTrait<"KunIrElemwiseTsResultType">
+        NativeOpTrait<"KunIrElemwiseTsResultType">,
+        DeclareOpInterfaceMethods<KunIr_UnaryArithInterface>
     ])> {
   let arguments = (ins KunIr_AnyTs:$input);
   let results = (outs KunIr_AnyTs:$result);
@@ -192,7 +195,11 @@ def KunIr_ForEachBackWindowOp : KunIr_Op<"for_each_back_window", [
 //===----------------------------------------------------------------------===//
 
 class KunIr_ReduceOp<string mnemonic, list<Trait> traits = []>
-    : KunIr_Op<mnemonic, !listconcat(traits, [Pure, SameOperandsAndResultType])> {
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        SameOperandsAndResultType,
+        DeclareOpInterfaceMethods<KunIr_ReduceArithInterface>
+    ])> {
   let arguments = (ins KunIr_AnyTs:$value);
   let results = (outs KunIr_AnyTs:$result);
   let hasVerifier = 1;
diff --git a/mlir/include/KunIr/Passes.h b/mlir/include/KunIr/Passes.h
new file mode 100644
index 0000000..773292c
--- /dev/null
+++ b/mlir/include/KunIr/Passes.h
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace kunir {
+void registerKunIrToKunGpuPass();
+} // namespace kunir
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index f0831c5..8b6d34f 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRKunGpuDialect
 
   DEPENDS
   MLIRKunGpuOpsIncGen
+  MLIRKunIrOpsIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/mlir/lib/KunGpu/KunGpuOps.cpp b/mlir/lib/KunGpu/KunGpuOps.cpp
index 75f1bb2..11b2ac0 100644
--- a/mlir/lib/KunGpu/KunGpuOps.cpp
+++ b/mlir/lib/KunGpu/KunGpuOps.cpp
@@ -1,4 +1,5 @@
 #include "KunGpu/KunGpuOps.h"
+#include "KunIr/KunIrTypes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
@@ -9,3 +10,45 @@ using namespace kungpu;
 // Emits op class method implementations (verifyInvariantsImpl, print, parse, etc.)
 #define GET_OP_CLASSES
 #include "KunGpu/KunGpuOps.cpp.inc"
+
+// The `ts` operand of ts.get and ts.put must be a block argument or the result
+// of a windowed_temp op.  NOTE: any block argument passes, not only func args.
+static bool isValidTsSource(Value v) {
+  if (isa<BlockArgument>(v))
+    return true;
+  if (auto *def = v.getDefiningOp())
+    return isa<WindowedTempOp>(def);
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// TsGetOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult TsGetOp::verify() {
+  Type elemTy = llvm::cast<kunir::TsType>(getTs().getType()).getElementType();
+  Type resultTy = getResult().getType();
+  // The loaded scalar must have exactly the element type of the time series.
+  if (resultTy != elemTy)
+    return emitOpError("result type '")
+           << resultTy << "' must match ts element type '" << elemTy << "'";
+  if (isValidTsSource(getTs()))
+    return success();
+  return emitOpError("ts operand must be a function argument or "
+                     "the result of 'kungpu.windowed_temp'");
+}
+
+//===----------------------------------------------------------------------===//
+// TsPutOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult TsPutOp::verify() {
+  Type elemTy = llvm::cast<kunir::TsType>(getTs().getType()).getElementType();
+  Type valueTy = getValue().getType();
+  // The stored scalar must have exactly the element type of the time series.
+  if (valueTy != elemTy)
+    return emitOpError("value type '")
+           << valueTy << "' must match ts element type '" << elemTy << "'";
+  if (isValidTsSource(getTs()))
+    return success();
+  return emitOpError("ts operand must be a function argument or "
+                     "the result of 'kungpu.windowed_temp'");
+}
diff --git a/mlir/lib/KunIr/CMakeLists.txt b/mlir/lib/KunIr/CMakeLists.txt
index 31b5813..317f4a0 100644
--- a/mlir/lib/KunIr/CMakeLists.txt
+++ b/mlir/lib/KunIr/CMakeLists.txt
@@ -1,15 +1,43 @@
 add_mlir_dialect_library(MLIRKunIrDialect
   KunIrDialect.cpp
   KunIrTypes.cpp
+  KunIrInterfaces.cpp
   KunIrOps.cpp
 
+  PARTIAL_SOURCES_INTENDED
+
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/mlir/include
 
   DEPENDS
   MLIRKunIrOpsIncGen
+  MLIRKunIrInterfacesIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRSideEffectInterfaces
 )
+
+add_mlir_library(MLIRKunIrToKunGpu
+  KunIrToKunGpu.cpp
+
+  PARTIAL_SOURCES_INTENDED
+
+  ADDITIONAL_HEADER_DIRS
+  ${PROJECT_SOURCE_DIR}/mlir/include
+
+  DEPENDS
+  MLIRKunIrOpsIncGen
+  MLIRKunIrInterfacesIncGen
+  MLIRKunGpuOpsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRKunIrDialect
+  MLIRKunGpuDialect
+  MLIRFuncDialect
+  MLIRArithDialect
+  MLIRMathDialect
+  MLIRSCFDialect
+  MLIRIR
+  MLIRPass
+)
diff --git a/mlir/lib/KunIr/KunIrInterfaces.cpp b/mlir/lib/KunIr/KunIrInterfaces.cpp
new file mode 100644
index 0000000..9e6b4d6
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrInterfaces.cpp
@@ -0,0 +1,6 @@
+#include "KunIr/KunIrInterfaces.h"
+
+using namespace mlir;
+using namespace kunir;
+
+#include "KunIr/KunIrInterfaces.cpp.inc"
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index fa58ddb..e17378a 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -1,6 +1,10 @@
 #include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrInterfaces.h"
 #include "KunIr/KunIrTypes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
 #include <limits>
@@ -10,6 +14,12 @@ using namespace kunir;
 
 static constexpr uint64_t kInfLookback = std::numeric_limits<uint64_t>::max();
 
+//===----------------------------------------------------------------------===//
+// Interface method definitions (KunIrInterfaces.cpp.inc) are compiled once, in
+// KunIrInterfaces.cpp; including the .cpp.inc here as well would define every
+// interface method a second time inside the same library.
+//===----------------------------------------------------------------------===//
+
 //===----------------------------------------------------------------------===//
 // Generated op definitions
 //===----------------------------------------------------------------------===//
@@ -287,3 +297,75 @@ void ForEachBackWindowOp::print(OpAsmPrinter &printer) {
   printer.printRegion(getBody(), /*printEntryBlockArgs=*/false,
                       /*printBlockTerminators=*/true);
 }
+
+//===----------------------------------------------------------------------===//
+// BinaryArithInterface implementations
+//===----------------------------------------------------------------------===//
+
+Value AddOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::AddFOp>(loc, lhs, rhs);
+}
+Value SubOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::SubFOp>(loc, lhs, rhs);
+}
+Value MulOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::MulFOp>(loc, lhs, rhs);
+}
+Value DivOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::DivFOp>(loc, lhs, rhs);
+}
+Value MaxOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::MaximumFOp>(loc, lhs, rhs);
+}
+Value MinOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::MinimumFOp>(loc, lhs, rhs);
+}
+
+//===----------------------------------------------------------------------===//
+// UnaryArithInterface implementations
+//===----------------------------------------------------------------------===//
+
+Value AbsOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return b.create<math::AbsFOp>(loc, operand);
+}
+Value LogOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return b.create<math::LogOp>(loc, operand);
+}
+Value SignOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  // sign(x) ≈ copysign(1.0, x); NOTE: this yields ±1.0 (never 0) for x == ±0.0.
+  Value one = b.create<arith::ConstantOp>(
+      loc, operand.getType(), b.getFloatAttr(operand.getType(), 1.0));
+  return b.create<math::CopySignOp>(loc, one, operand);
+}
+
+//===----------------------------------------------------------------------===//
+// ReduceArithInterface implementations
+//===----------------------------------------------------------------------===//
+
+TypedAttr ReduceAddOp::getInitValue(FloatType elemType) {
+  return FloatAttr::get(elemType, 0.0);
+}
+Value ReduceAddOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
+  return b.create<arith::AddFOp>(loc, acc, elem);
+}
+
+TypedAttr ReduceMulOp::getInitValue(FloatType elemType) {
+  return FloatAttr::get(elemType, 1.0);
+}
+Value ReduceMulOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
+  return b.create<arith::MulFOp>(loc, acc, elem);
+}
+
+TypedAttr ReduceMaxOp::getInitValue(FloatType elemType) {
+  return FloatAttr::get(elemType, -std::numeric_limits<double>::infinity());
+}
+Value ReduceMaxOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
+  return b.create<arith::MaximumFOp>(loc, acc, elem);
+}
+
+TypedAttr ReduceMinOp::getInitValue(FloatType elemType) {
+  return FloatAttr::get(elemType, std::numeric_limits<double>::infinity());
+}
+Value ReduceMinOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
+  return b.create<arith::MinimumFOp>(loc, acc, elem);
+}
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
new file mode 100644
index 0000000..4f732ba
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -0,0 +1,355 @@
+//===- KunIrToKunGpu.cpp - Lower kunir ops to kungpu + scf + arith --------===//
+//
+// Lowers a func.func whose body contains kunir ops into a form that uses:
+//   - kungpu.time_length / kungpu.ts.get / kungpu.ts.put  for ts I/O
+//   - scf.for for the outer time loop and inner back-window loops
+//   - arith.* / math.* for scalar arithmetic
+//
+// Assumptions / limitations:
+//   - The function body is a single block.
+//   - ts-typed return values are converted to output parameters (void return).
+//   - All inputs to kunir.for_each_back_window must be ts handles (function
+//     arguments or kunir.windowed_output results).
+//   - Each yield operand of for_each_back_window must come from a reduce_* op.
+//   - kunir.cs_rank is not yet supported.
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunGpu/KunGpuOps.h"
+#include "KunIr/KunIrInterfaces.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace mlir;
+using namespace kunir;
+using namespace kungpu;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Value tracking: is a kunir ts value a ts handle or a scalar?
+//
+// HANDLE — the mapped Value is a ts memory object (!kunir.ts<*>); loading it
+//          via ts.get at a given time index yields the element scalar.
+// SCALAR — the mapped Value is an already-computed float scalar.
+//===----------------------------------------------------------------------===//
+
+enum class TsKind { Handle, Scalar };
+struct TsEntry { TsKind kind; Value value; };
+using TsMap = llvm::DenseMap<Value, TsEntry>;
+
+// Returns v's scalar: a Handle entry is loaded via ts.get(handle, timeIdx) and
+// then overwritten as Scalar, so the handle is no longer reachable through v.
+static Value getScalar(Value v, TsMap &tsMap, Value timeIdx,
+                       OpBuilder &b, Location loc) {
+  auto it = tsMap.find(v);
+  assert(it != tsMap.end() && "value not found in tsMap");
+  if (it->second.kind == TsKind::Scalar)
+    return it->second.value;
+  auto tsTy = llvm::cast<TsType>(v.getType());
+  Value scalar = b.create<TsGetOp>(loc, tsTy.getElementType(),
+                                    it->second.value, timeIdx);
+  it->second = {TsKind::Scalar, scalar};
+  return scalar;
+}
+
+// Lower every op in `ops`, in the order given (terminators are not skipped).
+//
+// For each op:
+//   - BinaryArithInterface: emit scalar binary op, record result as Scalar.
+//   - UnaryArithInterface:  emit scalar unary op,  record result as Scalar.
+//   - ReduceArithInterface: caller must pre-seed the op's result in tsMap with
+//     the current accumulator (iterArg).  This function emits the accumulation
+//     step and updates the tsMap entry to the new accumulator.
+//   - Anything else: call handleUnknown if provided, else emit an error.
+// `loc` is unused here: every emitted op takes its location from the source op.
+// Handle-typed operands are loaded via ts.get (getScalar) on first use.
+static LogicalResult lowerBlock(
+    llvm::ArrayRef<Operation *> ops,
+    TsMap &tsMap, Value timeIdx, OpBuilder &b, Location loc,
+    llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
+  for (Operation *op : ops) {
+    Location ol = op->getLoc();
+    if (auto iface = dyn_cast<BinaryArithInterface>(op)) {
+      Value lhs = getScalar(op->getOperand(0), tsMap, timeIdx, b, ol);
+      Value rhs = getScalar(op->getOperand(1), tsMap, timeIdx, b, ol);
+      tsMap[op->getResult(0)] = {TsKind::Scalar,
+          iface.buildScalarOp(b, ol, lhs, rhs)};
+    } else if (auto iface = dyn_cast<UnaryArithInterface>(op)) {
+      Value operand = getScalar(op->getOperand(0), tsMap, timeIdx, b, ol);
+      tsMap[op->getResult(0)] = {TsKind::Scalar,
+          iface.buildScalarOp(b, ol, operand)};
+    } else if (auto ri = dyn_cast<ReduceArithInterface>(op)) {
+      Value elem = getScalar(op->getOperand(0), tsMap, timeIdx, b, ol);
+      auto it = tsMap.find(op->getResult(0));
+      assert(it != tsMap.end() && it->second.kind == TsKind::Scalar
+             && "reduce result must be pre-seeded in tsMap with current acc");
+      it->second = {TsKind::Scalar,
+          ri.buildAccumOp(b, ol, it->second.value, elem)};
+    } else if (handleUnknown) {
+      if (failed(handleUnknown(*op))) return failure();
+    } else {
+      return op->emitError("kunir-to-kungpu: cannot lower op in block");
+    }
+  }
+  return success();
+}
+
+// Convenience overload: lowers every op of `block` except its terminator.
+static LogicalResult lowerBlock(
+    Block &block,
+    TsMap &tsMap, Value timeIdx, OpBuilder &b, Location loc,
+    llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
+  // Snapshot the op pointers up front; the ArrayRef overload does the work.
+  SmallVector<Operation *> bodyOps = llvm::to_vector(
+      llvm::make_pointer_range(block.without_terminator()));
+  return lowerBlock(bodyOps, tsMap, timeIdx, b, loc, handleUnknown);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass definition
+//===----------------------------------------------------------------------===//
+
+struct LowerKunIrToKunGpuPass
+    : PassWrapper<LowerKunIrToKunGpuPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerKunIrToKunGpuPass)
+  StringRef getArgument()    const override { return "kunir-to-kungpu"; }
+  StringRef getDescription() const override {
+    return "Lower kunir ops to kungpu + scf + arith/math"; }
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<kungpu::KunGpuDialect>();
+    registry.insert<arith::ArithDialect>();
+    registry.insert<math::MathDialect>();
+    registry.insert<scf::SCFDialect>();
+  }
+  void runOnOperation() override;
+};
+
+} // namespace
+
+void LowerKunIrToKunGpuPass::runOnOperation() {
+  func::FuncOp funcOp = getOperation();
+  if (funcOp.isExternal()) return; // declaration: empty body, no entry block
+  MLIRContext *ctx = &getContext();
+  Location loc = funcOp.getLoc();
+  Block &entry = funcOp.getBody().front();
+
+  // ------------------------------------------------------------------
+  // 1. Extend function signature: ts return types → extra output params.
+  // ------------------------------------------------------------------
+  FunctionType oldFT = funcOp.getFunctionType();
+  SmallVector<Type> newArgTys(oldFT.getInputs());
+  SmallVector<unsigned> tsRetIdx;
+  for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
+    if (isa<TsType>(ty)) tsRetIdx.push_back(i);
+
+  SmallVector<Value> outParams;
+  for (unsigned i : tsRetIdx) {
+    outParams.push_back(entry.addArgument(oldFT.getResult(i), loc));
+    newArgTys.push_back(oldFT.getResult(i));
+  }
+  SmallVector<Type> newRetTys;
+  for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
+    if (!isa<TsType>(ty)) newRetTys.push_back(ty);
+  funcOp.setFunctionType(FunctionType::get(ctx, newArgTys, newRetTys));
+
+  // ------------------------------------------------------------------
+  // 2. Snapshot original ops and find the original return.
+  // ------------------------------------------------------------------
+  SmallVector<Operation *> origOps;
+  func::ReturnOp retOp;
+  for (Operation &op : entry) origOps.push_back(&op);
+  for (Operation *op : origOps)
+    if (auto r = dyn_cast<func::ReturnOp>(op)) { retOp = r; break; }
+
+  // Collect ts return values from the original return.
+  SmallVector<Value> tsRetVals;
+  if (retOp)
+    for (Value v : retOp.getOperands())
+      if (isa<TsType>(v.getType())) tsRetVals.push_back(v);
+  assert(tsRetVals.size() == outParams.size());
+
+  // ------------------------------------------------------------------
+  // 3. Insert outer scf.for loop before the first original op.
+  //    windowed_temp ops are inserted before this loop (via `b`).
+  // ------------------------------------------------------------------
+  OpBuilder b(ctx);
+  b.setInsertionPoint(origOps.front());
+
+  Value timeLen = b.create<TimeLengthOp>(loc, b.getIndexType());
+  Value c0 = b.create<arith::ConstantIndexOp>(loc, 0);
+  Value c1 = b.create<arith::ConstantIndexOp>(loc, 1);
+  auto outerFor = b.create<scf::ForOp>(loc, c0, timeLen, c1);
+  Value t = outerFor.getInductionVar();
+
+  // Erase the implicit empty scf.yield (no iter_args → zero-operand yield).
+  outerFor.getBody()->back().erase();
+  OpBuilder fb = OpBuilder::atBlockEnd(outerFor.getBody());
+
+  // Point b before outerFor so windowed_temp ops land outside the time loop.
+  b.setInsertionPoint(outerFor);
+
+  // ------------------------------------------------------------------
+  // 4. Seed tsMap: each ts-typed function argument is a Handle.
+  // ------------------------------------------------------------------
+  TsMap tsMap;
+  unsigned numOrigArgs = oldFT.getNumInputs();
+  for (unsigned i = 0; i < numOrigArgs; ++i) {
+    Value arg = entry.getArgument(i);
+    if (isa<TsType>(arg.getType()))
+      tsMap[arg] = {TsKind::Handle, arg};
+  }
+
+  // ------------------------------------------------------------------
+  // 5. Lower original ops in definition order.
+  //
+  //    lowerBlock handles binary/unary/reduce ops.  windowed_output,
+  //    for_each_back_window, and func.return are handled by the callback.
+  // ------------------------------------------------------------------
+  auto outerHandler = [&](Operation &op) -> LogicalResult {
+    if (isa<func::ReturnOp>(op)) return success(); // handled in step 7
+
+    Location ol = op.getLoc();
+
+    // windowed_output → allocate windowed_temp outside the loop,
+    //                   fill circular buffer at each time step inside.
+    if (auto woOp = dyn_cast<WindowedOutputOp>(op)) {
+      auto wt = b.create<WindowedTempOp>(ol, woOp.getResult().getType());
+      tsMap[woOp.getResult()] = {TsKind::Handle, wt.getResult()};
+      Value inputScalar = getScalar(woOp.getInput(), tsMap, t, fb, ol);
+      fb.create<TsPutOp>(ol, wt.getResult(), t, inputScalar);
+      return success();
+    }
+
+    // for_each_back_window → nested scf.for with iter_args.
+    if (auto fwOp = dyn_cast<ForEachBackWindowOp>(op)) {
+      int64_t window = fwOp.getWindow();
+      Block &body = fwOp.getBody().front();
+      auto yieldOp = llvm::cast<YieldOp>(body.getTerminator());
+
+      // Resolve inputs to ts handles.
+      SmallVector<Value> inputHandles(fwOp.getInputs().size());
+      for (auto [i, inp] : llvm::enumerate(fwOp.getInputs())) {
+        auto it = tsMap.find(inp);
+        if (it == tsMap.end() || it->second.kind != TsKind::Handle) {
+          return op.emitError("kunir-to-kungpu: for_each_back_window input "
+                              "must be a ts handle");
+        }
+        inputHandles[i] = it->second.value;
+      }
+
+      // Each yield operand must come from a reduce_* op — collect init values.
+      SmallVector<ReduceArithInterface> reduces;
+      SmallVector<Value> initVals;
+      for (Value yv : yieldOp.getValues()) {
+        auto *defOp = yv.getDefiningOp();
+        auto ri = defOp ? dyn_cast<ReduceArithInterface>(defOp)
+                        : ReduceArithInterface{};
+        if (!ri) {
+          return op.emitError("kunir-to-kungpu: for_each_back_window yield "
+                              "operand must come from a reduce_* op");
+        }
+        auto elemTy = llvm::cast<FloatType>(
+            llvm::cast<TsType>(defOp->getOperand(0).getType()).getElementType());
+        initVals.push_back(fb.create<arith::ConstantOp>(ol, ri.getInitValue(elemTy)));
+        reduces.push_back(ri);
+      }
+
+      // Create inner scf.for %w = 0 to window step 1 iter_args(acc_i = init_i).
+      // The lambda form lets us emit a proper scf.yield as the body terminator
+      // without fighting the implicit yield created by ensureTerminator.
+      Value wBound = fb.create<arith::ConstantIndexOp>(ol, window);
+      Value wM1    = fb.create<arith::ConstantIndexOp>(ol, window - 1);
+
+      // Capture lowerBlock result since the lambda can't return LogicalResult.
+      bool innerOk = true;
+      auto innerFor = fb.create<scf::ForOp>(
+          ol, c0, wBound, c1, initVals,
+          [&](OpBuilder &ib, Location il, Value w, ValueRange iterArgs) {
+            // elemIdx = t - (window - 1) + w
+            Value base    = ib.create<arith::SubIOp>(il, t, wM1);
+            Value elemIdx = ib.create<arith::AddIOp>(il, base, w);
+
+            // Seed innerTsMap: block args as handles; reduce results as acc.
+            TsMap innerTsMap;
+            for (auto [i, arg] : llvm::enumerate(body.getArguments()))
+              innerTsMap[arg] = {TsKind::Handle, inputHandles[i]};
+            for (auto [i, yv] : llvm::enumerate(yieldOp.getValues()))
+              innerTsMap[yv.getDefiningOp()->getResult(0)] = {TsKind::Scalar,
+                                                              iterArgs[i]};
+
+            if (failed(lowerBlock(body, innerTsMap, elemIdx, ib, il))) {
+              innerOk = false;
+              ib.create<scf::YieldOp>(il, initVals); // keep IR structurally valid
+              return;
+            }
+
+            SmallVector<Value> newAccs;
+            for (Value yv : yieldOp.getValues())
+              newAccs.push_back(innerTsMap.find(yv)->second.value);
+            ib.create<scf::YieldOp>(il, newAccs);
+          });
+      if (!innerOk) return failure();
+
+      // Map for_each_back_window results to the inner for's results.
+      for (auto [i, res] : llvm::enumerate(fwOp.getResults()))
+        tsMap[res] = {TsKind::Scalar, innerFor.getResult(i)};
+      return success();
+    }
+
+    if (isa<CsRankOp>(op)) {
+      return op.emitError("kunir-to-kungpu: cs_rank lowering not yet implemented");
+    }
+    return op.emitError("kunir-to-kungpu: unhandled op in outer block");
+  };
+
+  if (failed(lowerBlock(origOps, tsMap, t, fb, loc, outerHandler)))
+    return signalPassFailure();
+
+  // ------------------------------------------------------------------
+  // 6. Emit ts.put for each ts return value, then close the outer for.
+  // ------------------------------------------------------------------
+  for (auto [outParam, rv] : llvm::zip(outParams, tsRetVals)) {
+    // A returned value may still be a Handle (a forwarded function argument
+    // or a windowed_output result); getScalar loads it at `t` in that case.
+    fb.create<TsPutOp>(loc, outParam, t, getScalar(rv, tsMap, t, fb, loc));
+  }
+  fb.create<scf::YieldOp>(loc);
+
+  // ------------------------------------------------------------------
+  // 7. Insert a replacement return before the original return op.
+  // ------------------------------------------------------------------
+  if (retOp) {
+    b.setInsertionPoint(retOp);
+    SmallVector<Value> nonTsRets;
+    for (Value v : retOp.getOperands())
+      if (!isa<TsType>(v.getType())) nonTsRets.push_back(v);
+    b.create<func::ReturnOp>(loc, nonTsRets);
+  }
+
+  // ------------------------------------------------------------------
+  // 8. Erase original ops in reverse order.
+  // ------------------------------------------------------------------
+  for (Operation *op : llvm::reverse(origOps))
+    op->erase();
+}
+
+//===----------------------------------------------------------------------===//
+// Pass registration
+//===----------------------------------------------------------------------===//
+
+namespace kunir {
+void registerKunIrToKunGpuPass() {
+  PassRegistration<LowerKunIrToKunGpuPass>();
+}
+} // namespace kunir
diff --git a/mlir/test/kungpu/basic.mlir b/mlir/test/kungpu/basic.mlir
index 4e3a15d..73469d1 100644
--- a/mlir/test/kungpu/basic.mlir
+++ b/mlir/test/kungpu/basic.mlir
@@ -14,3 +14,31 @@ func.func @test_block_stock_count() -> index {
   %n = kungpu.block_stock_count
   return %n : index
 }
+
+// CHECK-LABEL: func.func @test_time_length
+func.func @test_time_length() -> index {
+  // CHECK: kungpu.time_length
+  %len = kungpu.time_length
+  return %len : index
+}
+
+// CHECK-LABEL: func.func @test_ts_get_put
+func.func @test_ts_get_put(%ts_in: !kunir.ts<f32, inf>, %ts_out: !kunir.ts<f32, 1>) {
+  %c0 = arith.constant 0 : index
+  // CHECK: kungpu.ts.get
+  // CHECK-SAME: <f32, inf> -> f32
+  %v = kungpu.ts.get %ts_in[%c0] : !kunir.ts<f32, inf> -> f32
+  // CHECK: kungpu.ts.put
+  kungpu.ts.put %ts_out[%c0], %v : !kunir.ts<f32, 1>, f32
+  return
+}
+
+// CHECK-LABEL: func.func @test_windowed_temp
+func.func @test_windowed_temp() -> f32 {
+  %c0 = arith.constant 0 : index
+  // CHECK: %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
+  %wt = kungpu.windowed_temp : !kunir.ts<f32, 5>
+  // CHECK: kungpu.ts.get %[[WT]]
+  %v = kungpu.ts.get %wt[%c0] : !kunir.ts<f32, 5> -> f32
+  return %v : f32
+}
diff --git a/mlir/test/kunir/lower_to_kungpu.mlir b/mlir/test/kunir/lower_to_kungpu.mlir
new file mode 100644
index 0000000..f5f2994
--- /dev/null
+++ b/mlir/test/kunir/lower_to_kungpu.mlir
@@ -0,0 +1,106 @@
+// RUN: %kun-opt --kunir-to-kungpu %s | %FileCheck %s
+
+// CHECK-LABEL: func.func @test_binary_lower
+// CHECK-SAME: !kunir.ts<f32, inf>
+// CHECK-SAME: !kunir.ts<f32, inf>
+// CHECK-SAME: !kunir.ts<f32, 1>
+// CHECK-NOT: -> !kunir.ts
+func.func @test_binary_lower(
+    %a: !kunir.ts<f32, inf>,
+    %b: !kunir.ts<f32, inf>
+) -> !kunir.ts<f32, 1> {
+  // CHECK:      %[[TL:.*]] = kungpu.time_length
+  // CHECK:      %[[C0:.*]] = arith.constant 0 : index
+  // CHECK:      %[[C1:.*]] = arith.constant 1 : index
+  // CHECK:      scf.for %{{.*}} = %[[C0]] to %[[TL]] step %[[C1]]
+  // CHECK:        kungpu.ts.get
+  // CHECK:        kungpu.ts.get
+  // CHECK:        arith.addf
+  // CHECK:        kungpu.ts.put
+  %sum = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  return %sum : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_unary_lower
+func.func @test_unary_lower(%x: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+  // CHECK: math.absf
+  %a = kunir.abs %x : !kunir.ts<f32, inf>
+  return %a : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_windowed_sum
+func.func @test_windowed_sum(%close: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+  // CHECK:      %[[C0:.*]] = arith.constant 0 : index
+  // CHECK:      %[[C1:.*]] = arith.constant 1 : index
+  // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
+  // CHECK:      scf.for %[[T:.*]] =
+  // CHECK:        kungpu.ts.get %arg0[%[[T]]]
+  // CHECK:        kungpu.ts.put %[[WT]][%[[T]]]
+  // CHECK:        %[[WIN:.*]] = arith.constant 5 : index
+  // CHECK:        scf.for %{{.*}} = %[[C0]] to %[[WIN]] step %[[C1]] iter_args
+  // CHECK:          kungpu.ts.get %[[WT]]
+  // CHECK:          arith.addf
+  %w = kunir.windowed_output %close [length = 5] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 5>
+  %sum = kunir.for_each_back_window
+      (%w : !kunir.ts<f32, 5>) [window = 5]
+      (%cur : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>) {
+    %s = kunir.reduce_add %cur : !kunir.ts<f32, 1>
+    kunir.yield %s : !kunir.ts<f32, 1>
+  }
+  return %sum : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_computed_reduce
+func.func @test_computed_reduce(
+    %x: !kunir.ts<f32, inf>,
+    %y: !kunir.ts<f32, inf>
+) -> !kunir.ts<f32, 1> {
+  // CHECK:      %[[WX:.*]] = kungpu.windowed_temp : <f32, 3>
+  // CHECK:      %[[WY:.*]] = kungpu.windowed_temp : <f32, 3>
+  // CHECK:      scf.for
+  // CHECK:        scf.for {{.*}} iter_args
+  // CHECK:          %[[A:.*]] = kungpu.ts.get %[[WX]]
+  // CHECK:          %[[B:.*]] = kungpu.ts.get %[[WY]]
+  // CHECK:          %[[P:.*]] = arith.mulf %[[A]], %[[B]]
+  // CHECK:          arith.addf {{.*}}, %[[P]]
+  %wx = kunir.windowed_output %x [length = 3] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 3>
+  %wy = kunir.windowed_output %y [length = 3] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 3>
+  %sum = kunir.for_each_back_window
+      (%wx : !kunir.ts<f32, 3>, %wy : !kunir.ts<f32, 3>) [window = 3]
+      (%a : !kunir.ts<f32, 1>, %b : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>) {
+    %prod = kunir.mul %a, %b : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+    %s = kunir.reduce_add %prod : !kunir.ts<f32, 1>
+    kunir.yield %s : !kunir.ts<f32, 1>
+  }
+  return %sum : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: func.func @test_multi_reduce
+// CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f64, inf>, %[[OUT0:.*]]: !kunir.ts<f64, 1>, %[[OUT1:.*]]: !kunir.ts<f64, 1>)
+func.func @test_multi_reduce(%input: !kunir.ts<f64, inf>) -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+  // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f64, 10>
+  // CHECK:      scf.for %[[T:.*]] =
+  // CHECK:        kungpu.ts.get %[[IN]][%[[T]]]
+  // CHECK:        kungpu.ts.put %[[WT]][%[[T]]]
+  // CHECK:        %[[CST0:.*]] = arith.constant 0.0{{.*}} : f64
+  // CHECK:        %[[NEGINF:.*]] = arith.constant 0xFFF0000000000000 : f64
+  // CHECK:        %[[R:.*]]:2 = scf.for {{.*}} iter_args(%{{.*}} = %[[CST0]], %{{.*}} = %[[NEGINF]]) -> (f64, f64)
+  // CHECK:          kungpu.ts.get %[[WT]]
+  // CHECK:          arith.addf
+  // CHECK:          arith.maximumf
+  // CHECK:          scf.yield {{.*}}, {{.*}} : f64, f64
+  // CHECK:        kungpu.ts.put %[[OUT0]][%[[T]]], %[[R]]#0 : <f64, 1>, f64
+  // CHECK:        kungpu.ts.put %[[OUT1]][%[[T]]], %[[R]]#1 : <f64, 1>, f64
+  %w = kunir.windowed_output %input [length = 10] : !kunir.ts<f64, inf> -> !kunir.ts<f64, 10>
+  %sum, %max = kunir.for_each_back_window
+      (%w : !kunir.ts<f64, 10>) [window = 10]
+      (%val : !kunir.ts<f64, 1>)
+      -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+    %s = kunir.reduce_add %val : !kunir.ts<f64, 1>
+    %m = kunir.reduce_max %val : !kunir.ts<f64, 1>
+    kunir.yield %s, %m : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+  }
+  return %sum, %max : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+}

From fe88c9f8cd579c349ba9d6a5ec1c6101cd77c4ec Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 6 May 2026 01:52:10 -0700
Subject: [PATCH 03/59] memory planning

---
 mlir/Tools/kun-opt/kun-opt.cpp           |   3 +-
 mlir/include/KunGpu/CMakeLists.txt       |   4 +
 mlir/include/KunGpu/Passes.h             |  25 ++++
 mlir/include/KunGpu/Passes.td            |  32 +++++
 mlir/lib/KunGpu/CMakeLists.txt           |   4 +
 mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp | 171 +++++++++++++++++++++++
 mlir/test/kungpu/memory_planning.mlir    |  65 +++++++++
 7 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 mlir/include/KunGpu/Passes.h
 create mode 100644 mlir/include/KunGpu/Passes.td
 create mode 100644 mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
 create mode 100644 mlir/test/kungpu/memory_planning.mlir

diff --git a/mlir/Tools/kun-opt/kun-opt.cpp b/mlir/Tools/kun-opt/kun-opt.cpp
index 52975bf..1c818a3 100644
--- a/mlir/Tools/kun-opt/kun-opt.cpp
+++ b/mlir/Tools/kun-opt/kun-opt.cpp
@@ -7,7 +7,7 @@
 #include "mlir/Transforms/Passes.h"
 
 #include "KunGpu/KunGpuDialect.h"
-#include "KunGpu/KunGpuOps.h"
+#include "KunGpu/Passes.h"
 #include "KunIr/KunIrDialect.h"
 #include "KunIr/KunIrOps.h"
 #include "KunIr/Passes.h"
@@ -27,6 +27,7 @@ int main(int argc, char **argv) {
 
   // KunQuant passes
   kunir::registerKunIrToKunGpuPass();
+  kungpu::registerKunGpuPasses();
 
   return mlir::asMainReturnCode(
       mlir::MlirOptMain(argc, argv, "KunQuant MLIR optimizer\n", registry));
diff --git a/mlir/include/KunGpu/CMakeLists.txt b/mlir/include/KunGpu/CMakeLists.txt
index e23e6e5..4ea697a 100644
--- a/mlir/include/KunGpu/CMakeLists.txt
+++ b/mlir/include/KunGpu/CMakeLists.txt
@@ -1 +1,5 @@
 add_mlir_dialect(KunGpuOps kungpu)
+
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name KunGpu)
+add_mlir_dialect_tablegen_target(MLIRKunGpuPassIncGen)
diff --git a/mlir/include/KunGpu/Passes.h b/mlir/include/KunGpu/Passes.h
new file mode 100644
index 0000000..d28100d
--- /dev/null
+++ b/mlir/include/KunGpu/Passes.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "mlir/Pass/Pass.h"
+#include <cstdint>
+#include <memory>
+
+namespace kungpu {
+
+#define GEN_PASS_DECL
+#include "KunGpu/Passes.h.inc"
+
+// Default-args factory used by the pipeline registration and kun-opt.
+std::unique_ptr<mlir::Pass> createWindowedTempMemoryPlanningPass();
+
+// Parametric factory for use by callers that provide hardware config.
+std::unique_ptr<mlir::Pass>
+createWindowedTempMemoryPlanningPass(int64_t totalSmemSize,
+                                     int64_t targetOccupancy,
+                                     int64_t numThreadsPerBlock,
+                                     int64_t vectorSize);
+
+#define GEN_PASS_REGISTRATION
+#include "KunGpu/Passes.h.inc"
+
+} // namespace kungpu
diff --git a/mlir/include/KunGpu/Passes.td b/mlir/include/KunGpu/Passes.td
new file mode 100644
index 0000000..972478f
--- /dev/null
+++ b/mlir/include/KunGpu/Passes.td
@@ -0,0 +1,32 @@
+#ifndef KUNGPU_PASSES_TD
+#define KUNGPU_PASSES_TD
+
+include "mlir/Pass/PassBase.td"
+
+def WindowedTempMemoryPlanning
+    : Pass<"kungpu-memory-planning", "::mlir::func::FuncOp"> {
+  let summary = "Assign shared/local memory to kungpu.windowed_temp ops";
+  let description = [{
+    Greedy memory planning pass for kungpu.windowed_temp allocations.
+
+    Sorts windowed_temp ops by ascending window size and assigns shared memory
+    to as many as fit within the per-block budget:
+
+      budget_per_block = total_smem_size / target_occupancy
+      bytes_per_buffer = N * num_threads_per_block * vector_size * elem_bytes
+
+    The result is written as a discardable boolean attribute "kungpu.smem" on
+    each windowed_temp op.  The pass does not change IR structure; address-space
+    selection is deferred to the to-LLVM lowering.
+
+    Parameters (passed at construction, not CLI options):
+      total_smem_size      - device shared memory in bytes
+      target_occupancy     - concurrent blocks per SM
+      num_threads_per_block
+      vector_size          - scalar elements per thread per time step
+  }];
+  let constructor = "::kungpu::createWindowedTempMemoryPlanningPass()";
+  let dependentDialects = ["::mlir::func::FuncDialect"];
+}
+
+#endif // KUNGPU_PASSES_TD
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index 8b6d34f..8ac3898 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -1,16 +1,20 @@
 add_mlir_dialect_library(MLIRKunGpuDialect
   KunGpuDialect.cpp
   KunGpuOps.cpp
+  KunGpuMemoryPlanning.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/mlir/include
 
   DEPENDS
   MLIRKunGpuOpsIncGen
+  MLIRKunGpuPassIncGen
   MLIRKunIrOpsIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
+  MLIRPass
+  MLIRFuncDialect
   MLIRSideEffectInterfaces
   MLIRKunIrDialect
 )
diff --git a/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
new file mode 100644
index 0000000..f61c7f8
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
@@ -0,0 +1,171 @@
+//===- KunGpuMemoryPlanning.cpp - Windowed-temp shared/local memory plan --===//
+//
+// Assigns each kungpu.windowed_temp op a "use shared memory" flag stored as
+// the discardable attribute "kungpu.smem" (BoolAttr).  The pass itself does
+// not mutate IR structure; the subsequent to-LLVM lowering consults the attr
+// to pick an address space.
+//
+// Strategy: sort windowed_temp ops by ascending window size (smaller buffers
+// fit more easily into shared memory) and greedily assign shared memory until
+// the per-block budget is exhausted.
+//
+// Budget:
+//   smem_per_block  = total_smem_size / target_occupancy
+//   bytes_per_buf   = N * num_threads_per_block * vector_size * elem_bytes
+//
+// Parameters are passed as plain integers (not IR attributes) because the
+// hardware/launch-config attributes are not yet wired into the IR.
+//
+//===----------------------------------------------------------------------===//
+
+// MLIR and local headers must come before GEN_PASS_DEF_* so that ::mlir
+// and func::FuncOp are fully declared when Passes.h.inc is expanded.
+#include "KunGpu/KunGpuOps.h"
+#include "KunGpu/Passes.h"
+#include "KunIr/KunIrTypes.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+// Pull in the generated PassBase scaffolding after all declarations are in scope.
+#define GEN_PASS_DEF_WINDOWEDTEMPMEMORYPLANNING
+#include "KunGpu/Passes.h.inc"
+
+#define DEBUG_TYPE "kungpu-memory-planning"
+
+using namespace mlir;
+using namespace kunir;
+using namespace kungpu;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Helper: storage size in bytes of a scalar (integer or float) element type
+//===----------------------------------------------------------------------===//
+
+static unsigned elemBytes(Type t) {
+  if (t.isIntOrFloat()) // IntegerType as well as every FloatType
+    return (t.getIntOrFloatBitWidth() + 7) / 8; // round bits up to bytes
+  return 4; // fallback for any other type: assume a 32-bit element
+}
+
+//===----------------------------------------------------------------------===//
+// Pass
+//===----------------------------------------------------------------------===//
+
+struct WindowedTempMemoryPlanningPass
+    : ::impl::WindowedTempMemoryPlanningBase<WindowedTempMemoryPlanningPass> {
+
+  // Parametric constructor — used by createWindowedTempMemoryPlanningPass().
+  WindowedTempMemoryPlanningPass(int64_t totalSmemSize, int64_t targetOccupancy,
+                                 int64_t numThreadsPerBlock, int64_t vectorSize)
+      : totalSmemSize(totalSmemSize), targetOccupancy(targetOccupancy),
+        numThreadsPerBlock(numThreadsPerBlock), vectorSize(vectorSize) {}
+
+  // Default constructor — used by the pipeline registration factory.
+  WindowedTempMemoryPlanningPass()
+      : WindowedTempMemoryPlanningPass(
+            /*totalSmemSize=*/49152, // 48 KB (typical Ampere)
+            /*targetOccupancy=*/1,
+            /*numThreadsPerBlock=*/32,
+            /*vectorSize=*/1) {}
+
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+    MLIRContext *ctx = &getContext();
+
+    // -----------------------------------------------------------------------
+    // 1. Collect all windowed_temp ops in the function.
+    // -----------------------------------------------------------------------
+    SmallVector<WindowedTempOp> temps;
+    funcOp.walk([&](WindowedTempOp op) { temps.push_back(op); });
+
+    if (temps.empty())
+      return;
+
+    // -----------------------------------------------------------------------
+    // 2. Compute per-block shared memory budget (bytes).
+    // -----------------------------------------------------------------------
+    int64_t budgetPerBlock =
+        (targetOccupancy > 0) ? (totalSmemSize / targetOccupancy) : 0;
+
+    // -----------------------------------------------------------------------
+    // 3. Sort by ascending window size (smaller N → higher smem priority).
+    // -----------------------------------------------------------------------
+    std::stable_sort(temps.begin(), temps.end(),
+                     [](WindowedTempOp a, WindowedTempOp b) {
+                       return llvm::cast<TsType>(a.getType()).getMaxLookback() <
+                              llvm::cast<TsType>(b.getType()).getMaxLookback();
+                     });
+
+    // -----------------------------------------------------------------------
+    // 4. Greedy assignment: place in shared memory while budget allows.
+    // -----------------------------------------------------------------------
+    int64_t usedSmem = 0;
+
+    for (WindowedTempOp op : temps) {
+      auto tsTy = llvm::cast<TsType>(op.getType());
+      uint64_t N = tsTy.getMaxLookback();
+
+      // Infinite-lookback buffers cannot be sized statically → always local.
+      if (N == std::numeric_limits<uint64_t>::max()) {
+        op->setAttr("kungpu.smem", BoolAttr::get(ctx, false));
+        continue;
+      }
+
+      int64_t bytes = static_cast<int64_t>(N) * numThreadsPerBlock *
+                      vectorSize * elemBytes(tsTy.getElementType());
+
+      bool useSmem =
+          (budgetPerBlock > 0) && (usedSmem + bytes <= budgetPerBlock);
+      if (useSmem)
+        usedSmem += bytes;
+
+      op->setAttr("kungpu.smem", BoolAttr::get(ctx, useSmem));
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "[kungpu-memory-planning] windowed_temp N=" << N
+                 << " bytes=" << bytes << " -> "
+                 << (useSmem ? "smem" : "local") << "\n");
+    }
+
+    LLVM_DEBUG(llvm::dbgs() << "[kungpu-memory-planning] total smem used="
+                            << usedSmem << " / budget=" << budgetPerBlock
+                            << "\n");
+  }
+
+  int64_t totalSmemSize;
+  int64_t targetOccupancy;
+  int64_t numThreadsPerBlock;
+  int64_t vectorSize;
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Public factory functions
+//===----------------------------------------------------------------------===//
+
+namespace kungpu {
+
+std::unique_ptr<mlir::Pass> createWindowedTempMemoryPlanningPass() {
+  return std::make_unique<WindowedTempMemoryPlanningPass>();
+}
+
+std::unique_ptr<mlir::Pass>
+createWindowedTempMemoryPlanningPass(int64_t totalSmemSize,
+                                     int64_t targetOccupancy,
+                                     int64_t numThreadsPerBlock,
+                                     int64_t vectorSize) {
+  return std::make_unique<WindowedTempMemoryPlanningPass>(
+      totalSmemSize, targetOccupancy, numThreadsPerBlock, vectorSize);
+}
+
+} // namespace kungpu
diff --git a/mlir/test/kungpu/memory_planning.mlir b/mlir/test/kungpu/memory_planning.mlir
new file mode 100644
index 0000000..0d55b18
--- /dev/null
+++ b/mlir/test/kungpu/memory_planning.mlir
@@ -0,0 +1,65 @@
+// RUN: %kun-opt --kungpu-memory-planning %s | %FileCheck %s
+//
+// Default pass parameters: total_smem=49152 bytes, occupancy=1,
+//                          num_threads=32, vec=1 → budget=49152 bytes
+//
+// Buffer cost (f32=4 bytes): bytes = N * 32 * 1 * 4 = N * 128
+//   N=3   →   384 bytes
+//   N=5   →   640 bytes
+//   N=10  →  1280 bytes
+//   N=400 → 51200 bytes  (> 49152)
+//   N=500 → 64000 bytes  (> 49152)
+//
+// Case 1 – all smem:   N=3  (384) + N=5  (640) + N=10  (1280) = 2304  ≤ budget
+// Case 2 – mixed:      N=5  (640) → smem; N=400 (51200) → 640+51200 > budget → local
+// Case 3 – all local:  N=400 (51200) > budget → local; N=500 (64000) > budget → local
+//
+// The pass sorts ops by ascending N before assigning, so declaration order
+// in the IR does not affect the assignment.
+
+// -----------------------------------------------------------------------
+// Case 1: all three buffers fit in shared memory
+// -----------------------------------------------------------------------
+
+// CHECK-LABEL: func.func @test_all_smem
+func.func @test_all_smem() {
+  // Declared in reverse order to verify sort-by-N behaviour.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 10> {kungpu.smem = true}
+  %c = kungpu.windowed_temp : !kunir.ts<f32, 10>
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 3> {kungpu.smem = true}
+  %a = kungpu.windowed_temp : !kunir.ts<f32, 3>
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 5> {kungpu.smem = true}
+  %b = kungpu.windowed_temp : !kunir.ts<f32, 5>
+  return
+}
+
+// -----------------------------------------------------------------------
+// Case 2: small buffer goes to smem, large buffer spills to local memory
+// -----------------------------------------------------------------------
+
+// CHECK-LABEL: func.func @test_mixed
+func.func @test_mixed() {
+  // N=400 (51200 bytes) is declared first but sorted after N=5 (640 bytes).
+  // N=5 takes 640 bytes of the 49152-byte budget; N=400 would need 51200
+  // more, which exceeds the remaining 48512 bytes → local.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
+  %big = kungpu.windowed_temp : !kunir.ts<f32, 400>
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 5> {kungpu.smem = true}
+  %small = kungpu.windowed_temp : !kunir.ts<f32, 5>
+  return
+}
+
+// -----------------------------------------------------------------------
+// Case 3: every buffer exceeds the budget on its own → all local memory
+// -----------------------------------------------------------------------
+
+// CHECK-LABEL: func.func @test_all_local
+func.func @test_all_local() {
+  // N=400 → 51200 bytes > 49152 (budget), so smem=false.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
+  %a = kungpu.windowed_temp : !kunir.ts<f32, 400>
+  // N=500 → 64000 bytes > 49152 (budget), so smem=false.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 500> {kungpu.smem = false}
+  %b = kungpu.windowed_temp : !kunir.ts<f32, 500>
+  return
+}

From 618ed0830e6b18adf2ebb821841724c43c6c8bff Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 6 May 2026 02:46:11 -0700
Subject: [PATCH 04/59] kunir.func

---
 mlir/include/KunGpu/Passes.h             |  12 +-
 mlir/include/KunGpu/Passes.td            |  18 +-
 mlir/include/KunIr/CMakeLists.txt        |   6 +
 mlir/include/KunIr/KunIrAttrs.h          |   8 +
 mlir/include/KunIr/KunIrAttrs.td         |  30 +++
 mlir/include/KunIr/KunIrDialect.td       |   2 +
 mlir/include/KunIr/KunIrOps.h            |   1 +
 mlir/include/KunIr/KunIrOps.td           |  79 +++++++
 mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp |  70 ++----
 mlir/lib/KunIr/CMakeLists.txt            |   3 +
 mlir/lib/KunIr/KunIrAttrs.cpp            |  55 +++++
 mlir/lib/KunIr/KunIrDialect.cpp          |   2 +
 mlir/lib/KunIr/KunIrOps.cpp              | 260 +++++++++++++++++++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp         |  26 ++-
 mlir/test/kungpu/basic.mlir              |  45 ++--
 mlir/test/kungpu/memory_planning.mlir    |  49 +++--
 mlir/test/kunir/basic.mlir               | 113 ++++++----
 mlir/test/kunir/func.mlir                |  66 ++++++
 mlir/test/kunir/lower_to_kungpu.mlir     |  58 +++--
 19 files changed, 723 insertions(+), 180 deletions(-)
 create mode 100644 mlir/include/KunIr/KunIrAttrs.h
 create mode 100644 mlir/include/KunIr/KunIrAttrs.td
 create mode 100644 mlir/lib/KunIr/KunIrAttrs.cpp
 create mode 100644 mlir/test/kunir/func.mlir

diff --git a/mlir/include/KunGpu/Passes.h b/mlir/include/KunGpu/Passes.h
index d28100d..0a727a0 100644
--- a/mlir/include/KunGpu/Passes.h
+++ b/mlir/include/KunGpu/Passes.h
@@ -1,7 +1,9 @@
 #pragma once
 
+// KunIrOps.h must be included before Passes.h.inc so that ::kunir::FuncOp
+// is fully declared when the OperationPass<::kunir::FuncOp> template is used.
+#include "KunIr/KunIrOps.h"
 #include "mlir/Pass/Pass.h"
-#include <cstdint>
 #include <memory>
 
 namespace kungpu {
@@ -9,16 +11,8 @@ namespace kungpu {
 #define GEN_PASS_DECL
 #include "KunGpu/Passes.h.inc"
 
-// Default-args factory used by the pipeline registration and kun-opt.
 std::unique_ptr<mlir::Pass> createWindowedTempMemoryPlanningPass();
 
-// Parametric factory for use by callers that provide hardware config.
-std::unique_ptr<mlir::Pass>
-createWindowedTempMemoryPlanningPass(int64_t totalSmemSize,
-                                     int64_t targetOccupancy,
-                                     int64_t numThreadsPerBlock,
-                                     int64_t vectorSize);
-
 #define GEN_PASS_REGISTRATION
 #include "KunGpu/Passes.h.inc"
 
diff --git a/mlir/include/KunGpu/Passes.td b/mlir/include/KunGpu/Passes.td
index 972478f..d3bc7f1 100644
--- a/mlir/include/KunGpu/Passes.td
+++ b/mlir/include/KunGpu/Passes.td
@@ -4,29 +4,25 @@
 include "mlir/Pass/PassBase.td"
 
 def WindowedTempMemoryPlanning
-    : Pass<"kungpu-memory-planning", "::mlir::func::FuncOp"> {
+    : Pass<"kungpu-memory-planning", "::kunir::FuncOp"> {
   let summary = "Assign shared/local memory to kungpu.windowed_temp ops";
   let description = [{
     Greedy memory planning pass for kungpu.windowed_temp allocations.
 
     Sorts windowed_temp ops by ascending window size and assigns shared memory
-    to as many as fit within the per-block budget:
+    to as many as fit within the per-block budget.  All parameters are read
+    from the enclosing kunir.func's target_spec attribute:
 
-      budget_per_block = total_smem_size / target_occupancy
-      bytes_per_buffer = N * num_threads_per_block * vector_size * elem_bytes
+      budget_per_block = target_spec.smem_size / target_spec.occupancy
+      num_threads      = target_spec.warps_per_cta * 32
+      bytes_per_buffer = N * num_threads * target_spec.vector_size * elem_bytes
 
     The result is written as a discardable boolean attribute "kungpu.smem" on
     each windowed_temp op.  The pass does not change IR structure; address-space
     selection is deferred to the to-LLVM lowering.
-
-    Parameters (passed at construction, not CLI options):
-      total_smem_size      - device shared memory in bytes
-      target_occupancy     - concurrent blocks per SM
-      num_threads_per_block
-      vector_size          - scalar elements per thread per time step
   }];
   let constructor = "::kungpu::createWindowedTempMemoryPlanningPass()";
-  let dependentDialects = ["::mlir::func::FuncDialect"];
+  let dependentDialects = ["::kungpu::KunGpuDialect"];
 }
 
 #endif // KUNGPU_PASSES_TD
diff --git a/mlir/include/KunIr/CMakeLists.txt b/mlir/include/KunIr/CMakeLists.txt
index a3c44d7..15db184 100644
--- a/mlir/include/KunIr/CMakeLists.txt
+++ b/mlir/include/KunIr/CMakeLists.txt
@@ -1,2 +1,8 @@
 add_mlir_dialect(KunIrOps kunir)
 add_mlir_interface(KunIrInterfaces)
+
+# AttrDef generation (add_mlir_dialect does not cover attrdefs)
+set(LLVM_TARGET_DEFINITIONS KunIrOps.td)
+mlir_tablegen(KunIrOpsAttrDefs.h.inc -gen-attrdef-decls -attrdefs-dialect=kunir)
+mlir_tablegen(KunIrOpsAttrDefs.cpp.inc -gen-attrdef-defs  -attrdefs-dialect=kunir)
+add_mlir_dialect_tablegen_target(MLIRKunIrAttrsIncGen)
diff --git a/mlir/include/KunIr/KunIrAttrs.h b/mlir/include/KunIr/KunIrAttrs.h
new file mode 100644
index 0000000..3e5eb76
--- /dev/null
+++ b/mlir/include/KunIr/KunIrAttrs.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "KunIr/KunIrDialect.h"
+
+#define GET_ATTRDEF_CLASSES
+#include "KunIr/KunIrOpsAttrDefs.h.inc"
diff --git a/mlir/include/KunIr/KunIrAttrs.td b/mlir/include/KunIr/KunIrAttrs.td
new file mode 100644
index 0000000..97a7ddc
--- /dev/null
+++ b/mlir/include/KunIr/KunIrAttrs.td
@@ -0,0 +1,30 @@
+#ifndef KUNIR_ATTRS_TD
+#define KUNIR_ATTRS_TD
+
+include "KunIr/KunIrDialect.td"
+
+// TargetSpec attribute: hardware parameters for a kunir.func op.
+def KunIr_TargetSpecAttr : AttrDef<KunIr_Dialect, "TargetSpec"> {
+  let mnemonic = "target_spec";
+  let summary = "Hardware target parameters for a kunir function";
+  let description = [{
+    Describes the GPU launch configuration for a kunir function:
+      occupancy      — target concurrent blocks per SM
+      warps_per_cta  — warps per thread block
+      smem_size      — total shared memory bytes available on one SM
+                       (per-block budget = smem_size / occupancy)
+      vector_size    — scalar elements per thread per time step
+
+    Printed inline inside kunir.func as:
+      target {occupancy = N, warps_per_cta = N, smem_size = N, vector_size = N}
+  }];
+  let parameters = (ins
+    "int64_t":$occupancy,
+    "int64_t":$warps_per_cta,
+    "int64_t":$smem_size,
+    "int64_t":$vector_size
+  );
+  let hasCustomAssemblyFormat = 1;
+}
+
+#endif // KUNIR_ATTRS_TD
diff --git a/mlir/include/KunIr/KunIrDialect.td b/mlir/include/KunIr/KunIrDialect.td
index def6242..f92be23 100644
--- a/mlir/include/KunIr/KunIrDialect.td
+++ b/mlir/include/KunIr/KunIrDialect.td
@@ -13,8 +13,10 @@ def KunIr_Dialect : Dialect {
   }];
   let cppNamespace = "::kunir";
   let useDefaultTypePrinterParser = 1;
+  let useDefaultAttributePrinterParser = 1;
   let extraClassDeclaration = [{
     void registerTypes();
+    void registerAttrs();
   }];
 }
 
diff --git a/mlir/include/KunIr/KunIrOps.h b/mlir/include/KunIr/KunIrOps.h
index 99b7206..a45fabd 100644
--- a/mlir/include/KunIr/KunIrOps.h
+++ b/mlir/include/KunIr/KunIrOps.h
@@ -8,6 +8,7 @@
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+#include "KunIr/KunIrAttrs.h"
 #include "KunIr/KunIrDialect.h"
 #include "KunIr/KunIrInterfaces.h"
 #include "KunIr/KunIrTypes.h"
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index bce2df2..e07572f 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -4,10 +4,12 @@
 include "KunIr/KunIrDialect.td"
 include "KunIr/KunIrTypes.td"
 include "KunIr/KunIrInterfaces.td"
+include "KunIr/KunIrAttrs.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/IR/OpBase.td"
+include "mlir/IR/SymbolInterfaces.td"
 
 class KunIr_Op<string mnemonic, list<Trait> traits = []>
     : Op<KunIr_Dialect, mnemonic, traits>;
@@ -219,4 +221,81 @@ def KunIr_ReduceMinOp : KunIr_ReduceOp<"reduce_min"> {
   let summary = "Minimum reduction over the back window";
 }
 
+//===----------------------------------------------------------------------===//
+// FuncOp — function with named I/O and hardware target spec
+//
+// Custom assembly format:
+//
+//   kunir.func @name(%arg0: type0, ...)
+//       inputs {%arg0 = "name0", ...}
+//       outputs {"out0", ...}            // non-void: one string per result
+//       target {occupancy = V, warps_per_cta = V, smem_size = V, vector_size = V}
+//       -> (result_type0, ...) {
+//     body
+//   }
+//
+//   kunir.func @name(%arg0: type0, %arg1: type1)
+//       inputs {%arg0 = "input0"}
+//       outputs {%arg1 = "output0"}      // void: %argN = "name" form
+//       target {...} {
+//     body
+//   }
+//
+// Constraints (void case):   len(inputs) + len(outputs) == len(block_args)
+// Constraints (non-void):    len(inputs) == len(block_args),
+//                            len(outputs) == num_results
+//===----------------------------------------------------------------------===//
+
+def KunIr_FuncOp : KunIr_Op<"func", [
+    IsolatedFromAbove,
+    Symbol,
+    SingleBlockImplicitTerminator<"::kunir::ReturnOp">
+]> {
+  let summary = "KunQuant function with named I/O and hardware target spec";
+  let arguments = (ins
+    StrAttr:$sym_name,
+    TypeAttr:$function_type,
+    ArrayAttr:$input_names,
+    ArrayAttr:$output_names,
+    KunIr_TargetSpecAttr:$target_spec
+  );
+  let regions = (region SizedRegion<1>:$body);
+  let hasCustomAssemblyFormat = 1;
+  let hasVerifier = 1;
+  let skipDefaultBuilders = 1;
+  let builders = [
+    OpBuilder<(ins "::llvm::StringRef":$name,
+                   "::mlir::FunctionType":$type,
+                   "::mlir::ArrayAttr":$inputNames,
+                   "::mlir::ArrayAttr":$outputNames,
+                   "::kunir::TargetSpecAttr":$targetSpec)>
+  ];
+  let extraClassDeclaration = [{
+    // getFunctionType() is generated by tablegen and returns mlir::Type.
+    // This typed helper casts it for callers that need mlir::FunctionType.
+    ::mlir::FunctionType getFunctionTypeTyped() {
+      return llvm::cast<::mlir::FunctionType>(getFunctionType());
+    }
+    ::mlir::Block &getBodyBlock() { return getBody().front(); }
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// ReturnOp — terminator for kunir.func body
+//===----------------------------------------------------------------------===//
+
+def KunIr_ReturnOp : KunIr_Op<"return", [
+    Pure,
+    Terminator,
+    ReturnLike,
+    HasParent<"::kunir::FuncOp">
+]> {
+  let summary = "Return from a kunir.func body";
+  let arguments = (ins Variadic<AnyType>:$operands);
+  let assemblyFormat = "($operands^ `:` type($operands))? attr-dict";
+  let hasVerifier = 1;
+  // Zero-arg builder used by SingleBlockImplicitTerminator (ensureTerminator).
+  let builders = [OpBuilder<(ins), [{}]>];
+}
+
 #endif // KUNIR_OPS_TD
diff --git a/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
index f61c7f8..c6ae579 100644
--- a/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
+++ b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
@@ -9,22 +9,21 @@
 // fit more easily into shared memory) and greedily assign shared memory until
 // the per-block budget is exhausted.
 //
-// Budget:
-//   smem_per_block  = total_smem_size / target_occupancy
-//   bytes_per_buf   = N * num_threads_per_block * vector_size * elem_bytes
-//
-// Parameters are passed as plain integers (not IR attributes) because the
-// hardware/launch-config attributes are not yet wired into the IR.
+// Budget (from the enclosing kunir.func target_spec):
+//   budget_per_block = target_spec.smem_size / target_spec.occupancy
+//   num_threads      = target_spec.warps_per_cta * 32
+//   bytes_per_buf    = N * num_threads * target_spec.vector_size * elem_bytes
 //
 //===----------------------------------------------------------------------===//
 
-// MLIR and local headers must come before GEN_PASS_DEF_* so that ::mlir
-// and func::FuncOp are fully declared when Passes.h.inc is expanded.
+// MLIR and local headers must come before GEN_PASS_DEF_* so that ::kunir
+// is fully declared when Passes.h.inc is expanded.
 #include "KunGpu/KunGpuOps.h"
 #include "KunGpu/Passes.h"
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
 #include "KunIr/KunIrTypes.h"
 
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "llvm/ADT/SmallVector.h"
@@ -63,26 +62,23 @@ static unsigned elemBytes(Type t) {
 struct WindowedTempMemoryPlanningPass
     : ::impl::WindowedTempMemoryPlanningBase<WindowedTempMemoryPlanningPass> {
 
-  // Parametric constructor — used by createWindowedTempMemoryPlanningPass().
-  WindowedTempMemoryPlanningPass(int64_t totalSmemSize, int64_t targetOccupancy,
-                                 int64_t numThreadsPerBlock, int64_t vectorSize)
-      : totalSmemSize(totalSmemSize), targetOccupancy(targetOccupancy),
-        numThreadsPerBlock(numThreadsPerBlock), vectorSize(vectorSize) {}
-
-  // Default constructor — used by the pipeline registration factory.
-  WindowedTempMemoryPlanningPass()
-      : WindowedTempMemoryPlanningPass(
-            /*totalSmemSize=*/49152, // 48 KB (typical Ampere)
-            /*targetOccupancy=*/1,
-            /*numThreadsPerBlock=*/32,
-            /*vectorSize=*/1) {}
-
   void runOnOperation() override {
-    func::FuncOp funcOp = getOperation();
+    kunir::FuncOp funcOp = getOperation();
     MLIRContext *ctx = &getContext();
 
     // -----------------------------------------------------------------------
-    // 1. Collect all windowed_temp ops in the function.
+    // 1. Read hardware parameters from target_spec.
+    //    smem_size is the total SM shared memory; divide by occupancy to get
+    //    the per-block budget.
+    // -----------------------------------------------------------------------
+    auto ts = funcOp.getTargetSpec();
+    int64_t occupancy       = ts.getOccupancy();
+    int64_t budgetPerBlock  = (occupancy > 0) ? (ts.getSmemSize() / occupancy) : 0;
+    int64_t numThreads      = ts.getWarpsPerCta() * 32;
+    int64_t vectorSize      = ts.getVectorSize();
+
+    // -----------------------------------------------------------------------
+    // 2. Collect all windowed_temp ops in the function.
     // -----------------------------------------------------------------------
     SmallVector<WindowedTempOp> temps;
     funcOp.walk([&](WindowedTempOp op) { temps.push_back(op); });
@@ -90,12 +86,6 @@ struct WindowedTempMemoryPlanningPass
     if (temps.empty())
       return;
 
-    // -----------------------------------------------------------------------
-    // 2. Compute per-block shared memory budget (bytes).
-    // -----------------------------------------------------------------------
-    int64_t budgetPerBlock =
-        (targetOccupancy > 0) ? (totalSmemSize / targetOccupancy) : 0;
-
     // -----------------------------------------------------------------------
     // 3. Sort by ascending window size (smaller N → higher smem priority).
     // -----------------------------------------------------------------------
@@ -120,7 +110,7 @@ struct WindowedTempMemoryPlanningPass
         continue;
       }
 
-      int64_t bytes = static_cast<int64_t>(N) * numThreadsPerBlock *
+      int64_t bytes = static_cast<int64_t>(N) * numThreads *
                       vectorSize * elemBytes(tsTy.getElementType());
 
       bool useSmem =
@@ -140,17 +130,12 @@ struct WindowedTempMemoryPlanningPass
                             << usedSmem << " / budget=" << budgetPerBlock
                             << "\n");
   }
-
-  int64_t totalSmemSize;
-  int64_t targetOccupancy;
-  int64_t numThreadsPerBlock;
-  int64_t vectorSize;
 };
 
 } // namespace
 
 //===----------------------------------------------------------------------===//
-// Public factory functions
+// Public factory function
 //===----------------------------------------------------------------------===//
 
 namespace kungpu {
@@ -159,13 +144,4 @@ std::unique_ptr<mlir::Pass> createWindowedTempMemoryPlanningPass() {
   return std::make_unique<WindowedTempMemoryPlanningPass>();
 }
 
-std::unique_ptr<mlir::Pass>
-createWindowedTempMemoryPlanningPass(int64_t totalSmemSize,
-                                     int64_t targetOccupancy,
-                                     int64_t numThreadsPerBlock,
-                                     int64_t vectorSize) {
-  return std::make_unique<WindowedTempMemoryPlanningPass>(
-      totalSmemSize, targetOccupancy, numThreadsPerBlock, vectorSize);
-}
-
 } // namespace kungpu
diff --git a/mlir/lib/KunIr/CMakeLists.txt b/mlir/lib/KunIr/CMakeLists.txt
index 317f4a0..7bc4c0b 100644
--- a/mlir/lib/KunIr/CMakeLists.txt
+++ b/mlir/lib/KunIr/CMakeLists.txt
@@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRKunIrDialect
   KunIrTypes.cpp
   KunIrInterfaces.cpp
   KunIrOps.cpp
+  KunIrAttrs.cpp
 
   PARTIAL_SOURCES_INTENDED
 
@@ -12,9 +13,11 @@ add_mlir_dialect_library(MLIRKunIrDialect
   DEPENDS
   MLIRKunIrOpsIncGen
   MLIRKunIrInterfacesIncGen
+  MLIRKunIrAttrsIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
+  MLIRFuncDialect
   MLIRSideEffectInterfaces
 )
 
diff --git a/mlir/lib/KunIr/KunIrAttrs.cpp b/mlir/lib/KunIr/KunIrAttrs.cpp
new file mode 100644
index 0000000..081e377
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrAttrs.cpp
@@ -0,0 +1,55 @@
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+using namespace mlir;
+using namespace kunir;
+
+#define GET_ATTRDEF_CLASSES
+#include "KunIr/KunIrOpsAttrDefs.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// TargetSpecAttr — custom assembly format
+//
+// Inline format (used inside kunir.func):
+//   {occupancy = V, warps_per_cta = V, smem_size = V, vector_size = V}
+// Canonical MLIR attribute form (used stand-alone):
+//   #kunir.target_spec<{occupancy = V, warps_per_cta = V, smem_size = V,
+//                       vector_size = V}>
+//===----------------------------------------------------------------------===//
+
+Attribute TargetSpecAttr::parse(AsmParser &parser, Type) {
+  int64_t occupancy = 0, warpsPerCta = 0, smemSize = 0, vectorSize = 1;
+  if (parser.parseLBrace() ||
+      parser.parseKeyword("occupancy") || parser.parseEqual() ||
+      parser.parseInteger(occupancy) || parser.parseComma() ||
+      parser.parseKeyword("warps_per_cta") || parser.parseEqual() ||
+      parser.parseInteger(warpsPerCta) || parser.parseComma() ||
+      parser.parseKeyword("smem_size") || parser.parseEqual() ||
+      parser.parseInteger(smemSize) || parser.parseComma() ||
+      parser.parseKeyword("vector_size") || parser.parseEqual() ||
+      parser.parseInteger(vectorSize) || parser.parseRBrace())
+    return {};
+  return TargetSpecAttr::get(parser.getContext(), occupancy, warpsPerCta,
+                              smemSize, vectorSize);
+}
+
+void TargetSpecAttr::print(AsmPrinter &printer) const {
+  printer << "{occupancy = " << getOccupancy()
+          << ", warps_per_cta = " << getWarpsPerCta()
+          << ", smem_size = " << getSmemSize()
+          << ", vector_size = " << getVectorSize() << "}";
+}
+
+//===----------------------------------------------------------------------===//
+// Dialect attr registration
+//===----------------------------------------------------------------------===//
+
+void KunIrDialect::registerAttrs() {
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "KunIr/KunIrOpsAttrDefs.cpp.inc"
+  >();
+}
diff --git a/mlir/lib/KunIr/KunIrDialect.cpp b/mlir/lib/KunIr/KunIrDialect.cpp
index 9eb1a62..ee03c47 100644
--- a/mlir/lib/KunIr/KunIrDialect.cpp
+++ b/mlir/lib/KunIr/KunIrDialect.cpp
@@ -1,3 +1,4 @@
+#include "KunIr/KunIrAttrs.h"
 #include "KunIr/KunIrDialect.h"
 #include "KunIr/KunIrOps.h"
 #include "KunIr/KunIrTypes.h"
@@ -17,4 +18,5 @@ void KunIrDialect::initialize() {
 #include "KunIr/KunIrOps.cpp.inc"
   >();
   registerTypes();
+  registerAttrs();
 }
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index e17378a..638ae4f 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -1,4 +1,5 @@
 #include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrAttrs.h"
 #include "KunIr/KunIrInterfaces.h"
 #include "KunIr/KunIrTypes.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -35,6 +36,10 @@ void kunir::YieldOp::build(mlir::OpBuilder &, mlir::OperationState &) {
   // Empty build: produces a zero-operand yield for ensureTerminator.
 }
 
+void kunir::ReturnOp::build(mlir::OpBuilder &, mlir::OperationState &) {
+  // Empty build: produces a zero-operand return for ensureTerminator.
+}
+
 //===----------------------------------------------------------------------===//
 // Binary elemwise ops — verify only (inferReturnTypes is in ElemwiseTsResultType)
 //===----------------------------------------------------------------------===//
@@ -369,3 +374,258 @@ TypedAttr ReduceMinOp::getInitValue(FloatType elemType) {
 Value ReduceMinOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
   return b.create<arith::MinimumFOp>(loc, acc, elem);
 }
+
+//===----------------------------------------------------------------------===//
+// FuncOp
+//===----------------------------------------------------------------------===//
+
+void FuncOp::build(OpBuilder &b, OperationState &result,
+                   StringRef name, FunctionType type,
+                   ArrayAttr inputNames, ArrayAttr outputNames,
+                   TargetSpecAttr targetSpec) {
+  result.addAttribute(getSymNameAttrName(result.name), b.getStringAttr(name));
+  result.addAttribute(getFunctionTypeAttrName(result.name), TypeAttr::get(type));
+  result.addAttribute(getInputNamesAttrName(result.name), inputNames);
+  result.addAttribute(getOutputNamesAttrName(result.name), outputNames);
+  result.addAttribute(getTargetSpecAttrName(result.name), targetSpec);
+  Region *body = result.addRegion();
+  Block *block = new Block;
+  for (Type inputType : type.getInputs())
+    block->addArgument(inputType, result.location);
+  body->push_back(block);
+}
+
+LogicalResult FuncOp::verify() {
+  // function_type is a plain TypeAttr: reject non-function types up front.
+  auto ft = llvm::dyn_cast<FunctionType>(getFunctionType());
+  if (!ft) return emitOpError("function_type must be a function type");
+
+  // Block args must match function input types
+  Block &block = getBodyBlock();
+  if (block.getNumArguments() != ft.getNumInputs())
+    return emitOpError("body block has ") << block.getNumArguments()
+           << " args but function type has " << ft.getNumInputs() << " inputs";
+  for (auto [i, argType] : llvm::enumerate(ft.getInputs()))
+    if (block.getArgument(i).getType() != argType)
+      return emitOpError("block arg #") << i << " type mismatch";
+
+  // Validate input_names / output_names counts
+  auto inputNames = getInputNames(), outputNames = getOutputNames();
+  unsigned numResults = ft.getNumResults();
+  if (numResults > 0) {
+    // Non-void: inputs == num_args, outputs == num_results
+    if (inputNames.size() != ft.getNumInputs())
+      return emitOpError("non-void func: input_names count (")
+             << inputNames.size() << ") != num args (" << ft.getNumInputs() << ")";
+    if (outputNames.size() != numResults)
+      return emitOpError("non-void func: output_names count (")
+             << outputNames.size() << ") != num results (" << numResults << ")";
+  } else {
+    // Void: inputs + outputs == num_args
+    if (inputNames.size() + outputNames.size() != ft.getNumInputs())
+      return emitOpError("void func: input_names + output_names count (")
+             << (inputNames.size() + outputNames.size())
+             << ") != num args (" << ft.getNumInputs() << ")";
+  }
+
+  // Validate all names are StringAttr
+  for (auto [i, a] : llvm::enumerate(inputNames))
+    if (!llvm::isa<StringAttr>(a))
+      return emitOpError("input_names[") << i << "] is not a StringAttr";
+  for (auto [i, a] : llvm::enumerate(outputNames))
+    if (!llvm::isa<StringAttr>(a))
+      return emitOpError("output_names[") << i << "] is not a StringAttr";
+
+  // Validate target_spec (vector_size scales the memory planner's byte count)
+  auto ts = getTargetSpec();
+  if (ts.getOccupancy() <= 0)
+    return emitOpError("target occupancy must be positive, got ")
+           << ts.getOccupancy();
+  if (ts.getWarpsPerCta() <= 0)
+    return emitOpError("target warps_per_cta must be positive, got ")
+           << ts.getWarpsPerCta();
+  if (ts.getSmemSize() < 0)
+    return emitOpError("target smem_size must be non-negative, got ")
+           << ts.getSmemSize();
+  if (ts.getVectorSize() <= 0)
+    return emitOpError("target vector_size must be positive, got ")
+           << ts.getVectorSize();
+  return success();
+}
+
+ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) {
+  Builder &b = parser.getBuilder();
+
+  // @sym_name
+  StringAttr nameAttr;
+  if (parser.parseSymbolName(nameAttr, getSymNameAttrName(result.name),
+                             result.attributes))
+    return failure();
+
+  // (%arg0 : type0, ...)
+  SmallVector<OpAsmParser::Argument> blockArgs;
+  if (parser.parseArgumentList(blockArgs, OpAsmParser::Delimiter::Paren,
+                               /*allowType=*/true, /*allowAttrs=*/false))
+    return failure();
+
+  // inputs { %name = "str", ... }
+  SmallVector<Attribute> inputNameAttrs;
+  if (parser.parseKeyword("inputs") || parser.parseLBrace())
+    return failure();
+  if (parser.parseOptionalRBrace().failed()) {
+    do {
+      OpAsmParser::UnresolvedOperand argRef;
+      StringAttr nameStr;
+      if (parser.parseOperand(argRef) || parser.parseEqual() ||
+          parser.parseAttribute(nameStr))
+        return failure();
+      inputNameAttrs.push_back(nameStr);
+    } while (parser.parseOptionalComma().succeeded());
+    if (parser.parseRBrace()) return failure();
+  }
+
+  // outputs { "str", ... } or { %name = "str", ... }  ("%name =" is optional)
+  SmallVector<Attribute> outputNameAttrs;
+  if (parser.parseKeyword("outputs") || parser.parseLBrace())
+    return failure();
+  if (parser.parseOptionalRBrace().failed()) {
+    do {
+      // Try %name = "str" form; if no %, fall through to "str" form
+      OpAsmParser::UnresolvedOperand argRef;
+      auto optArg = parser.parseOptionalOperand(argRef);
+      if (optArg.has_value()) {
+        if (failed(*optArg) || parser.parseEqual()) return failure();
+      }
+      StringAttr nameStr;
+      if (parser.parseAttribute(nameStr)) return failure();
+      outputNameAttrs.push_back(nameStr);
+    } while (parser.parseOptionalComma().succeeded());
+    if (parser.parseRBrace()) return failure();
+  }
+
+  // target { occupancy = V, warps_per_cta = V, smem_size = V, vector_size = V }
+  if (parser.parseKeyword("target")) return failure();
+  auto targetSpec = TargetSpecAttr::parse(parser, Type{});
+  if (!targetSpec) return failure();
+  result.addAttribute(getTargetSpecAttrName(result.name), targetSpec);
+
+  // -> (result_type, ...) or -> result_type  [optional]
+  SmallVector<Type> resultTypes;
+  if (parser.parseOptionalArrow().succeeded()) {
+    if (parser.parseOptionalLParen().succeeded()) {
+      if (!parser.parseOptionalRParen().succeeded()) {
+        if (parser.parseTypeList(resultTypes) || parser.parseRParen())
+          return failure();
+      }
+    } else {
+      Type singleTy;
+      if (parser.parseType(singleTy)) return failure();
+      resultTypes.push_back(singleTy);
+    }
+  }
+
+  // Build function type from block arg types + result types
+  SmallVector<Type> inputTypes;
+  for (auto &arg : blockArgs) inputTypes.push_back(arg.type);
+  auto funcType = FunctionType::get(result.getContext(), inputTypes, resultTypes);
+  result.addAttribute(getFunctionTypeAttrName(result.name),
+                      TypeAttr::get(funcType));
+  result.addAttribute(getInputNamesAttrName(result.name),
+                      b.getArrayAttr(inputNameAttrs));
+  result.addAttribute(getOutputNamesAttrName(result.name),
+                      b.getArrayAttr(outputNameAttrs));
+
+  // Body region
+  Region *body = result.addRegion();
+  if (parser.parseRegion(*body, blockArgs)) return failure();
+  FuncOp::ensureTerminator(*body, b, result.location);
+  return success();
+}
+
+// Prints the custom assembly form that FuncOp::parse reads back.
+void FuncOp::print(OpAsmPrinter &p) {
+  Block &block = getBodyBlock();
+  FunctionType ft = getFunctionTypeTyped();
+
+  // @name — printSymbolName adds quotes when the name is not a bare
+  // identifier, so every name parseSymbolName accepted round-trips.
+  p << ' ';
+  p.printSymbolName(getSymName());
+
+  // (%arg0 : type0, ...)
+  p << "(";
+  llvm::interleaveComma(block.getArguments(), p, [&](BlockArgument arg) {
+    p << arg << ": " << arg.getType();
+  });
+  p << ")";
+
+  // inputs {%arg0 = "name0", ...}
+  auto inputNames = getInputNames();
+  unsigned numInputs = inputNames.size();
+  p << " inputs {";
+  for (unsigned i = 0; i < numInputs; ++i) {
+    if (i) p << ", ";
+    p << block.getArgument(i) << " = "
+      << llvm::cast<StringAttr>(inputNames[i]);
+  }
+  p << "}";
+
+  // outputs {...}
+  auto outputNames = getOutputNames();
+  p << " outputs {";
+  if (ft.getNumResults() == 0) {
+    // void: %argN = "name" form
+    for (unsigned i = 0; i < outputNames.size(); ++i) {
+      if (i) p << ", ";
+      p << block.getArgument(numInputs + i) << " = "
+        << llvm::cast<StringAttr>(outputNames[i]);
+    }
+  } else {
+    // non-void: just "name" strings
+    llvm::interleaveComma(outputNames, p,
+                          [&](Attribute a) { p << llvm::cast<StringAttr>(a); });
+  }
+  p << "}";
+
+  // target {occupancy = ..., ...}
+  p << " target ";
+  getTargetSpec().print(p);
+
+  // -> result types (non-void); a lone result is printed without parentheses
+  auto resultTypes = ft.getResults();
+  if (!resultTypes.empty()) {
+    bool parenthesize = resultTypes.size() != 1;
+    p << " -> ";
+    if (parenthesize) p << "(";
+    llvm::interleaveComma(resultTypes, p);
+    if (parenthesize) p << ")";
+  }
+
+  // body
+  p << " ";
+  p.printRegion(getBody(), /*printEntryBlockArgs=*/false,
+                /*printBlockTerminators=*/true);
+}
+
+//===----------------------------------------------------------------------===//
+// ReturnOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult ReturnOp::verify() {
+  // The HasParent<FuncOp> trait on this op makes the cast below safe.
+  auto parent = llvm::cast<FuncOp>((*this)->getParentOp());
+  auto expected = parent.getFunctionTypeTyped().getResults();
+  auto returned = getOperandTypes();
+
+  // Arity first: the element-wise walk below needs equal-length ranges.
+  if (returned.size() != expected.size())
+    return emitOpError("returns ") << returned.size()
+           << " value(s) but function has " << expected.size()
+           << " result type(s)";
+
+  for (unsigned i = 0, e = expected.size(); i != e; ++i)
+    if (returned[i] != expected[i])
+      return emitOpError("operand #") << i << " type '" << returned[i]
+             << "' does not match function result type '" << expected[i] << "'";
+  return success();
+}
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 4f732ba..b101c34 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -1,6 +1,6 @@
 //===- KunIrToKunGpu.cpp - Lower kunir ops to kungpu + scf + arith --------===//
 //
-// Lowers a func.func whose body contains kunir ops into a form that uses:
+// Lowers a kunir.func whose body contains kunir ops into a form that uses:
 //   - kungpu.time_length / kungpu.ts.get / kungpu.ts.put  for ts I/O
 //   - scf.for for the outer time loop and inner back-window loops
 //   - arith.* / math.* for scalar arithmetic
@@ -21,7 +21,6 @@
 #include "KunIr/KunIrTypes.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Builders.h"
@@ -121,16 +120,14 @@ static LogicalResult lowerBlock(
 //===----------------------------------------------------------------------===//
 
 struct LowerKunIrToKunGpuPass
-    : PassWrapper<LowerKunIrToKunGpuPass, OperationPass<func::FuncOp>> {
+    : PassWrapper<LowerKunIrToKunGpuPass, OperationPass<kunir::FuncOp>> {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerKunIrToKunGpuPass)
   StringRef getArgument()    const override { return "kunir-to-kungpu"; }
   StringRef getDescription() const override {
     return "Lower kunir ops to kungpu + scf + arith/math"; }
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<kungpu::KunGpuDialect>();
-    registry.insert<arith::ArithDialect>();
-    registry.insert<math::MathDialect>();
-    registry.insert<scf::SCFDialect>();
+    registry.insert<kungpu::KunGpuDialect, arith::ArithDialect,
+                    math::MathDialect, scf::SCFDialect>();
   }
   void runOnOperation() override;
 };
@@ -138,7 +135,7 @@ struct LowerKunIrToKunGpuPass
 } // namespace
 
 void LowerKunIrToKunGpuPass::runOnOperation() {
-  func::FuncOp funcOp = getOperation();
+  kunir::FuncOp funcOp = getOperation();
   MLIRContext *ctx = &getContext();
   Location loc = funcOp.getLoc();
 
@@ -147,7 +144,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   // ------------------------------------------------------------------
   // 1. Extend function signature: ts return types → extra output params.
   // ------------------------------------------------------------------
-  FunctionType oldFT = funcOp.getFunctionType();
+  FunctionType oldFT = funcOp.getFunctionTypeTyped();
   SmallVector<Type> newArgTys(oldFT.getInputs());
   SmallVector<unsigned> tsRetIdx;
   for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
@@ -161,16 +158,17 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   SmallVector<Type> newRetTys;
   for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
     if (!isa<TsType>(ty)) newRetTys.push_back(ty);
-  funcOp.setFunctionType(FunctionType::get(ctx, newArgTys, newRetTys));
+  funcOp.setFunctionTypeAttr(
+      TypeAttr::get(FunctionType::get(ctx, newArgTys, newRetTys)));
 
   // ------------------------------------------------------------------
   // 2. Snapshot original ops and find the original return.
   // ------------------------------------------------------------------
   SmallVector<Operation *> origOps;
-  func::ReturnOp retOp;
+  kunir::ReturnOp retOp;
   for (Operation &op : entry) origOps.push_back(&op);
   for (Operation *op : origOps)
-    if (auto r = dyn_cast<func::ReturnOp>(op)) { retOp = r; break; }
+    if (auto r = dyn_cast<kunir::ReturnOp>(op)) { retOp = r; break; }
 
   // Collect ts return values from the original return.
   SmallVector<Value> tsRetVals;
@@ -217,7 +215,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   //    for_each_back_window, and func.return are handled by the callback.
   // ------------------------------------------------------------------
   auto outerHandler = [&](Operation &op) -> LogicalResult {
-    if (isa<func::ReturnOp>(op)) return success(); // handled in step 7
+    if (isa<kunir::ReturnOp>(op)) return success(); // handled in step 7
 
     Location ol = op.getLoc();
 
@@ -334,7 +332,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     SmallVector<Value> nonTsRets;
     for (Value v : retOp.getOperands())
       if (!isa<TsType>(v.getType())) nonTsRets.push_back(v);
-    b.create<func::ReturnOp>(loc, nonTsRets);
+    b.create<kunir::ReturnOp>(loc, mlir::ValueRange(nonTsRets));
   }
 
   // ------------------------------------------------------------------
diff --git a/mlir/test/kungpu/basic.mlir b/mlir/test/kungpu/basic.mlir
index 73469d1..2490685 100644
--- a/mlir/test/kungpu/basic.mlir
+++ b/mlir/test/kungpu/basic.mlir
@@ -1,44 +1,59 @@
 // RUN: %kun-opt %s | %FileCheck %s
 // RUN: %kun-opt %s | %kun-opt | %FileCheck %s
 
-// CHECK-LABEL: func.func @test_stock_id
-func.func @test_stock_id() -> index {
+// CHECK-LABEL: kunir.func @test_stock_id
+kunir.func @test_stock_id()
+    inputs {} outputs {"id"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> index {
   // CHECK: kungpu.stock_id
   %id = kungpu.stock_id
-  return %id : index
+  kunir.return %id : index
 }
 
-// CHECK-LABEL: func.func @test_block_stock_count
-func.func @test_block_stock_count() -> index {
+// CHECK-LABEL: kunir.func @test_block_stock_count
+kunir.func @test_block_stock_count()
+    inputs {} outputs {"n"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> index {
   // CHECK: kungpu.block_stock_count
   %n = kungpu.block_stock_count
-  return %n : index
+  kunir.return %n : index
 }
 
-// CHECK-LABEL: func.func @test_time_length
-func.func @test_time_length() -> index {
+// CHECK-LABEL: kunir.func @test_time_length
+kunir.func @test_time_length()
+    inputs {} outputs {"len"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> index {
   // CHECK: kungpu.time_length
   %len = kungpu.time_length
-  return %len : index
+  kunir.return %len : index
 }
 
-// CHECK-LABEL: func.func @test_ts_get_put
-func.func @test_ts_get_put(%ts_in: !kunir.ts<f32, inf>, %ts_out: !kunir.ts<f32, 1>) {
+// CHECK-LABEL: kunir.func @test_ts_get_put
+kunir.func @test_ts_get_put(%ts_in: !kunir.ts<f32, inf>, %ts_out: !kunir.ts<f32, 1>)
+    inputs {%ts_in = "ts_in"}
+    outputs {%ts_out = "ts_out"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} {
   %c0 = arith.constant 0 : index
   // CHECK: kungpu.ts.get
   // CHECK-SAME: <f32, inf> -> f32
   %v = kungpu.ts.get %ts_in[%c0] : !kunir.ts<f32, inf> -> f32
   // CHECK: kungpu.ts.put
   kungpu.ts.put %ts_out[%c0], %v : !kunir.ts<f32, 1>, f32
-  return
+  kunir.return
 }
 
-// CHECK-LABEL: func.func @test_windowed_temp
-func.func @test_windowed_temp() -> f32 {
+// CHECK-LABEL: kunir.func @test_windowed_temp
+kunir.func @test_windowed_temp()
+    inputs {} outputs {"v"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK: %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
   %wt = kungpu.windowed_temp : !kunir.ts<f32, 5>
   // CHECK: kungpu.ts.get %[[WT]]
   %v = kungpu.ts.get %wt[%c0] : !kunir.ts<f32, 5> -> f32
-  return %v : f32
+  kunir.return %v : f32
 }
diff --git a/mlir/test/kungpu/memory_planning.mlir b/mlir/test/kungpu/memory_planning.mlir
index 0d55b18..aad77d9 100644
--- a/mlir/test/kungpu/memory_planning.mlir
+++ b/mlir/test/kungpu/memory_planning.mlir
@@ -1,18 +1,21 @@
 // RUN: %kun-opt --kungpu-memory-planning %s | %FileCheck %s
 //
-// Default pass parameters: total_smem=49152 bytes, occupancy=1,
-//                          num_threads=32, vec=1 → budget=49152 bytes
+// All three functions share the same target_spec:
+//   smem_size = 49152 bytes (per-SM total), occupancy = 1
+//   → per-block budget = 49152 / 1 = 49152 bytes
+//   warps_per_cta = 1  →  num_threads = 32
+//   vector_size = 1
 //
-// Buffer cost (f32=4 bytes): bytes = N * 32 * 1 * 4 = N * 128
+// Buffer cost (f32 = 4 bytes): bytes = N * 32 * 1 * 4 = N * 128
 //   N=3   →   384 bytes
 //   N=5   →   640 bytes
 //   N=10  →  1280 bytes
 //   N=400 → 51200 bytes  (> 49152)
 //   N=500 → 64000 bytes  (> 49152)
 //
-// Case 1 – all smem:   N=3  (384) + N=5  (640) + N=10  (1280) = 2304  ≤ budget
-// Case 2 – mixed:      N=5  (640) → smem; N=400 (51200) → 640+51200 > budget → local
-// Case 3 – all local:  N=400 (51200) > budget → local; N=500 (64000) > budget → local
+// Case 1 – all smem:   N=3 (384) + N=5 (640) + N=10 (1280) = 2304 ≤ 49152
+// Case 2 – mixed:      N=5 (640) → smem; N=400 (51200) → 640+51200 > 49152 → local
+// Case 3 – all local:  N=400 (51200) > 49152 → local; N=500 → local
 //
 // The pass sorts ops by ascending N before assigning, so declaration order
 // in the IR does not affect the assignment.
@@ -21,8 +24,11 @@
 // Case 1: all three buffers fit in shared memory
 // -----------------------------------------------------------------------
 
-// CHECK-LABEL: func.func @test_all_smem
-func.func @test_all_smem() {
+// CHECK-LABEL: kunir.func @test_all_smem
+kunir.func @test_all_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} {
   // Declared in reverse order to verify sort-by-N behaviour.
   // CHECK-DAG: kungpu.windowed_temp : <f32, 10> {kungpu.smem = true}
   %c = kungpu.windowed_temp : !kunir.ts<f32, 10>
@@ -30,36 +36,41 @@ func.func @test_all_smem() {
   %a = kungpu.windowed_temp : !kunir.ts<f32, 3>
   // CHECK-DAG: kungpu.windowed_temp : <f32, 5> {kungpu.smem = true}
   %b = kungpu.windowed_temp : !kunir.ts<f32, 5>
-  return
+  kunir.return
 }
 
 // -----------------------------------------------------------------------
 // Case 2: small buffer goes to smem, large buffer spills to local memory
 // -----------------------------------------------------------------------
 
-// CHECK-LABEL: func.func @test_mixed
-func.func @test_mixed() {
+// CHECK-LABEL: kunir.func @test_mixed
+kunir.func @test_mixed(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} {
   // N=400 (51200 bytes) is declared first but sorted after N=5 (640 bytes).
-  // N=5 takes 640 bytes of the 49152-byte budget; N=400 would need 51200
-  // more, which exceeds the remaining 48512 bytes → local.
+  // N=5 takes 640 bytes; N=400 would need 51200 more, exceeding 48512 remaining.
   // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
   %big = kungpu.windowed_temp : !kunir.ts<f32, 400>
   // CHECK-DAG: kungpu.windowed_temp : <f32, 5> {kungpu.smem = true}
   %small = kungpu.windowed_temp : !kunir.ts<f32, 5>
-  return
+  kunir.return
 }
 
 // -----------------------------------------------------------------------
 // Case 3: every buffer exceeds the budget on its own → all local memory
 // -----------------------------------------------------------------------
 
-// CHECK-LABEL: func.func @test_all_local
-func.func @test_all_local() {
-  // N=400 → 51200 bytes > 49152 (budget), so smem=false.
+// CHECK-LABEL: kunir.func @test_all_local
+kunir.func @test_all_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} {
+  // N=400 → 51200 bytes > 49152, smem=false.
   // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
   %a = kungpu.windowed_temp : !kunir.ts<f32, 400>
-  // N=500 → 64000 bytes > 49152 (budget), so smem=false.
+  // N=500 → 64000 bytes > 49152, smem=false.
   // CHECK-DAG: kungpu.windowed_temp : <f32, 500> {kungpu.smem = false}
   %b = kungpu.windowed_temp : !kunir.ts<f32, 500>
-  return
+  kunir.return
 }
diff --git a/mlir/test/kunir/basic.mlir b/mlir/test/kunir/basic.mlir
index 0c4d336..5a84451 100644
--- a/mlir/test/kunir/basic.mlir
+++ b/mlir/test/kunir/basic.mlir
@@ -1,25 +1,29 @@
 // RUN: %kun-opt %s | %FileCheck %s
 // RUN: %kun-opt %s | %kun-opt | %FileCheck %s
 
-// Verify the kunir dialect type and ops parse and round-trip.
+// Verify the kunir dialect types and ops parse and round-trip inside kunir.func.
 
-// CHECK-LABEL: func.func @test_ts_lookback_type
-func.func @test_ts_lookback_type(
-    // CHECK-SAME: !kunir.ts<f32, inf>
+// CHECK-LABEL: kunir.func @test_ts_lookback_type
+// CHECK-SAME: !kunir.ts<f32, inf>
+// CHECK-SAME: !kunir.ts<f32, 1>
+// CHECK-SAME: !kunir.ts<f64, 10>
+kunir.func @test_ts_lookback_type(
     %a: !kunir.ts<f32, inf>,
-    // CHECK-SAME: !kunir.ts<f32, 1>
     %b: !kunir.ts<f32, 1>,
-    // CHECK-SAME: !kunir.ts<f64, 10>
-    %c: !kunir.ts<f64, 10>
-) -> !kunir.ts<f32, 1> {
-  return %b : !kunir.ts<f32, 1>
+    %c: !kunir.ts<f64, 10>)
+    inputs {%a = "a", %b = "b", %c = "c"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
+  kunir.return %b : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_binary_mismatched_lookbacks
-func.func @test_binary_mismatched_lookbacks(
-    %a: !kunir.ts<f32, 5>,
-    %b: !kunir.ts<f32, 10>
-) -> !kunir.ts<f32, 1> {
+// CHECK-LABEL: kunir.func @test_binary_mismatched_lookbacks
+kunir.func @test_binary_mismatched_lookbacks(%a: !kunir.ts<f32, 5>, %b: !kunir.ts<f32, 10>)
+    inputs {%a = "a", %b = "b"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
   // CHECK: kunir.add
   // CHECK-SAME: <f32, 5>, <f32, 10>
   %sum = kunir.add %a, %b : !kunir.ts<f32, 5>, !kunir.ts<f32, 10>
@@ -27,36 +31,50 @@ func.func @test_binary_mismatched_lookbacks(
   %diff = kunir.sub %a, %b : !kunir.ts<f32, 5>, !kunir.ts<f32, 10>
   // CHECK: kunir.mul
   %prod = kunir.mul %sum, %diff : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
-  return %prod : !kunir.ts<f32, 1>
+  kunir.return %prod : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_unary
-func.func @test_unary(%x: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+// CHECK-LABEL: kunir.func @test_unary
+kunir.func @test_unary(%x: !kunir.ts<f32, inf>)
+    inputs {%x = "x"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
   // CHECK: kunir.abs
   %a = kunir.abs %x : !kunir.ts<f32, inf>
   // CHECK: kunir.sign
   %s = kunir.sign %a : !kunir.ts<f32, 1>
-  return %s : !kunir.ts<f32, 1>
+  kunir.return %s : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_windowed_output
-func.func @test_windowed_output(%input: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 10> {
+// CHECK-LABEL: kunir.func @test_windowed_output
+kunir.func @test_windowed_output(%input: !kunir.ts<f32, inf>)
+    inputs {%input = "input"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> !kunir.ts<f32, 10> {
   // CHECK: kunir.windowed_output
   // CHECK-SAME: length = 10
   %out = kunir.windowed_output %input [length = 10] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 10>
-  return %out : !kunir.ts<f32, 10>
+  kunir.return %out : !kunir.ts<f32, 10>
 }
 
-// CHECK-LABEL: func.func @test_cs_rank
-func.func @test_cs_rank(%input: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+// CHECK-LABEL: kunir.func @test_cs_rank
+kunir.func @test_cs_rank(%input: !kunir.ts<f32, inf>)
+    inputs {%input = "input"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
   // CHECK: kunir.cs_rank
   %ranked = kunir.cs_rank %input : !kunir.ts<f32, inf>
-  return %ranked : !kunir.ts<f32, 1>
+  kunir.return %ranked : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_for_each_back_window_single
-// Single input, single result.
-func.func @test_for_each_back_window_single(%close: !kunir.ts<f32, 10>)
+// CHECK-LABEL: kunir.func @test_for_each_back_window_single
+kunir.func @test_for_each_back_window_single(%close: !kunir.ts<f32, 10>)
+    inputs {%close = "close"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
     -> !kunir.ts<f32, 1> {
   // CHECK: kunir.for_each_back_window
   // CHECK-SAME: [window = 5]
@@ -68,14 +86,16 @@ func.func @test_for_each_back_window_single(%close: !kunir.ts<f32, 10>)
     %s = kunir.reduce_add %close_cur : !kunir.ts<f32, 1>
     kunir.yield %s : !kunir.ts<f32, 1>
   }
-  return %ts_sum : !kunir.ts<f32, 1>
+  kunir.return %ts_sum : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_for_each_back_window_multi_input
-// Two inputs, two results (one reduce per input).
-func.func @test_for_each_back_window_multi_input(
+// CHECK-LABEL: kunir.func @test_for_each_back_window_multi_input
+kunir.func @test_for_each_back_window_multi_input(
     %close: !kunir.ts<f32, 20>,
     %vol:   !kunir.ts<f32, 20>)
+    inputs {%close = "close", %vol = "vol"}
+    outputs {"sum_close", "sum_vol"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
     -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
   // CHECK: kunir.for_each_back_window
   %sum_c, %sum_v = kunir.for_each_back_window
@@ -88,12 +108,14 @@ func.func @test_for_each_back_window_multi_input(
     %sv = kunir.reduce_add %vc : !kunir.ts<f32, 1>
     kunir.yield %sc, %sv : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
   }
-  return %sum_c, %sum_v : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  kunir.return %sum_c, %sum_v : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_for_each_back_window_multi_reduce
-// Single input, multiple reductions → multiple results.
-func.func @test_for_each_back_window_multi_reduce(%input: !kunir.ts<f32, 20>)
+// CHECK-LABEL: kunir.func @test_for_each_back_window_multi_reduce
+kunir.func @test_for_each_back_window_multi_reduce(%input: !kunir.ts<f32, 20>)
+    inputs {%input = "input"}
+    outputs {"sum", "max"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
     -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
   %sum_ts, %max_ts = kunir.for_each_back_window
       (%input : !kunir.ts<f32, 20>) [window = 10]
@@ -105,12 +127,14 @@ func.func @test_for_each_back_window_multi_reduce(%input: !kunir.ts<f32, 20>)
     %m = kunir.reduce_max %val : !kunir.ts<f32, 1>
     kunir.yield %s, %m : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
   }
-  return %sum_ts, %max_ts : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  kunir.return %sum_ts, %max_ts : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_for_each_back_window_inf
-// inf lookback satisfies any window size.
-func.func @test_for_each_back_window_inf(%input: !kunir.ts<f64, inf>)
+// CHECK-LABEL: kunir.func @test_for_each_back_window_inf
+kunir.func @test_for_each_back_window_inf(%input: !kunir.ts<f64, inf>)
+    inputs {%input = "input"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
     -> !kunir.ts<f64, 1> {
   %result = kunir.for_each_back_window
       (%input : !kunir.ts<f64, inf>) [window = 100]
@@ -119,13 +143,16 @@ func.func @test_for_each_back_window_inf(%input: !kunir.ts<f64, inf>)
     %s = kunir.reduce_add %val : !kunir.ts<f64, 1>
     kunir.yield %s : !kunir.ts<f64, 1>
   }
-  return %result : !kunir.ts<f64, 1>
+  kunir.return %result : !kunir.ts<f64, 1>
 }
 
-// CHECK-LABEL: func.func @test_f64_binary
-func.func @test_f64_binary(%a: !kunir.ts<f64, inf>, %b: !kunir.ts<f64, inf>)
+// CHECK-LABEL: kunir.func @test_f64_binary
+kunir.func @test_f64_binary(%a: !kunir.ts<f64, inf>, %b: !kunir.ts<f64, inf>)
+    inputs {%a = "a", %b = "b"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
     -> !kunir.ts<f64, 1> {
   // CHECK: !kunir.ts<f64
   %result = kunir.max %a, %b : !kunir.ts<f64, inf>, !kunir.ts<f64, inf>
-  return %result : !kunir.ts<f64, 1>
+  kunir.return %result : !kunir.ts<f64, 1>
 }
diff --git a/mlir/test/kunir/func.mlir b/mlir/test/kunir/func.mlir
new file mode 100644
index 0000000..d93a95d
--- /dev/null
+++ b/mlir/test/kunir/func.mlir
@@ -0,0 +1,66 @@
+// RUN: %kun-opt %s | %FileCheck %s
+// RUN: %kun-opt %s | %kun-opt | %FileCheck %s
+
+// CHECK-LABEL: kunir.func @test_non_void
+// CHECK-SAME: (%[[A:.*]]: !kunir.ts<f32, inf>, %[[B:.*]]: !kunir.ts<f32, inf>)
+// CHECK:      inputs {%[[A]] = "close", %[[B]] = "vol"}
+// CHECK:      outputs {"alpha"}
+// CHECK:      target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+// CHECK:      -> !kunir.ts<f32, 1>
+kunir.func @test_non_void(%close: !kunir.ts<f32, inf>, %vol: !kunir.ts<f32, inf>)
+    inputs {%close = "close", %vol = "vol"}
+    outputs {"alpha"}
+    target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
+  %sum = kunir.add %close, %vol : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  kunir.return %sum : !kunir.ts<f32, 1>
+}
+
+// Void form: one input, one output — both are function args.
+// CHECK-LABEL: kunir.func @test_void
+// CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f32, inf>, %[[OUT:.*]]: !kunir.ts<f32, 1>)
+// CHECK:      inputs {%[[IN]] = "close"}
+// CHECK:      outputs {%[[OUT]] = "alpha"}
+// CHECK:      target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1}
+// CHECK-NOT:  ->
+kunir.func @test_void(%close: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%close = "close"}
+    outputs {%out = "alpha"}
+    target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1} {
+  kunir.return
+}
+
+// Void form: two inputs, two outputs — all four are function args.
+// CHECK-LABEL: kunir.func @test_void_multi_output
+// CHECK-SAME: (%[[I0:.*]]: !kunir.ts<f32, inf>, %[[I1:.*]]: !kunir.ts<f32, inf>, %[[O0:.*]]: !kunir.ts<f32, 1>, %[[O1:.*]]: !kunir.ts<f32, 1>)
+// CHECK:      inputs {%[[I0]] = "close", %[[I1]] = "vol"}
+// CHECK:      outputs {%[[O0]] = "alpha1", %[[O1]] = "alpha2"}
+// CHECK:      target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+// CHECK-NOT:  ->
+kunir.func @test_void_multi_output(
+    %close: !kunir.ts<f32, inf>, %vol: !kunir.ts<f32, inf>,
+    %out1: !kunir.ts<f32, 1>, %out2: !kunir.ts<f32, 1>)
+    inputs {%close = "close", %vol = "vol"}
+    outputs {%out1 = "alpha1", %out2 = "alpha2"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} {
+  kunir.return
+}
+
+// Non-void multi-result.
+// CHECK-LABEL: kunir.func @test_multi_result
+kunir.func @test_multi_result(%input: !kunir.ts<f64, inf>)
+    inputs {%input = "input"}
+    outputs {"sum", "maxval"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 16384, vector_size = 1}
+    -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+  %w = kunir.windowed_output %input [length = 10] : !kunir.ts<f64, inf> -> !kunir.ts<f64, 10>
+  %s, %m = kunir.for_each_back_window
+      (%w : !kunir.ts<f64, 10>) [window = 10]
+      (%val : !kunir.ts<f64, 1>)
+      -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+    %radd = kunir.reduce_add %val : !kunir.ts<f64, 1>
+    %rmax = kunir.reduce_max %val : !kunir.ts<f64, 1>
+    kunir.yield %radd, %rmax : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+  }
+  kunir.return %s, %m : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+}
diff --git a/mlir/test/kunir/lower_to_kungpu.mlir b/mlir/test/kunir/lower_to_kungpu.mlir
index f5f2994..e9241a1 100644
--- a/mlir/test/kunir/lower_to_kungpu.mlir
+++ b/mlir/test/kunir/lower_to_kungpu.mlir
@@ -1,14 +1,15 @@
 // RUN: %kun-opt --kunir-to-kungpu %s | %FileCheck %s
 
-// CHECK-LABEL: func.func @test_binary_lower
+// CHECK-LABEL: kunir.func @test_binary_lower
 // CHECK-SAME: !kunir.ts<f32, inf>
 // CHECK-SAME: !kunir.ts<f32, inf>
 // CHECK-SAME: !kunir.ts<f32, 1>
 // CHECK-NOT: -> !kunir.ts
-func.func @test_binary_lower(
-    %a: !kunir.ts<f32, inf>,
-    %b: !kunir.ts<f32, inf>
-) -> !kunir.ts<f32, 1> {
+kunir.func @test_binary_lower(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+    inputs {%a = "a", %b = "b"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
   // CHECK:      %[[TL:.*]] = kungpu.time_length
   // CHECK:      %[[C0:.*]] = arith.constant 0 : index
   // CHECK:      %[[C1:.*]] = arith.constant 1 : index
@@ -18,23 +19,31 @@ func.func @test_binary_lower(
   // CHECK:        arith.addf
   // CHECK:        kungpu.ts.put
   %sum = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
-  return %sum : !kunir.ts<f32, 1>
+  kunir.return %sum : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_unary_lower
-func.func @test_unary_lower(%x: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+// CHECK-LABEL: kunir.func @test_unary_lower
+kunir.func @test_unary_lower(%x: !kunir.ts<f32, inf>)
+    inputs {%x = "x"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
   // CHECK: math.absf
   %a = kunir.abs %x : !kunir.ts<f32, inf>
-  return %a : !kunir.ts<f32, 1>
+  kunir.return %a : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_windowed_sum
-func.func @test_windowed_sum(%close: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
+// CHECK-LABEL: kunir.func @test_windowed_sum
+kunir.func @test_windowed_sum(%close: !kunir.ts<f32, inf>)
+    inputs {%close = "close"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
   // CHECK:      %[[C0:.*]] = arith.constant 0 : index
   // CHECK:      %[[C1:.*]] = arith.constant 1 : index
   // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
   // CHECK:      scf.for %[[T:.*]] =
-  // CHECK:        kungpu.ts.get %arg0[%[[T]]]
+  // CHECK:        kungpu.ts.get %{{.*}}[%[[T]]]
   // CHECK:        kungpu.ts.put %[[WT]][%[[T]]]
   // CHECK:        %[[WIN:.*]] = arith.constant 5 : index
   // CHECK:        scf.for %{{.*}} = %[[C0]] to %[[WIN]] step %[[C1]] iter_args
@@ -48,14 +57,15 @@ func.func @test_windowed_sum(%close: !kunir.ts<f32, inf>) -> !kunir.ts<f32, 1> {
     %s = kunir.reduce_add %cur : !kunir.ts<f32, 1>
     kunir.yield %s : !kunir.ts<f32, 1>
   }
-  return %sum : !kunir.ts<f32, 1>
+  kunir.return %sum : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_computed_reduce
-func.func @test_computed_reduce(
-    %x: !kunir.ts<f32, inf>,
-    %y: !kunir.ts<f32, inf>
-) -> !kunir.ts<f32, 1> {
+// CHECK-LABEL: kunir.func @test_computed_reduce
+kunir.func @test_computed_reduce(%x: !kunir.ts<f32, inf>, %y: !kunir.ts<f32, inf>)
+    inputs {%x = "x", %y = "y"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    -> !kunir.ts<f32, 1> {
   // CHECK:      %[[WX:.*]] = kungpu.windowed_temp : <f32, 3>
   // CHECK:      %[[WY:.*]] = kungpu.windowed_temp : <f32, 3>
   // CHECK:      scf.for
@@ -74,12 +84,16 @@ func.func @test_computed_reduce(
     %s = kunir.reduce_add %prod : !kunir.ts<f32, 1>
     kunir.yield %s : !kunir.ts<f32, 1>
   }
-  return %sum : !kunir.ts<f32, 1>
+  kunir.return %sum : !kunir.ts<f32, 1>
 }
 
-// CHECK-LABEL: func.func @test_multi_reduce
+// CHECK-LABEL: kunir.func @test_multi_reduce
 // CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f64, inf>, %[[OUT0:.*]]: !kunir.ts<f64, 1>, %[[OUT1:.*]]: !kunir.ts<f64, 1>)
-func.func @test_multi_reduce(%input: !kunir.ts<f64, inf>) -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+kunir.func @test_multi_reduce(%input: !kunir.ts<f64, inf>)
+    inputs {%input = "input"}
+    outputs {"sum", "maxval"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
   // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f64, 10>
   // CHECK:      scf.for %[[T:.*]] =
   // CHECK:        kungpu.ts.get %[[IN]][%[[T]]]
@@ -102,5 +116,5 @@ func.func @test_multi_reduce(%input: !kunir.ts<f64, inf>) -> (!kunir.ts<f64, 1>,
     %m = kunir.reduce_max %val : !kunir.ts<f64, 1>
     kunir.yield %s, %m : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
   }
-  return %sum, %max : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+  kunir.return %sum, %max : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
 }

From dd459e587a4f63ef4724235c9a165fa55a19e909 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 6 May 2026 20:47:43 -0700
Subject: [PATCH 05/59] kungpu to llvm

---
 mlir/Tools/kun-opt/CMakeLists.txt        |   3 +
 mlir/Tools/kun-opt/kun-opt.cpp           |   4 +
 mlir/include/KunGpu/KunGpuOps.td         |  50 ++-
 mlir/include/KunGpu/KunGpuUtils.h        |  60 +++
 mlir/include/KunGpu/Passes.h             |   1 +
 mlir/include/KunGpu/Passes.td            |  39 ++
 mlir/lib/KunGpu/CMakeLists.txt           |   6 +
 mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp |   5 +-
 mlir/lib/KunGpu/KunGpuToLLVM.cpp         | 493 +++++++++++++++++++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp         |  55 ++-
 mlir/test/kungpu/basic.mlir              |  10 +-
 mlir/test/kungpu/lower_to_llvm.mlir      | 216 ++++++++++
 mlir/test/kunir/lower_to_kungpu.mlir     |  35 +-
 13 files changed, 924 insertions(+), 53 deletions(-)
 create mode 100644 mlir/include/KunGpu/KunGpuUtils.h
 create mode 100644 mlir/lib/KunGpu/KunGpuToLLVM.cpp
 create mode 100644 mlir/test/kungpu/lower_to_llvm.mlir

diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
index ed343b5..b35c5ac 100644
--- a/mlir/Tools/kun-opt/CMakeLists.txt
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -15,9 +15,12 @@ target_link_libraries(kun-opt PRIVATE
 
   # Standard dialects used inside kunir/kungpu IR
   MLIRFuncDialect
+  MLIRFuncTransforms
   MLIRArithDialect
   MLIRMathDialect
   MLIRSCFDialect
+  MLIRGPUDialect
+  MLIRLLVMDialect
 
   # Core MLIR libraries
   MLIRIR
diff --git a/mlir/Tools/kun-opt/kun-opt.cpp b/mlir/Tools/kun-opt/kun-opt.cpp
index 1c818a3..2181951 100644
--- a/mlir/Tools/kun-opt/kun-opt.cpp
+++ b/mlir/Tools/kun-opt/kun-opt.cpp
@@ -1,5 +1,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/DialectRegistry.h"
@@ -20,6 +22,8 @@ int main(int argc, char **argv) {
   registry.insert<mlir::arith::ArithDialect>();
   registry.insert<mlir::math::MathDialect>();
   registry.insert<mlir::scf::SCFDialect>();
+  registry.insert<mlir::gpu::GPUDialect>();
+  registry.insert<mlir::LLVM::LLVMDialect>();
 
   // KunQuant dialects
   registry.insert<kunir::KunIrDialect>();
diff --git a/mlir/include/KunGpu/KunGpuOps.td b/mlir/include/KunGpu/KunGpuOps.td
index 82fb2e6..7310d9a 100644
--- a/mlir/include/KunGpu/KunGpuOps.td
+++ b/mlir/include/KunGpu/KunGpuOps.td
@@ -57,40 +57,68 @@ def KunGpu_WindowedTempOp : KunGpu_Op<"windowed_temp", [Pure]> {
     length are encoded in the result type `!kunir.ts<elemType, N>`.
     The buffer is used as the backing store for a windowed reduction;
     it must be the `ts` operand of `ts.get` or `ts.put`.
+
+    Memory placement is decided by the kungpu-memory-planning pass, which
+    sets a discardable `kungpu.smem` boolean attribute (true → shared
+    memory, false / absent → local).  Use `isSmem()` / `setSmem(bool)`
+    rather than reading the attribute by name.
   }];
   let results = (outs KunIr_AnyTs:$result);
   let assemblyFormat = "`:` type($result) attr-dict";
+  let extraClassDeclaration = [{
+    /// Memory-planning result: true if the buffer should live in shared
+    /// memory.  Defaults to false (local) when the attribute is absent.
+    bool isSmem() {
+      auto a = (*this)->getAttrOfType<::mlir::BoolAttr>("kungpu.smem");
+      return a && a.getValue();
+    }
+    /// Set the memory-placement flag (used by kungpu-memory-planning).
+    void setSmem(bool v) {
+      (*this)->setAttr("kungpu.smem",
+                        ::mlir::BoolAttr::get(getContext(), v));
+    }
+  }];
 }
 
 def KunGpu_TsGetOp : KunGpu_Op<"ts.get", [Pure]> {
-  let summary = "Load a scalar element from a time series at a given time index";
+  let summary = "Read a scalar from a time series at a tail-relative offset";
   let description = [{
-    Loads the per-stock element from time series `ts` at time index `time`.
-    The result type must equal the element type of `ts`.
+    Reads the per-stock element from time series `ts` at `offset` steps back
+    from the tail.  The tail is the most recently-written position:
+      offset = 0  →  latest value (just written by the most recent put)
+      offset = 1  →  one step earlier
+      offset = k  →  k steps earlier  (must be < ts.maxLookback for windowed_temp)
+
+    `offset` is i32 (64-bit ops are slow on GPUs).
+    Result type must equal the element type of `ts`.
 
     Example:
-      %v = kungpu.ts.get %close[%t] : !kunir.ts<f32, inf> -> f32
+      %v   = kungpu.ts.get %close[%c0_i32] : !kunir.ts<f32, inf> -> f32
+      %old = kungpu.ts.get %wt[%c2_i32] : !kunir.ts<f32, 5>  -> f32
   }];
-  let arguments = (ins KunIr_AnyTs:$ts, Index:$time);
+  let arguments = (ins KunIr_AnyTs:$ts, I32:$offset);
   let results = (outs AnyFloat:$result);
   let hasVerifier = 1;
   let assemblyFormat =
-    "$ts `[` $time `]` `:` type($ts) `->` type($result) attr-dict";
+    "$ts `[` $offset `]` `:` type($ts) `->` type($result) attr-dict";
 }
 
 def KunGpu_TsPutOp : KunGpu_Op<"ts.put"> {
-  let summary = "Store a scalar value into a time series at a given time index";
+  let summary = "Append a scalar to the tail of a time series";
   let description = [{
-    Stores scalar `value` into time series `ts` at time index `time`.
+    Appends scalar `value` as the new tail of time series `ts`.  No offset:
+    a put always goes to the next writable position (advancing the tail).
+    Subsequent ts.get on the same `ts` with offset = 0 will see this value.
     `value` must have the same type as the element type of `ts`.
 
     Example:
-      kungpu.ts.put %out[%t], %v : !kunir.ts<f32, 1>, f32
+      kungpu.ts.put %wt,  %v : !kunir.ts<f32, 5>, f32
+      kungpu.ts.put %out, %v : !kunir.ts<f32, 1>, f32
   }];
-  let arguments = (ins KunIr_AnyTs:$ts, Index:$time, AnyFloat:$value);
+  let arguments = (ins KunIr_AnyTs:$ts, AnyFloat:$value);
   let hasVerifier = 1;
   let assemblyFormat =
-    "$ts `[` $time `]` `,` $value `:` type($ts) `,` type($value) attr-dict";
+    "$ts `,` $value `:` type($ts) `,` type($value) attr-dict";
 }
 
 #endif // KUNGPU_OPS_TD
diff --git a/mlir/include/KunGpu/KunGpuUtils.h b/mlir/include/KunGpu/KunGpuUtils.h
new file mode 100644
index 0000000..8258df7
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuUtils.h
@@ -0,0 +1,60 @@
+//===- KunGpuUtils.h - Lookup helpers for kungpu metadata on func ops ----===//
+//
+// After convert-kungpu-to-llvm phase 1 lowers `kunir.func` to `func.func`,
+// the original kunir.func metadata (target spec, input/output names) is
+// preserved as discardable attributes on the new func.func.  Use these
+// accessors instead of reading attributes by name in callers — they are
+// the func.func equivalents of `kunir::FuncOp::getTargetSpec` etc.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "KunIr/KunIrAttrs.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace kungpu {
+
+/// Discardable-attribute names used to attach kunir.func metadata to a
+/// func.func after phase 1 of convert-kungpu-to-llvm.
+constexpr llvm::StringLiteral kFuncTargetSpecAttr  = "kungpu.target_spec";
+constexpr llvm::StringLiteral kFuncInputNamesAttr  = "kungpu.input_names";
+constexpr llvm::StringLiteral kFuncOutputNamesAttr = "kungpu.output_names";
+
+/// Read the target_spec attribute from a func.func (lowered from a
+/// kunir.func).  Returns null if the attribute is missing.
+inline ::kunir::TargetSpecAttr
+getFuncTargetSpec(::mlir::func::FuncOp fn) {
+  return fn->getAttrOfType<::kunir::TargetSpecAttr>(kFuncTargetSpecAttr);
+}
+inline void setFuncTargetSpec(::mlir::func::FuncOp fn,
+                                ::kunir::TargetSpecAttr spec) {
+  fn->setAttr(kFuncTargetSpecAttr, spec);
+}
+
+/// Read the input_names array attribute from a func.func.  The array
+/// contains one StringAttr per ts input parameter; null if missing.
+inline ::mlir::ArrayAttr
+getFuncInputNames(::mlir::func::FuncOp fn) {
+  return fn->getAttrOfType<::mlir::ArrayAttr>(kFuncInputNamesAttr);
+}
+inline void setFuncInputNames(::mlir::func::FuncOp fn,
+                                ::mlir::ArrayAttr names) {
+  fn->setAttr(kFuncInputNamesAttr, names);
+}
+
+/// Read the output_names array attribute from a func.func.  The array
+/// contains one StringAttr per ts output (function arg in void form, or
+/// result in non-void form); null if missing.
+inline ::mlir::ArrayAttr
+getFuncOutputNames(::mlir::func::FuncOp fn) {
+  return fn->getAttrOfType<::mlir::ArrayAttr>(kFuncOutputNamesAttr);
+}
+inline void setFuncOutputNames(::mlir::func::FuncOp fn,
+                                 ::mlir::ArrayAttr names) {
+  fn->setAttr(kFuncOutputNamesAttr, names);
+}
+
+} // namespace kungpu
diff --git a/mlir/include/KunGpu/Passes.h b/mlir/include/KunGpu/Passes.h
index 0a727a0..cf47230 100644
--- a/mlir/include/KunGpu/Passes.h
+++ b/mlir/include/KunGpu/Passes.h
@@ -12,6 +12,7 @@ namespace kungpu {
 #include "KunGpu/Passes.h.inc"
 
 std::unique_ptr<mlir::Pass> createWindowedTempMemoryPlanningPass();
+std::unique_ptr<mlir::Pass> createConvertKunGpuToLLVMPass();
 
 #define GEN_PASS_REGISTRATION
 #include "KunGpu/Passes.h.inc"
diff --git a/mlir/include/KunGpu/Passes.td b/mlir/include/KunGpu/Passes.td
index d3bc7f1..d4f9167 100644
--- a/mlir/include/KunGpu/Passes.td
+++ b/mlir/include/KunGpu/Passes.td
@@ -25,4 +25,43 @@ def WindowedTempMemoryPlanning
   let dependentDialects = ["::kungpu::KunGpuDialect"];
 }
 
+def ConvertKunGpuToLLVM
+    : Pass<"convert-kungpu-to-llvm", "::mlir::ModuleOp"> {
+  let summary = "Lower kungpu ops and kunir.func to func.func + LLVM/GPU dialects";
+  let description = [{
+    Converts the KunGpu dialect to a mix of func, LLVM, GPU, arith, and scf dialects.
+
+    Type conversion:
+      !kunir.ts<T,N> → !llvm.ptr
+
+    Function signature:
+      kunir.func @f(%a: !kunir.ts<T,N>, ...) → func.func @f(i32, i32, !llvm.ptr, ...)
+      The two prepended i32 arguments are time_len and num_stocks.
+
+    Memory ops:
+      kungpu.windowed_temp → llvm.alloca circular buffer + head-state alloca
+      kungpu.ts.get        → GEP + llvm.load
+      kungpu.ts.put        → advance circular head + GEP + llvm.store
+      Global ts (function arg): TxS layout — element(t,s) = base + t*num_stocks + s
+      windowed_temp: circular index without modulo (compare + conditional subtract)
+
+    Shared memory (kungpu.smem = true):
+      A module-level llvm.mlir.global with addr_space=3 is emitted once per
+      windowed_temp; each thread accesses its own N-element slice at
+      offset threadIdx.x * N.
+
+    Thread indexing:
+      kungpu.time_length       → arith.index_cast %time_len : i32 to index
+      kungpu.stock_id          → blockIdx.x * blockDim.x + threadIdx.x
+      kungpu.block_stock_count → blockDim.x
+  }];
+  let constructor = "::kungpu::createConvertKunGpuToLLVMPass()";
+  let dependentDialects = [
+    "::mlir::func::FuncDialect",
+    "::mlir::LLVM::LLVMDialect",
+    "::mlir::arith::ArithDialect",
+    "::mlir::gpu::GPUDialect"
+  ];
+}
+
 #endif // KUNGPU_PASSES_TD
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index 8ac3898..db9c05b 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   KunGpuDialect.cpp
   KunGpuOps.cpp
   KunGpuMemoryPlanning.cpp
+  KunGpuToLLVM.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/mlir/include
@@ -15,6 +16,11 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   MLIRIR
   MLIRPass
   MLIRFuncDialect
+  MLIRFuncTransforms
+  MLIRArithDialect
+  MLIRGPUDialect
+  MLIRLLVMDialect
+  MLIRTransformUtils
   MLIRSideEffectInterfaces
   MLIRKunIrDialect
 )
diff --git a/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
index c6ae579..74fb79f 100644
--- a/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
+++ b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
@@ -64,7 +64,6 @@ struct WindowedTempMemoryPlanningPass
 
   void runOnOperation() override {
     kunir::FuncOp funcOp = getOperation();
-    MLIRContext *ctx = &getContext();
 
     // -----------------------------------------------------------------------
     // 1. Read hardware parameters from target_spec.
@@ -106,7 +105,7 @@ struct WindowedTempMemoryPlanningPass
 
       // Infinite-lookback buffers cannot be sized statically → always local.
       if (N == std::numeric_limits<uint64_t>::max()) {
-        op->setAttr("kungpu.smem", BoolAttr::get(ctx, false));
+        op.setSmem(false);
         continue;
       }
 
@@ -118,7 +117,7 @@ struct WindowedTempMemoryPlanningPass
       if (useSmem)
         usedSmem += bytes;
 
-      op->setAttr("kungpu.smem", BoolAttr::get(ctx, useSmem));
+      op.setSmem(useSmem);
 
       LLVM_DEBUG(llvm::dbgs()
                  << "[kungpu-memory-planning] windowed_temp N=" << N
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
new file mode 100644
index 0000000..54072f4
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -0,0 +1,493 @@
+//===- KunGpuToLLVM.cpp - Lower kungpu + kunir.func → func + LLVM ---------===//
+//
+// Two-phase pass.
+//
+// Phase 1 (convertFuncSignature, simple imperative helper):
+//   kunir.func @f(%a: !kunir.ts<…>, …)
+//     → func.func @f(%t: i32, %n: i32, %a: !kunir.ts<…>, …)
+//   The two prepended i32 arguments are time_length and num_stocks
+//   (i32 because 64-bit ops are slow on GPUs; the linear gmem address
+//   is still computed in i64).  ts arg types are preserved.
+//   target_spec, input_names and output_names are moved to discardable
+//   attributes (see KunGpuUtils.h accessors).
+//   kunir.return → func.return.
+//
+// Phase 2 (applyPartialConversion, one OpConversionPattern per op):
+//   TypeConverter:  !kunir.ts<T,N> → !llvm.ptr
+//
+// Op semantics (post-redesign):
+//   ts.put %ts, %v          : append %v at the tail of %ts.
+//   ts.get %ts[%offset_i32] : read %ts at tail-relative offset (0 = latest).
+//
+// Lowering of windowed_temp head state — single i32 alloca holding the
+// next-writable position (modeled on cpp/Kun/Ops.hpp::OutputWindow):
+//
+//   on put(v):  buf[pos] = v;
+//               pos = (pos + 1 >= N) ? 0 : pos + 1;     // no modulo
+//
+//   on get(off):
+//               adj = off + 1;                          // off=0 → most-recent put
+//               idx = (pos >= adj) ? pos - adj : pos + N - adj;
+//               return buf[idx];
+//
+// Lowering for global ts (function-arg pointer, TxS layout):
+//   the "tail" is the current time step, given by the enclosing scf.for iv.
+//   put :   gmem[iv * num_stocks + sid]            = v
+//   get :   load gmem[(iv - off) * num_stocks + sid]
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunGpu/KunGpuOps.h"
+#include "KunGpu/KunGpuUtils.h"
+#include "KunGpu/Passes.h"
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define GEN_PASS_DEF_CONVERTKUNGPUTOLLVM
+#include "KunGpu/Passes.h.inc"
+
+using namespace mlir;
+using namespace kunir;
+using namespace kungpu;
+
+namespace {
+
+// Per-windowed_temp side state.
+//   posPtr — i32 alloca holding the next-writable circular position.
+//   stride — distance between consecutive slots, in elements of T:
+//              1 for local (alloca buffer is per-thread)
+//              K for shared (slot-major across the K threads in a block);
+//                K = warps_per_cta * 32, a compile-time int64_t constant.
+// Keyed on the original windowed_temp result Value so the ts.get / ts.put
+// patterns can find it.
+struct WTDesc {
+  Value posPtr;
+  int64_t stride; // 1 → no multiply at access time
+};
+using WTDescMap = llvm::DenseMap<Value, WTDesc>;
+
+//===----------------------------------------------------------------------===//
+// Phase 1: kunir.func → func.func (signature only)
+//===----------------------------------------------------------------------===//
+
+static void convertFuncSignature(kunir::FuncOp fn) {
+  auto *ctx = fn.getContext();
+  Location loc = fn.getLoc();
+  auto i32Ty = IntegerType::get(ctx, 32);
+
+  FunctionType oldFT = fn.getFunctionTypeTyped();
+  SmallVector<Type> newArgTypes = {i32Ty, i32Ty};
+  for (Type t : oldFT.getInputs())
+    newArgTypes.push_back(t);
+
+  OpBuilder b(fn);
+  auto newFunc = b.create<func::FuncOp>(
+      loc, fn.getSymName(), FunctionType::get(ctx, newArgTypes, {}));
+  newFunc.setVisibility(SymbolTable::Visibility::Public);
+  setFuncTargetSpec (newFunc, fn.getTargetSpecAttr());
+  setFuncInputNames (newFunc, fn.getInputNames());
+  setFuncOutputNames(newFunc, fn.getOutputNames());
+
+  newFunc.getBody().takeBody(fn.getBody());
+  Block &entry = newFunc.getBody().front();
+  entry.insertArgument(0u, i32Ty, loc);
+  entry.insertArgument(1u, i32Ty, loc);
+
+  SmallVector<kunir::ReturnOp> returns;
+  newFunc.walk([&](kunir::ReturnOp r) { returns.push_back(r); });
+  for (kunir::ReturnOp r : returns) {
+    OpBuilder rb(r);
+    rb.create<func::ReturnOp>(r.getLoc());
+    r.erase();
+  }
+  fn.erase();
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers used inside conversion patterns
+//===----------------------------------------------------------------------===//
+
+static Value emitStockId(OpBuilder &b, Location loc, Type idxTy) {
+  Value tid  = b.create<gpu::ThreadIdOp>(loc, idxTy, gpu::Dimension::x);
+  Value bid  = b.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::x);
+  Value bdim = b.create<gpu::BlockDimOp>(loc, idxTy, gpu::Dimension::x);
+  return b.create<arith::AddIOp>(
+      loc, b.create<arith::MulIOp>(loc, bid, bdim), tid);
+}
+
+// Read num_stocks (i32 func arg[1]) sign-extended to i64 for the linear gmem
+// address computation.  The bare i32 value is in arg[1]; we extend at every
+// use site (cheap, and lets the caller decide).
+static Value getNumStocksI64(OpBuilder &b, Operation *op, Location loc) {
+  Value ns32 = op->getParentOfType<func::FuncOp>()
+                   .getBody().front().getArgument(1);
+  return b.create<arith::ExtSIOp>(loc, b.getI64Type(), ns32);
+}
+static Value getCurrentTimeIdx(Operation *op) {
+  auto fOp = op->getParentOfType<scf::ForOp>();
+  return fOp ? fOp.getInductionVar() : Value();
+}
+
+// linear gmem address = base + (timeIdx - offsetIdx) * num_stocks + stock_id
+static Value gmemGEPWithOffset(OpBuilder &b, Location loc, Type elemTy,
+                                LLVM::LLVMPointerType ptrTy, Value basePt,
+                                Value timeIdx, Value offsetIdx,
+                                Value numStocksI64, Type idxTy, Type i64Ty) {
+  Value effIdx = offsetIdx ? b.create<arith::SubIOp>(loc, timeIdx, offsetIdx).getResult()
+                            : timeIdx;
+  Value tI64   = b.create<arith::IndexCastOp>(loc, i64Ty, effIdx);
+  Value sid    = emitStockId(b, loc, idxTy);
+  Value sidI64 = b.create<arith::IndexCastOp>(loc, i64Ty, sid);
+  Value lin    = b.create<arith::AddIOp>(
+      loc, b.create<arith::MulIOp>(loc, tI64, numStocksI64), sidI64);
+  return b.create<LLVM::GEPOp>(loc, ptrTy, elemTy, basePt, ValueRange{lin});
+}
+
+//===----------------------------------------------------------------------===//
+// Patterns
+//===----------------------------------------------------------------------===//
+
+struct TimeLengthPattern : OpConversionPattern<TimeLengthOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(TimeLengthOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value tl32 = op->getParentOfType<func::FuncOp>()
+                     .getBody().front().getArgument(0);
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(
+        op, rewriter.getIndexType(), tl32);
+    return success();
+  }
+};
+
+struct StockIdPattern : OpConversionPattern<StockIdOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(StockIdOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOp(op,
+        emitStockId(rewriter, op.getLoc(), rewriter.getIndexType()));
+    return success();
+  }
+};
+
+struct BlockStockCountPattern : OpConversionPattern<BlockStockCountOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(BlockStockCountOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<gpu::BlockDimOp>(
+        op, rewriter.getIndexType(), gpu::Dimension::x);
+    return success();
+  }
+};
+
+// Each windowed_temp lowers to:
+//   %buf = llvm.alloca N x elemTy   (or smem GEP slice)
+//   %pos = llvm.alloca 1 x i32
+//   llvm.store 0 : i32, %pos        (next writable position starts at 0)
+struct WindowedTempPattern : OpConversionPattern<WindowedTempOp> {
+  WTDescMap &descMap;
+  int &smemCounter;
+
+  WindowedTempPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m, int &sc)
+      : OpConversionPattern(tc, ctx), descMap(m), smemCounter(sc) {}
+
+  LogicalResult
+  matchAndRewrite(WindowedTempOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx     = op.getContext();
+    Location loc  = op.getLoc();
+    auto i32Ty    = rewriter.getI32Type();
+    auto idxTy    = rewriter.getIndexType();
+    auto ptrTy    = LLVM::LLVMPointerType::get(ctx);
+
+    auto tsTy   = llvm::cast<TsType>(op.getType());
+    int64_t N   = static_cast<int64_t>(tsTy.getMaxLookback());
+    Type elemTy = tsTy.getElementType();
+
+    // Buffer (alloca or smem slice).  All counters/offsets are i32.
+    //
+    // Local memory:
+    //   bufPtr = alloca [N x T]  (per-thread, contiguous)  — stride = 1
+    //
+    // Shared memory (slot-major, bank-conflict-free):
+    //   global [N * K x T] (addr_space=3) where K = threads_per_block
+    //   Slot j of thread t lives at index   j * K + t.
+    //   bufPtr = smem + tid                                 — stride = K
+    //   ts.put/get use bufPtr[idx * K], landing on
+    //     smem + tid + idx*K = slot_idx*K + tid  (correct).
+    Value bufPtr;
+    int64_t stride;
+
+    if (op.isSmem()) {
+      auto fn = op->getParentOfType<func::FuncOp>();
+      auto module = op->getParentOfType<ModuleOp>();
+      auto tsAttr = getFuncTargetSpec(fn);
+      int64_t blockSize = tsAttr ? (tsAttr.getWarpsPerCta() * 32) : 32;
+      stride = blockSize;
+
+      std::string name =
+          ("__smem_" + fn.getSymName() + "_" +
+           llvm::Twine(smemCounter++)).str();
+      {
+        OpBuilder::InsertionGuard g(rewriter);
+        Block *modBody = module.getBody();
+        rewriter.setInsertionPoint(modBody, modBody->begin());
+        rewriter.create<LLVM::GlobalOp>(
+            loc, LLVM::LLVMArrayType::get(elemTy, N * blockSize), false,
+            LLVM::Linkage::Internal, name, Attribute{}, 0, 3);
+      }
+      Value raw = rewriter.create<LLVM::AddressOfOp>(
+          loc, LLVM::LLVMPointerType::get(ctx, 3), name);
+      Value gen    = rewriter.create<LLVM::AddrSpaceCastOp>(loc, ptrTy, raw);
+      Value tid    = rewriter.create<gpu::ThreadIdOp>(loc, idxTy, gpu::Dimension::x);
+      Value tidI32 = rewriter.create<arith::IndexCastOp>(loc, i32Ty, tid);
+      // bufPtr = smem + tid  (slot-major: slot j thread t lives at j*K + t)
+      bufPtr = rewriter.create<LLVM::GEPOp>(loc, ptrTy, elemTy, gen,
+                                             ValueRange{tidI32});
+    } else {
+      stride = 1;
+      Value nCst = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      bufPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, elemTy, nCst);
+    }
+
+    // Single i32 cell tracking next-writable position; init to 0.
+    Value c1_i32 = rewriter.create<LLVM::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value posPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, i32Ty, c1_i32);
+    Value zeroI32 = rewriter.create<LLVM::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    rewriter.create<LLVM::StoreOp>(loc, zeroI32, posPtr);
+
+    // Side state, keyed on the original (pre-replacement) ts Value.
+    descMap[op.getResult()] = {posPtr, stride};
+
+    rewriter.replaceOp(op, bufPtr);
+    return success();
+  }
+};
+
+// Multiply an i32 index by a compile-time stride.  stride==1 is a no-op.
+static Value applyStride(OpBuilder &b, Location loc, Value idx, int64_t stride,
+                          Type i32Ty) {
+  if (stride == 1)
+    return idx;
+  Value k = b.create<LLVM::ConstantOp>(loc, i32Ty,
+                                        b.getI32IntegerAttr(stride));
+  return b.create<LLVM::MulOp>(loc, idx, k);
+}
+
+// Lowers ts.get: circular-buffer load (windowed_temp) or TxS gmem load.
+struct TsGetPattern : OpConversionPattern<TsGetOp> {
+  WTDescMap &descMap;
+
+  TsGetPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m)
+      : OpConversionPattern(tc, ctx), descMap(m) {}
+
+  LogicalResult
+  matchAndRewrite(TsGetOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx    = op.getContext();
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto i64Ty   = rewriter.getI64Type();
+    auto idxTy   = rewriter.getIndexType();
+    auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
+    Type elemTy  = op.getType();
+
+    Value tsPtr     = adaptor.getTs();      // !llvm.ptr
+    Value offsetI32 = adaptor.getOffset();  // i32
+
+    auto it = descMap.find(op.getTs());
+    if (it != descMap.end()) {
+      // ── windowed_temp: circular get without modulo ────────────────
+      //   adj = offset + 1                  (offset=0 → most-recent put)
+      //   idx = pos >= adj ? pos - adj : pos + N - adj
+      //   return buf[idx * stride]
+      const WTDesc &desc = it->second;
+      int64_t N = static_cast<int64_t>(
+          llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
+      Value pos    = rewriter.create<LLVM::LoadOp>(loc, i32Ty, desc.posPtr);
+      Value c1     = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value nCst   = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      Value adj    = rewriter.create<LLVM::AddOp>(loc, offsetI32, c1);
+      Value cmp    = rewriter.create<LLVM::ICmpOp>(loc, LLVM::ICmpPredicate::uge,
+                                                    pos, adj);
+      Value posMinusAdj = rewriter.create<LLVM::SubOp>(loc, pos, adj);
+      Value posPlusN    = rewriter.create<LLVM::AddOp>(loc, pos, nCst);
+      Value wrapped     = rewriter.create<LLVM::SubOp>(loc, posPlusN, adj);
+      Value idx32       = rewriter.create<LLVM::SelectOp>(
+          loc, cmp, posMinusAdj, wrapped);
+      // Keep the GEP index i32 (any int type is legal; i64 is slow on GPUs).
+      Value gepIdx = applyStride(rewriter, loc, idx32, desc.stride, i32Ty);
+      Value gep = rewriter.create<LLVM::GEPOp>(
+          loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
+      rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
+    } else {
+      // ── global ts (function arg, TxS layout) ──────────────────────
+      //   load gmem[(scf.for iv − offset) * num_stocks + stock_id]
+      Value timeIdx = getCurrentTimeIdx(op);
+      if (!timeIdx) // not nested in an scf.for: fail before emitting any IR
+        return rewriter.notifyMatchFailure(op, "global ts.get outside scf.for");
+      Value offsetIdx =
+          rewriter.create<arith::IndexCastOp>(loc, idxTy, offsetI32);
+      Value gep = gmemGEPWithOffset(
+          rewriter, loc, elemTy, ptrTy, tsPtr, timeIdx, offsetIdx,
+          getNumStocksI64(rewriter, op, loc), idxTy, i64Ty);
+      rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
+    }
+    return success();
+  }
+};
+
+// Lowers ts.put: circular-buffer store (windowed_temp) or TxS gmem store.
+struct TsPutPattern : OpConversionPattern<TsPutOp> {
+  WTDescMap &descMap;
+
+  TsPutPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m)
+      : OpConversionPattern(tc, ctx), descMap(m) {}
+
+  LogicalResult
+  matchAndRewrite(TsPutOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx    = op.getContext();
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto i64Ty   = rewriter.getI64Type();
+    auto idxTy   = rewriter.getIndexType();
+    auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
+
+    Value tsPtr = adaptor.getTs();
+    Value v     = adaptor.getValue();
+    Type elemTy = v.getType();
+
+    auto it = descMap.find(op.getTs());
+    if (it != descMap.end()) {
+      // ── windowed_temp: store at buf[pos*stride], then advance pos ─
+      //   buf[pos * stride] = v;  pos = (pos + 1 >= N) ? 0 : pos + 1
+      const WTDesc &desc = it->second;
+      int64_t N = static_cast<int64_t>(
+          llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
+      Value pos = rewriter.create<LLVM::LoadOp>(loc, i32Ty, desc.posPtr);
+      // Keep GEP index in i32 (cheap on GPU); LLVM accepts any int type.
+      Value gepIdx = applyStride(rewriter, loc, pos, desc.stride, i32Ty);
+      Value gep = rewriter.create<LLVM::GEPOp>(
+          loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
+      rewriter.create<LLVM::StoreOp>(loc, v, gep);
+
+      Value c1     = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value nCst   = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      Value zero32 = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(0));
+      Value posP1  = rewriter.create<LLVM::AddOp>(loc, pos, c1);
+      Value cmp    = rewriter.create<LLVM::ICmpOp>(loc, LLVM::ICmpPredicate::uge,
+                                                    posP1, nCst);
+      Value newPos = rewriter.create<LLVM::SelectOp>(loc, cmp, zero32, posP1);
+      rewriter.create<LLVM::StoreOp>(loc, newPos, desc.posPtr);
+      rewriter.eraseOp(op);
+    } else {
+      // ── global ts: write at current time ──────────────────────────
+      Value timeIdx = getCurrentTimeIdx(op);
+      if (!timeIdx) // not nested in an scf.for: fail before emitting any IR
+        return rewriter.notifyMatchFailure(op, "global ts.put outside scf.for");
+      Value gep = gmemGEPWithOffset(
+          rewriter, loc, elemTy, ptrTy, tsPtr, timeIdx, /*offsetIdx=*/Value(),
+          getNumStocksI64(rewriter, op, loc), idxTy, i64Ty);
+      rewriter.create<LLVM::StoreOp>(loc, v, gep);
+      rewriter.eraseOp(op);
+    }
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Pass
+//===----------------------------------------------------------------------===//
+
+struct ConvertKunGpuToLLVMPass
+    : ::impl::ConvertKunGpuToLLVMBase<ConvertKunGpuToLLVMPass> {
+
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    auto *ctx       = &getContext();
+
+    // ── Phase 1 ────────────────────────────────────────────────────────
+    {
+      SmallVector<kunir::FuncOp> kfns;
+      module.walk([&](kunir::FuncOp fn) { kfns.push_back(fn); });
+      for (kunir::FuncOp fn : kfns)
+        convertFuncSignature(fn);
+    }
+
+    // ── Phase 2 ────────────────────────────────────────────────────────
+    TypeConverter typeConv;
+    typeConv.addConversion([](Type t) { return t; });
+    typeConv.addConversion([](TsType t) -> Type {
+      return LLVM::LLVMPointerType::get(t.getContext());
+    });
+    auto materialize = [](OpBuilder &b, Type t, ValueRange vs, Location l) -> Value {
+      if (vs.size() != 1) return Value();
+      return b.create<UnrealizedConversionCastOp>(l, t, vs).getResult(0);
+    };
+    typeConv.addSourceMaterialization(materialize);
+    typeConv.addTargetMaterialization(materialize);
+
+    ConversionTarget target(*ctx);
+    target.addLegalDialect<func::FuncDialect, arith::ArithDialect,
+                           scf::SCFDialect, LLVM::LLVMDialect,
+                           gpu::GPUDialect>();
+    target.addLegalOp<ModuleOp, UnrealizedConversionCastOp>();
+    target.addIllegalOp<WindowedTempOp, TsGetOp, TsPutOp,
+                        TimeLengthOp, StockIdOp, BlockStockCountOp>();
+    target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
+      return typeConv.isSignatureLegal(op.getFunctionType()) &&
+             typeConv.isLegal(&op.getBody());
+    });
+    target.addDynamicallyLegalOp<func::ReturnOp>(
+        [&](func::ReturnOp op) { return typeConv.isLegal(op.getOperandTypes()); });
+
+    WTDescMap descMap;
+    int smemCounter = 0;
+
+    RewritePatternSet patterns(ctx);
+    populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(patterns,
+                                                                   typeConv);
+    populateReturnOpTypeConversionPattern(patterns, typeConv);
+    patterns.add<TimeLengthPattern, StockIdPattern, BlockStockCountPattern>(
+        typeConv, ctx);
+    patterns.add<WindowedTempPattern>(typeConv, ctx, descMap, smemCounter);
+    patterns.add<TsGetPattern, TsPutPattern>(typeConv, ctx, descMap);
+
+    if (failed(applyPartialConversion(module, target, std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+namespace kungpu {
+std::unique_ptr<mlir::Pass> createConvertKunGpuToLLVMPass() {
+  return std::make_unique<ConvertKunGpuToLLVMPass>();
+}
+} // namespace kungpu
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index b101c34..35012aa 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -47,9 +47,13 @@ enum class TsKind { Handle, Scalar };
 struct TsEntry { TsKind kind; Value value; };
 using TsMap = llvm::DenseMap<Value, TsEntry>;
 
-// If `v` is mapped as a Handle in tsMap, emit ts.get(handle, timeIdx) and
+// If `v` is mapped as a Handle in tsMap, emit ts.get(handle, offsetI32) and
 // promote the entry to Scalar.  Returns the scalar value.
-static Value getScalar(Value v, TsMap &tsMap, Value timeIdx,
+//
+// `offsetI32` is the tail-relative offset (i32):
+//   0 = latest (just put / current time step)
+//   k = k steps earlier
+static Value getScalar(Value v, TsMap &tsMap, Value offsetI32,
                        OpBuilder &b, Location loc) {
   auto it = tsMap.find(v);
   assert(it != tsMap.end() && "value not found in tsMap");
@@ -57,7 +61,7 @@ static Value getScalar(Value v, TsMap &tsMap, Value timeIdx,
     return it->second.value;
   auto tsTy = llvm::cast<TsType>(v.getType());
   Value scalar = b.create<TsGetOp>(loc, tsTy.getElementType(),
-                                    it->second.value, timeIdx);
+                                    it->second.value, offsetI32);
   it->second = {TsKind::Scalar, scalar};
   return scalar;
 }
@@ -75,21 +79,21 @@ static Value getScalar(Value v, TsMap &tsMap, Value timeIdx,
 // Handle-typed operands are loaded via ts.get (getScalar) on first use.
 static LogicalResult lowerBlock(
     llvm::ArrayRef<Operation *> ops,
-    TsMap &tsMap, Value timeIdx, OpBuilder &b, Location loc,
+    TsMap &tsMap, Value offsetI32, OpBuilder &b, Location loc,
     llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
   for (Operation *op : ops) {
     Location ol = op->getLoc();
     if (auto iface = dyn_cast<BinaryArithInterface>(op)) {
-      Value lhs = getScalar(op->getOperand(0), tsMap, timeIdx, b, ol);
-      Value rhs = getScalar(op->getOperand(1), tsMap, timeIdx, b, ol);
+      Value lhs = getScalar(op->getOperand(0), tsMap, offsetI32, b, ol);
+      Value rhs = getScalar(op->getOperand(1), tsMap, offsetI32, b, ol);
       tsMap[op->getResult(0)] = {TsKind::Scalar,
           iface.buildScalarOp(b, ol, lhs, rhs)};
     } else if (auto iface = dyn_cast<UnaryArithInterface>(op)) {
-      Value operand = getScalar(op->getOperand(0), tsMap, timeIdx, b, ol);
+      Value operand = getScalar(op->getOperand(0), tsMap, offsetI32, b, ol);
       tsMap[op->getResult(0)] = {TsKind::Scalar,
           iface.buildScalarOp(b, ol, operand)};
     } else if (auto ri = dyn_cast<ReduceArithInterface>(op)) {
-      Value elem = getScalar(op->getOperand(0), tsMap, timeIdx, b, ol);
+      Value elem = getScalar(op->getOperand(0), tsMap, offsetI32, b, ol);
       auto it = tsMap.find(op->getResult(0));
       assert(it != tsMap.end() && it->second.kind == TsKind::Scalar
              && "reduce result must be pre-seeded in tsMap with current acc");
@@ -107,12 +111,12 @@ static LogicalResult lowerBlock(
 // Overload that collects non-terminator ops from `block` and delegates.
 static LogicalResult lowerBlock(
     Block &block,
-    TsMap &tsMap, Value timeIdx, OpBuilder &b, Location loc,
+    TsMap &tsMap, Value offsetI32, OpBuilder &b, Location loc,
     llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
   SmallVector<Operation *> ops;
   for (Operation &op : block.without_terminator())
     ops.push_back(&op);
-  return lowerBlock(ops, tsMap, timeIdx, b, loc, handleUnknown);
+  return lowerBlock(ops, tsMap, offsetI32, b, loc, handleUnknown);
 }
 
 //===----------------------------------------------------------------------===//
@@ -187,8 +191,12 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   Value timeLen = b.create<TimeLengthOp>(loc, b.getIndexType());
   Value c0 = b.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = b.create<arith::ConstantIndexOp>(loc, 1);
+  // Outer-loop ts.get/put always reference the current time step, i.e.
+  // tail-relative offset = 0 (i32).  Created before outerFor so it dominates
+  // every use inside the loop body.
+  Value zeroOffsetI32 = b.create<arith::ConstantOp>(
+      loc, b.getI32Type(), b.getI32IntegerAttr(0));
   auto outerFor = b.create<scf::ForOp>(loc, c0, timeLen, c1);
-  Value t = outerFor.getInductionVar();
 
   // Erase the implicit empty scf.yield (no iter_args → zero-operand yield).
   outerFor.getBody()->back().erase();
@@ -224,8 +232,9 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     if (auto woOp = dyn_cast<WindowedOutputOp>(op)) {
       auto wt = b.create<WindowedTempOp>(ol, woOp.getResult().getType());
       tsMap[woOp.getResult()] = {TsKind::Handle, wt.getResult()};
-      Value inputScalar = getScalar(woOp.getInput(), tsMap, t, fb, ol);
-      fb.create<TsPutOp>(ol, wt.getResult(), t, inputScalar);
+      Value inputScalar =
+          getScalar(woOp.getInput(), tsMap, zeroOffsetI32, fb, ol);
+      fb.create<TsPutOp>(ol, wt.getResult(), inputScalar);
       return success();
     }
 
@@ -266,17 +275,21 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       // Create inner scf.for %w = 0 to window step 1 iter_args(acc_i = init_i).
       // The lambda form lets us emit a proper scf.yield as the body terminator
       // without fighting the implicit yield created by ensureTerminator.
-      Value wBound = fb.create<arith::ConstantIndexOp>(ol, window);
-      Value wM1    = fb.create<arith::ConstantIndexOp>(ol, window - 1);
+      Value wBound  = fb.create<arith::ConstantIndexOp>(ol, window);
+      Value wM1_i32 = fb.create<arith::ConstantOp>(
+          ol, fb.getI32Type(), fb.getI32IntegerAttr(window - 1));
 
       // Capture lowerBlock result since the lambda can't return LogicalResult.
       bool innerOk = true;
       auto innerFor = fb.create<scf::ForOp>(
           ol, c0, wBound, c1, initVals,
           [&](OpBuilder &ib, Location il, Value w, ValueRange iterArgs) {
-            // elemIdx = t - (window - 1) + w
-            Value base    = ib.create<arith::SubIOp>(il, t, wM1);
-            Value elemIdx = ib.create<arith::AddIOp>(il, base, w);
+            // Tail-relative offset for this window step.  Iterating w from 0
+            // to window-1 reads oldest-to-newest, i.e. offset = window-1-w.
+            Value w_i32 =
+                ib.create<arith::IndexCastOp>(il, ib.getI32Type(), w);
+            Value offsetI32 =
+                ib.create<arith::SubIOp>(il, wM1_i32, w_i32);
 
             // Seed innerTsMap: block args as handles; reduce results as acc.
             TsMap innerTsMap;
@@ -286,7 +299,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
               innerTsMap[yv.getDefiningOp()->getResult(0)] = {TsKind::Scalar,
                                                               iterArgs[i]};
 
-            if (failed(lowerBlock(body, innerTsMap, elemIdx, ib, il))) {
+            if (failed(lowerBlock(body, innerTsMap, offsetI32, ib, il))) {
               innerOk = false;
               ib.create<scf::YieldOp>(il, initVals); // keep IR structurally valid
               return;
@@ -311,7 +324,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     return op.emitError("kunir-to-kungpu: unhandled op in outer block");
   };
 
-  if (failed(lowerBlock(origOps, tsMap, t, fb, loc, outerHandler)))
+  if (failed(lowerBlock(origOps, tsMap, zeroOffsetI32, fb, loc, outerHandler)))
     return signalPassFailure();
 
   // ------------------------------------------------------------------
@@ -320,7 +333,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   for (auto [outParam, rv] : llvm::zip(outParams, tsRetVals)) {
     auto it = tsMap.find(rv);
     assert(it != tsMap.end() && it->second.kind == TsKind::Scalar);
-    fb.create<TsPutOp>(loc, outParam, t, it->second.value);
+    fb.create<TsPutOp>(loc, outParam, it->second.value);
   }
   fb.create<scf::YieldOp>(loc);
 
diff --git a/mlir/test/kungpu/basic.mlir b/mlir/test/kungpu/basic.mlir
index 2490685..1099641 100644
--- a/mlir/test/kungpu/basic.mlir
+++ b/mlir/test/kungpu/basic.mlir
@@ -36,12 +36,12 @@ kunir.func @test_ts_get_put(%ts_in: !kunir.ts<f32, inf>, %ts_out: !kunir.ts<f32,
     inputs {%ts_in = "ts_in"}
     outputs {%ts_out = "ts_out"}
     target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} {
-  %c0 = arith.constant 0 : index
+  %off = arith.constant 0 : i32
   // CHECK: kungpu.ts.get
   // CHECK-SAME: <f32, inf> -> f32
-  %v = kungpu.ts.get %ts_in[%c0] : !kunir.ts<f32, inf> -> f32
+  %v = kungpu.ts.get %ts_in[%off] : !kunir.ts<f32, inf> -> f32
   // CHECK: kungpu.ts.put
-  kungpu.ts.put %ts_out[%c0], %v : !kunir.ts<f32, 1>, f32
+  kungpu.ts.put %ts_out, %v : !kunir.ts<f32, 1>, f32
   kunir.return
 }
 
@@ -50,10 +50,10 @@ kunir.func @test_windowed_temp()
     inputs {} outputs {"v"}
     target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
     -> f32 {
-  %c0 = arith.constant 0 : index
+  %off = arith.constant 0 : i32
   // CHECK: %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
   %wt = kungpu.windowed_temp : !kunir.ts<f32, 5>
   // CHECK: kungpu.ts.get %[[WT]]
-  %v = kungpu.ts.get %wt[%c0] : !kunir.ts<f32, 5> -> f32
+  %v = kungpu.ts.get %wt[%off] : !kunir.ts<f32, 5> -> f32
   kunir.return %v : f32
 }
diff --git a/mlir/test/kungpu/lower_to_llvm.mlir b/mlir/test/kungpu/lower_to_llvm.mlir
new file mode 100644
index 0000000..30db0e9
--- /dev/null
+++ b/mlir/test/kungpu/lower_to_llvm.mlir
@@ -0,0 +1,216 @@
+// RUN: %kun-opt --convert-kungpu-to-llvm %s | %FileCheck %s
+
+// =====================================================================
+// Smem global emitted by `test_windowed_smem` lands at module scope.
+// =====================================================================
+// CHECK:       llvm.mlir.global internal @[[SMEM:__smem_test_windowed_smem_[0-9]+]]()
+// CHECK-SAME:  {addr_space = 3 : i32}
+// CHECK-SAME:  !llvm.array<{{[0-9]+}} x f32>
+
+
+// =====================================================================
+// Case 1 — gmem-only: signature change, time_length lowering, TxS GEPs.
+// =====================================================================
+//
+// CHECK-LABEL: func.func @test_copy(
+// CHECK-SAME:    %[[TL:[^:]+]]: i32,
+// CHECK-SAME:    %[[NS:[^:]+]]: i32,
+// CHECK-SAME:    %[[IN:[^:]+]]: !llvm.ptr,
+// CHECK-SAME:    %[[OUT:[^:]+]]: !llvm.ptr
+// kunir-func metadata is preserved as discardable attributes on func.func.
+// CHECK-SAME:    kungpu.input_names = ["in"]
+// CHECK-SAME:    kungpu.output_names = ["out"]
+// CHECK-SAME:    kungpu.target_spec = #kunir<target_spec{
+//
+// time_length → arith.index_cast of arg0 (i32 → index)
+// CHECK:       %[[TLIDX:.*]] = arith.index_cast %[[TL]] : i32 to index
+// CHECK:       %[[OFFCST:.*]] = arith.constant 0 : i32
+//
+// CHECK:       scf.for %[[T:.*]] = %{{.*}} to %[[TLIDX]] step %{{.*}}
+//
+// ── ts.get on global %in at offset 0 ───────────────────────────────────
+// effective time = t − 0; stock_id = bid*bdim + tid; lin = effT*ns + sid.
+// num_stocks (i32 arg[1]) is sign-extended to i64 for the linear index.
+// CHECK:         %[[OFFI:.*]] = arith.index_cast %[[OFFCST]] : i32 to index
+// CHECK:         %[[NS64:.*]] = arith.extsi %[[NS]] : i32 to i64
+// CHECK:         %[[EFFT:.*]] = arith.subi %[[T]], %[[OFFI]] : index
+// CHECK:         %[[EFFT64:.*]] = arith.index_cast %[[EFFT]] : index to i64
+// CHECK:         %[[TID:.*]] = gpu.thread_id  x
+// CHECK:         %[[BID:.*]] = gpu.block_id   x
+// CHECK:         %[[BDIM:.*]] = gpu.block_dim  x
+// CHECK:         %[[BTB:.*]] = arith.muli %[[BID]], %[[BDIM]]
+// CHECK:         %[[SID:.*]] = arith.addi %[[BTB]], %[[TID]]
+// CHECK:         %[[SIDI:.*]] = arith.index_cast %[[SID]] : index to i64
+// CHECK:         %[[ROW:.*]] = arith.muli %[[EFFT64]], %[[NS64]] : i64
+// CHECK:         %[[LIN:.*]] = arith.addi %[[ROW]], %[[SIDI]] : i64
+// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[IN]][%[[LIN]]] {{.*}} -> !llvm.ptr, f32
+// CHECK:         %[[V:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> f32
+//
+// ── ts.put on global %out (no offset; writes at current iv) ───────────
+// CHECK:         %[[NS64B:.*]] = arith.extsi %[[NS]] : i32 to i64
+// CHECK:         %[[T64:.*]] = arith.index_cast %[[T]] : index to i64
+// CHECK:         %[[ROW2:.*]] = arith.muli %[[T64]], %[[NS64B]] : i64
+// CHECK:         %[[LIN2:.*]] = arith.addi %[[ROW2]],
+// CHECK:         %[[GEP2:.*]] = llvm.getelementptr %[[OUT]][%[[LIN2]]]
+// CHECK:         llvm.store %[[V]], %[[GEP2]]
+// CHECK:       return
+kunir.func @test_copy(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} {
+  %tl = kungpu.time_length
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %off = arith.constant 0 : i32
+  scf.for %t = %c0 to %tl step %c1 {
+    %v = kungpu.ts.get %in[%off] : !kunir.ts<f32, inf> -> f32
+    kungpu.ts.put %out, %v : !kunir.ts<f32, 1>, f32
+  }
+  kunir.return
+}
+
+
+// =====================================================================
+// Case 2 — windowed_temp in local memory: alloca buffer + i32 pos cell,
+//          circular put/get (no modulo).
+// =====================================================================
+//
+// CHECK-LABEL: func.func @test_windowed_local
+// CHECK-SAME:  i32
+// CHECK-SAME:  i32
+// CHECK-SAME:  !llvm.ptr
+// CHECK-SAME:  !llvm.ptr
+//
+// ── windowed_temp lowering — buf alloca + 1×i32 pos cell init to 0 ────
+// CHECK:       %[[NCST:.*]] = llvm.mlir.constant(5 : i32) : i32
+// CHECK:       %[[BUF:.*]] = llvm.alloca %[[NCST]] x f32 : (i32) -> !llvm.ptr
+// CHECK:       %[[ONE32A:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       %[[POS:.*]] = llvm.alloca %[[ONE32A]] x i32 : (i32) -> !llvm.ptr
+// CHECK:       %[[ZERO32:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       llvm.store %[[ZERO32]], %[[POS]] : i32, !llvm.ptr
+//
+// CHECK:       scf.for %[[T:.*]] =
+//
+// ── ts.put %wt, %v (circular write):  buf[pos] = v; pos = (pos+1>=N)?0:pos+1
+// (GEP index is i32 — no sext, since LLVM accepts any int type for indices.)
+// CHECK:         %[[V:.*]] = llvm.load %{{.*}} : !llvm.ptr -> f32
+// CHECK:         %[[P:.*]] = llvm.load %[[POS]] : !llvm.ptr -> i32
+// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[BUF]][%[[P]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.store %[[V]], %[[GEP]] : f32, !llvm.ptr
+// CHECK:         %[[ONE32:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:         %[[N32:.*]] = llvm.mlir.constant(5 : i32) : i32
+// CHECK:         %[[Z32:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:         %[[PP1:.*]] = llvm.add %[[P]], %[[ONE32]] : i32
+// CHECK:         %[[CMP:.*]] = llvm.icmp "uge" %[[PP1]], %[[N32]] : i32
+// CHECK:         %[[NEW:.*]] = llvm.select %[[CMP]], %[[Z32]], %[[PP1]] : i1, i32
+// CHECK:         llvm.store %[[NEW]], %[[POS]] : i32, !llvm.ptr
+//
+// ── ts.get %wt[off] (circular read):
+//      adj=off+1; idx = pos>=adj ? pos-adj : pos+N-adj; return buf[idx]
+// CHECK:         %[[OF:.*]] = arith.index_cast %{{.*}} : index to i32
+// CHECK:         %[[P2:.*]] = llvm.load %[[POS]] : !llvm.ptr -> i32
+// CHECK:         %[[ONE32B:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:         %[[N32B:.*]] = llvm.mlir.constant(5 : i32) : i32
+// CHECK:         %[[ADJ:.*]] = llvm.add %[[OF]], %[[ONE32B]] : i32
+// CHECK:         %[[GE:.*]] = llvm.icmp "uge" %[[P2]], %[[ADJ]] : i32
+// CHECK:         %[[PMA:.*]] = llvm.sub %[[P2]], %[[ADJ]] : i32
+// CHECK:         %[[PPN:.*]] = llvm.add %[[P2]], %[[N32B]] : i32
+// CHECK:         %[[WR:.*]] = llvm.sub %[[PPN]], %[[ADJ]] : i32
+// CHECK:         %[[IDX:.*]] = llvm.select %[[GE]], %[[PMA]], %[[WR]] : i1, i32
+// CHECK:         %[[GGEP:.*]] = llvm.getelementptr %[[BUF]][%[[IDX]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.load %[[GGEP]] : !llvm.ptr -> f32
+kunir.func @test_windowed_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} {
+  %wt = kungpu.windowed_temp : !kunir.ts<f32, 5> {kungpu.smem = false}
+  %tl = kungpu.time_length
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %off0 = arith.constant 0 : i32
+  scf.for %t = %c0 to %tl step %c1 {
+    %v  = kungpu.ts.get %in[%off0] : !kunir.ts<f32, inf> -> f32
+    kungpu.ts.put %wt, %v : !kunir.ts<f32, 5>, f32
+    // dynamic offset (so we exercise the SSA pos→idx computation)
+    %off_idx = arith.subi %t, %c0 : index
+    %off_i32 = arith.index_cast %off_idx : index to i32
+    %w  = kungpu.ts.get %wt[%off_i32] : !kunir.ts<f32, 5> -> f32
+    kungpu.ts.put %out, %w : !kunir.ts<f32, 1>, f32
+  }
+  kunir.return
+}
+
+
+// =====================================================================
+// Case 3 — windowed_temp in shared memory: slot-major layout.
+//
+//   layout:        smem[ slot*K + tid ]   (K = threads_per_block)
+//   global size:   N * K elements
+//   per-thread base:  bufPtr = smem + tid           (1 GEP at allocation)
+//   per-access stride: K  (bufPtr[idx*K] in ts.put / ts.get)
+// =====================================================================
+//
+// Global has 5*128 = 640 elements (N=5, warps_per_cta=4 → K=128).
+// (already checked at the top: !llvm.array<{{[0-9]+}} x f32>)
+//
+// CHECK-LABEL: func.func @test_windowed_smem
+// CHECK:       %[[RAW:.*]] = llvm.mlir.addressof @[[SMEM]] : !llvm.ptr<3>
+// CHECK:       %[[GEN:.*]] = llvm.addrspacecast %[[RAW]] : !llvm.ptr<3> to !llvm.ptr
+// CHECK:       %[[TID:.*]] = gpu.thread_id  x
+// CHECK:       %[[TIDI:.*]] = arith.index_cast %[[TID]] : index to i32
+// bufPtr = smem + tid  (no per-allocation N multiply)
+// CHECK:       %[[BUF3:.*]] = llvm.getelementptr %[[GEN]][%[[TIDI]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// pos cell still alloca'd (i32)
+// CHECK:       llvm.alloca {{.*}} x i32
+//
+// Inside the loop — ts.put: stride-K multiply before the GEP.
+// CHECK:         %[[POSV:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32
+// CHECK:         %[[K:.*]] = llvm.mlir.constant(128 : i32) : i32
+// CHECK:         %[[OFFP:.*]] = llvm.mul %[[POSV]], %[[K]] : i32
+// CHECK:         %[[GEPP:.*]] = llvm.getelementptr %[[BUF3]][%[[OFFP]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.store %{{.*}}, %[[GEPP]]
+//
+// ts.get: same stride-K pattern.
+// CHECK:         %[[K2:.*]] = llvm.mlir.constant(128 : i32) : i32
+// CHECK:         %[[OFFG:.*]] = llvm.mul %{{.*}}, %[[K2]] : i32
+// CHECK:         %[[GEPG:.*]] = llvm.getelementptr %[[BUF3]][%[[OFFG]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.load %[[GEPG]]
+kunir.func @test_windowed_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} {
+  %wt = kungpu.windowed_temp : !kunir.ts<f32, 5> {kungpu.smem = true}
+  %tl = kungpu.time_length
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %off0 = arith.constant 0 : i32
+  scf.for %t = %c0 to %tl step %c1 {
+    %v  = kungpu.ts.get %in[%off0] : !kunir.ts<f32, inf> -> f32
+    kungpu.ts.put %wt, %v : !kunir.ts<f32, 5>, f32
+    %w  = kungpu.ts.get %wt[%off0] : !kunir.ts<f32, 5> -> f32
+    kungpu.ts.put %out, %w : !kunir.ts<f32, 1>, f32
+  }
+  kunir.return
+}
+
+
+// =====================================================================
+// Case 4 — stock_id and block_stock_count lowering.
+// =====================================================================
+//
+// CHECK-LABEL: func.func @test_indexing
+// CHECK:       gpu.thread_id  x
+// CHECK-NEXT:  gpu.block_id   x
+// CHECK-NEXT:  gpu.block_dim  x
+// CHECK-NEXT:  arith.muli
+// CHECK-NEXT:  arith.addi
+// CHECK:       gpu.block_dim  x
+kunir.func @test_indexing(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} {
+  %sid = kungpu.stock_id
+  %bsc = kungpu.block_stock_count
+  %sum = arith.addi %sid, %bsc : index
+  kunir.return
+}
diff --git a/mlir/test/kunir/lower_to_kungpu.mlir b/mlir/test/kunir/lower_to_kungpu.mlir
index e9241a1..fe6a7e9 100644
--- a/mlir/test/kunir/lower_to_kungpu.mlir
+++ b/mlir/test/kunir/lower_to_kungpu.mlir
@@ -13,11 +13,14 @@ kunir.func @test_binary_lower(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
   // CHECK:      %[[TL:.*]] = kungpu.time_length
   // CHECK:      %[[C0:.*]] = arith.constant 0 : index
   // CHECK:      %[[C1:.*]] = arith.constant 1 : index
+  // outer-loop offset = 0 (i32) used by every gmem ts.get (ts.put takes no offset)
+  // CHECK:      %[[OFF:.*]] = arith.constant 0 : i32
   // CHECK:      scf.for %{{.*}} = %[[C0]] to %[[TL]] step %[[C1]]
-  // CHECK:        kungpu.ts.get
-  // CHECK:        kungpu.ts.get
+  // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF]]]
+  // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF]]]
   // CHECK:        arith.addf
   // CHECK:        kungpu.ts.put
+  // CHECK-NOT:    kungpu.ts.put %{{.*}}[
   %sum = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
   kunir.return %sum : !kunir.ts<f32, 1>
 }
@@ -41,13 +44,19 @@ kunir.func @test_windowed_sum(%close: !kunir.ts<f32, inf>)
     -> !kunir.ts<f32, 1> {
   // CHECK:      %[[C0:.*]] = arith.constant 0 : index
   // CHECK:      %[[C1:.*]] = arith.constant 1 : index
+  // CHECK:      %[[OFF0:.*]] = arith.constant 0 : i32
   // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
   // CHECK:      scf.for %[[T:.*]] =
-  // CHECK:        kungpu.ts.get %{{.*}}[%[[T]]]
-  // CHECK:        kungpu.ts.put %[[WT]][%[[T]]]
+  // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF0]]]
+  // outer-loop ts.put has no offset operand
+  // CHECK:        kungpu.ts.put %[[WT]], %{{[^[]+}} : <f32, 5>, f32
   // CHECK:        %[[WIN:.*]] = arith.constant 5 : index
-  // CHECK:        scf.for %{{.*}} = %[[C0]] to %[[WIN]] step %[[C1]] iter_args
-  // CHECK:          kungpu.ts.get %[[WT]]
+  // window-loop offset = (window-1) - w  (oldest first)
+  // CHECK:        %[[WM1:.*]] = arith.constant 4 : i32
+  // CHECK:        scf.for %[[W:.*]] = %[[C0]] to %[[WIN]] step %[[C1]] iter_args
+  // CHECK:          %[[WI:.*]] = arith.index_cast %[[W]] : index to i32
+  // CHECK:          %[[OFFW:.*]] = arith.subi %[[WM1]], %[[WI]] : i32
+  // CHECK:          kungpu.ts.get %[[WT]][%[[OFFW]]]
   // CHECK:          arith.addf
   %w = kunir.windowed_output %close [length = 5] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 5>
   %sum = kunir.for_each_back_window
@@ -70,8 +79,8 @@ kunir.func @test_computed_reduce(%x: !kunir.ts<f32, inf>, %y: !kunir.ts<f32, inf
   // CHECK:      %[[WY:.*]] = kungpu.windowed_temp : <f32, 3>
   // CHECK:      scf.for
   // CHECK:        scf.for {{.*}} iter_args
-  // CHECK:          %[[A:.*]] = kungpu.ts.get %[[WX]]
-  // CHECK:          %[[B:.*]] = kungpu.ts.get %[[WY]]
+  // CHECK:          %[[A:.*]] = kungpu.ts.get %[[WX]][%{{.*}}]
+  // CHECK:          %[[B:.*]] = kungpu.ts.get %[[WY]][%{{.*}}]
   // CHECK:          %[[P:.*]] = arith.mulf %[[A]], %[[B]]
   // CHECK:          arith.addf {{.*}}, %[[P]]
   %wx = kunir.windowed_output %x [length = 3] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 3>
@@ -96,17 +105,17 @@ kunir.func @test_multi_reduce(%input: !kunir.ts<f64, inf>)
     -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
   // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f64, 10>
   // CHECK:      scf.for %[[T:.*]] =
-  // CHECK:        kungpu.ts.get %[[IN]][%[[T]]]
-  // CHECK:        kungpu.ts.put %[[WT]][%[[T]]]
+  // CHECK:        kungpu.ts.get %[[IN]][%{{.*}}]
+  // CHECK:        kungpu.ts.put %[[WT]], %{{[^[]+}} : <f64, 10>, f64
   // CHECK:        %[[CST0:.*]] = arith.constant 0.0{{.*}} : f64
   // CHECK:        %[[NEGINF:.*]] = arith.constant 0xFFF0000000000000 : f64
   // CHECK:        %[[R:.*]]:2 = scf.for {{.*}} iter_args(%{{.*}} = %[[CST0]], %{{.*}} = %[[NEGINF]]) -> (f64, f64)
-  // CHECK:          kungpu.ts.get %[[WT]]
+  // CHECK:          kungpu.ts.get %[[WT]][%{{.*}}]
   // CHECK:          arith.addf
   // CHECK:          arith.maximumf
   // CHECK:          scf.yield {{.*}}, {{.*}} : f64, f64
-  // CHECK:        kungpu.ts.put %[[OUT0]][%[[T]]], %[[R]]#0 : <f64, 1>, f64
-  // CHECK:        kungpu.ts.put %[[OUT1]][%[[T]]], %[[R]]#1 : <f64, 1>, f64
+  // CHECK:        kungpu.ts.put %[[OUT0]], %[[R]]#0 : <f64, 1>, f64
+  // CHECK:        kungpu.ts.put %[[OUT1]], %[[R]]#1 : <f64, 1>, f64
   %w = kunir.windowed_output %input [length = 10] : !kunir.ts<f64, inf> -> !kunir.ts<f64, 10>
   %sum, %max = kunir.for_each_back_window
       (%w : !kunir.ts<f64, 10>) [window = 10]

From 7e6245810743fb9033dc760769007ad700447eb7 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 6 May 2026 21:32:18 -0700
Subject: [PATCH 06/59] pipeline to llvm

---
 mlir/Tools/kun-opt/CMakeLists.txt            |  12 ++
 mlir/Tools/kun-opt/kun-opt.cpp               |  22 +++-
 mlir/include/KunGpu/KunGpuUtils.h            |  38 +++---
 mlir/include/KunGpu/Pipelines.h              |  47 +++++++
 mlir/include/KunIr/Passes.h                  |   5 +
 mlir/lib/KunGpu/CMakeLists.txt               |   9 ++
 mlir/lib/KunGpu/KunGpuToLLVM.cpp             |  58 +++++----
 mlir/lib/KunGpu/Pipelines.cpp                | 122 +++++++++++++++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp             |   3 +
 mlir/test/kungpu/kunir_to_llvm_pipeline.mlir |  60 +++++++++
 mlir/test/kungpu/lower_to_llvm.mlir          |  32 +++--
 11 files changed, 348 insertions(+), 60 deletions(-)
 create mode 100644 mlir/include/KunGpu/Pipelines.h
 create mode 100644 mlir/lib/KunGpu/Pipelines.cpp
 create mode 100644 mlir/test/kungpu/kunir_to_llvm_pipeline.mlir

diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
index b35c5ac..b3c8eee 100644
--- a/mlir/Tools/kun-opt/CMakeLists.txt
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -21,6 +21,18 @@ target_link_libraries(kun-opt PRIVATE
   MLIRSCFDialect
   MLIRGPUDialect
   MLIRLLVMDialect
+  MLIRControlFlowDialect
+  MLIRIndexDialect
+
+  # Conversion passes used by the kunir-to-llvm pipeline
+  MLIRSCFToControlFlow
+  MLIRControlFlowToLLVM
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
+  MLIRIndexToLLVM
+  MLIRGPUToNVVMTransforms
+  MLIRNVVMDialect
+  MLIRReconcileUnrealizedCasts
 
   # Core MLIR libraries
   MLIRIR
diff --git a/mlir/Tools/kun-opt/kun-opt.cpp b/mlir/Tools/kun-opt/kun-opt.cpp
index 2181951..1e49143 100644
--- a/mlir/Tools/kun-opt/kun-opt.cpp
+++ b/mlir/Tools/kun-opt/kun-opt.cpp
@@ -1,15 +1,19 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Conversion/Passes.h"
 #include "mlir/IR/DialectRegistry.h"
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "mlir/Transforms/Passes.h"
 
 #include "KunGpu/KunGpuDialect.h"
 #include "KunGpu/Passes.h"
+#include "KunGpu/Pipelines.h"
 #include "KunIr/KunIrDialect.h"
 #include "KunIr/KunIrOps.h"
 #include "KunIr/Passes.h"
@@ -24,14 +28,30 @@ int main(int argc, char **argv) {
   registry.insert<mlir::scf::SCFDialect>();
   registry.insert<mlir::gpu::GPUDialect>();
   registry.insert<mlir::LLVM::LLVMDialect>();
+  registry.insert<mlir::cf::ControlFlowDialect>();
+  registry.insert<mlir::index::IndexDialect>();
 
   // KunQuant dialects
   registry.insert<kunir::KunIrDialect>();
   registry.insert<kungpu::KunGpuDialect>();
 
-  // KunQuant passes
+  // KunQuant passes & pipelines
   kunir::registerKunIrToKunGpuPass();
   kungpu::registerKunGpuPasses();
+  kungpu::registerKunIrToLLVMPass();
+
+  // Upstream passes used by the kunir-to-llvm pipeline (also lets users
+  // build the pipeline manually via --pass-pipeline=… for debugging).
+  mlir::registerCanonicalizerPass();
+  mlir::registerCSEPass();
+  mlir::registerLoopInvariantCodeMotionPass();
+  mlir::registerSCFToControlFlowPass();
+  mlir::registerConvertControlFlowToLLVMPass();
+  mlir::registerArithToLLVMConversionPass();
+  mlir::registerConvertIndexToLLVMPass();
+  mlir::registerConvertFuncToLLVMPass();
+  mlir::registerConvertGpuOpsToNVVMOpsPass();
+  mlir::registerReconcileUnrealizedCastsPass();
 
   return mlir::asMainReturnCode(
       mlir::MlirOptMain(argc, argv, "KunQuant MLIR optimizer\n", registry));
diff --git a/mlir/include/KunGpu/KunGpuUtils.h b/mlir/include/KunGpu/KunGpuUtils.h
index 8258df7..46eb6c8 100644
--- a/mlir/include/KunGpu/KunGpuUtils.h
+++ b/mlir/include/KunGpu/KunGpuUtils.h
@@ -1,58 +1,48 @@
 //===- KunGpuUtils.h - Lookup helpers for kungpu metadata on func ops ----===//
 //
-// After convert-kungpu-to-llvm phase 1 lowers `kunir.func` to `func.func`,
-// the original kunir.func metadata (target spec, input/output names) is
-// preserved as discardable attributes on the new func.func.  Use these
-// accessors instead of reading attributes by name in callers — they are
-// the func.func equivalents of `kunir::FuncOp::getTargetSpec` etc.
+// After convert-kungpu-to-llvm lowers `kunir.func` to `gpu.func`, the
+// original kunir.func metadata (target spec, input/output names) is
+// preserved as discardable attributes on the new gpu.func.  Accessors take
+// `Operation*` so they also work on whatever the gpu.func is later
+// rewritten to (e.g. `llvm.func` after convert-gpu-to-nvvm).
 //
 //===----------------------------------------------------------------------===//
 
 #pragma once
 
 #include "KunIr/KunIrAttrs.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace kungpu {
 
-/// Discardable-attribute names used to attach kunir.func metadata to a
-/// func.func after phase 1 of convert-kungpu-to-llvm.
+/// Discardable-attribute names used to attach kunir.func metadata to the
+/// kernel function after phase 1 of convert-kungpu-to-llvm.
 constexpr llvm::StringLiteral kFuncTargetSpecAttr  = "kungpu.target_spec";
 constexpr llvm::StringLiteral kFuncInputNamesAttr  = "kungpu.input_names";
 constexpr llvm::StringLiteral kFuncOutputNamesAttr = "kungpu.output_names";
 
-/// Read the target_spec attribute from a func.func (lowered from a
-/// kunir.func).  Returns null if the attribute is missing.
-inline ::kunir::TargetSpecAttr
-getFuncTargetSpec(::mlir::func::FuncOp fn) {
+inline ::kunir::TargetSpecAttr getFuncTargetSpec(::mlir::Operation *fn) {
   return fn->getAttrOfType<::kunir::TargetSpecAttr>(kFuncTargetSpecAttr);
 }
-inline void setFuncTargetSpec(::mlir::func::FuncOp fn,
+inline void setFuncTargetSpec(::mlir::Operation *fn,
                                 ::kunir::TargetSpecAttr spec) {
   fn->setAttr(kFuncTargetSpecAttr, spec);
 }
 
-/// Read the input_names array attribute from a func.func.  The array
-/// contains one StringAttr per ts input parameter; null if missing.
-inline ::mlir::ArrayAttr
-getFuncInputNames(::mlir::func::FuncOp fn) {
+inline ::mlir::ArrayAttr getFuncInputNames(::mlir::Operation *fn) {
   return fn->getAttrOfType<::mlir::ArrayAttr>(kFuncInputNamesAttr);
 }
-inline void setFuncInputNames(::mlir::func::FuncOp fn,
+inline void setFuncInputNames(::mlir::Operation *fn,
                                 ::mlir::ArrayAttr names) {
   fn->setAttr(kFuncInputNamesAttr, names);
 }
 
-/// Read the output_names array attribute from a func.func.  The array
-/// contains one StringAttr per ts output (function arg in void form, or
-/// result in non-void form); null if missing.
-inline ::mlir::ArrayAttr
-getFuncOutputNames(::mlir::func::FuncOp fn) {
+inline ::mlir::ArrayAttr getFuncOutputNames(::mlir::Operation *fn) {
   return fn->getAttrOfType<::mlir::ArrayAttr>(kFuncOutputNamesAttr);
 }
-inline void setFuncOutputNames(::mlir::func::FuncOp fn,
+inline void setFuncOutputNames(::mlir::Operation *fn,
                                  ::mlir::ArrayAttr names) {
   fn->setAttr(kFuncOutputNamesAttr, names);
 }
diff --git a/mlir/include/KunGpu/Pipelines.h b/mlir/include/KunGpu/Pipelines.h
new file mode 100644
index 0000000..aaff937
--- /dev/null
+++ b/mlir/include/KunGpu/Pipelines.h
@@ -0,0 +1,47 @@
+//===- Pipelines.h - Reusable kunir → LLVM lowering pipeline -------------===//
+//
+// Defines the canonical lowering pipeline that converts a `kunir.func`-based
+// module all the way down to the LLVM dialect.  Phase ordering:
+//
+//   1. kunir-to-kungpu                    (kunir.func nested)
+//   2. kungpu-memory-planning             (kunir.func nested)
+//   3. convert-kungpu-to-llvm             (module — also lowers kunir.func
+//                                          to gpu.func)
+//   4. loop-invariant-code-motion         (per func)
+//   5. canonicalize
+//   6. cse
+//   7. convert-scf-to-cf
+//   8. convert-control-flow-to-llvm
+//   9. convert-arith-to-llvm
+//  10. convert-index-to-llvm
+//  11. convert-func-to-llvm
+//  12. reconcile-unrealized-casts
+//
+// `kunir_to_ptx` will reuse `buildKunIrToLLVMPipeline` and append the
+// gpu→nvvm/llvm-translation passes after it.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+class OpPassManager;
+} // namespace mlir
+
+namespace kungpu {
+
+/// Append the kunir → LLVM dialect lowering passes to `pm`.  This is the
+/// shared entry point used by both the test wrapper pass below and by any
+/// downstream pipeline that needs to lower further (e.g. kunir_to_ptx).
+void buildKunIrToLLVMPipeline(::mlir::OpPassManager &pm);
+
+/// Single-pass wrapper that runs `buildKunIrToLLVMPipeline` on the current
+/// module.  Mainly for lit-testing the pipeline as a whole.
+std::unique_ptr<::mlir::Pass> createKunIrToLLVMPass();
+
+void registerKunIrToLLVMPass();
+
+} // namespace kungpu
diff --git a/mlir/include/KunIr/Passes.h b/mlir/include/KunIr/Passes.h
index 773292c..6bc4c3e 100644
--- a/mlir/include/KunIr/Passes.h
+++ b/mlir/include/KunIr/Passes.h
@@ -1,5 +1,10 @@
 #pragma once
 
+#include <memory>
+
+namespace mlir { class Pass; }
+
 namespace kunir {
 void registerKunIrToKunGpuPass();
+std::unique_ptr<::mlir::Pass> createKunIrToKunGpuPass();
 } // namespace kunir
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index db9c05b..cd39c26 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   KunGpuOps.cpp
   KunGpuMemoryPlanning.cpp
   KunGpuToLLVM.cpp
+  Pipelines.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/mlir/include
@@ -20,7 +21,15 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   MLIRArithDialect
   MLIRGPUDialect
   MLIRLLVMDialect
+  MLIRTransforms
   MLIRTransformUtils
   MLIRSideEffectInterfaces
   MLIRKunIrDialect
+  MLIRSCFToControlFlow
+  MLIRControlFlowToLLVM
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
+  MLIRIndexToLLVM
+  MLIRGPUToNVVMTransforms
+  MLIRNVVMDialect
 )
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 54072f4..59a7400 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -1,16 +1,19 @@
-//===- KunGpuToLLVM.cpp - Lower kungpu + kunir.func → func + LLVM ---------===//
+//===- KunGpuToLLVM.cpp - Lower kungpu + kunir.func → gpu.func + LLVM ---===//
 //
-// Two-phase pass.
+// Assumes the input module is a `gpu.module` (or that the kunir.func lives
+// inside one).  Two-phase pass.
 //
 // Phase 1 (convertFuncSignature, simple imperative helper):
 //   kunir.func @f(%a: !kunir.ts<…>, …)
-//     → func.func @f(%t: i32, %n: i32, %a: !kunir.ts<…>, …)
+//     → gpu.func @f(%t: i32, %n: i32, %a: !kunir.ts<…>, …) kernel
+//   inserted into the same gpu.module that contained the kunir.func.
 //   The two prepended i32 arguments are time_length and num_stocks
 //   (i32 because 64-bit ops are slow on GPUs; the linear gmem address
-//   is still computed in i64).  ts arg types are preserved.
+//   is still computed in i64).  ts arg types are preserved here — phase 2
+//   converts them to !llvm.ptr via the standard signature-conversion pattern.
 //   target_spec, input_names and output_names are moved to discardable
 //   attributes (see KunGpuUtils.h accessors).
-//   kunir.return → func.return.
+//   kunir.return → gpu.return.
 //
 // Phase 2 (applyPartialConversion, one OpConversionPattern per op):
 //   TypeConverter:  !kunir.ts<T,N> → !llvm.ptr
@@ -51,6 +54,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
@@ -95,14 +99,22 @@ static void convertFuncSignature(kunir::FuncOp fn) {
   for (Type t : oldFT.getInputs())
     newArgTypes.push_back(t);
 
+  // Build gpu.func right before the kunir.func — both live inside the
+  // enclosing gpu.module.
   OpBuilder b(fn);
-  auto newFunc = b.create<func::FuncOp>(
+  auto newFunc = b.create<gpu::GPUFuncOp>(
       loc, fn.getSymName(), FunctionType::get(ctx, newArgTypes, {}));
-  newFunc.setVisibility(SymbolTable::Visibility::Public);
+  // Mark as a kernel (sets the op-level `kernel` attribute) so that
+  // convert-gpu-to-nvvm tags the resulting llvm.func with `nvvm.kernel`.
+  newFunc.setKernelAttr(UnitAttr::get(ctx));
   setFuncTargetSpec (newFunc, fn.getTargetSpecAttr());
   setFuncInputNames (newFunc, fn.getInputNames());
   setFuncOutputNames(newFunc, fn.getOutputNames());
 
+  // gpu.func's auto-created entry block is replaced with the kunir.func
+  // body.  Block-arg types initially still match the kunir.func signature;
+  // phase 2's signature-conversion pattern reconciles them with the new
+  // gpu.func type (ts → !llvm.ptr).
   newFunc.getBody().takeBody(fn.getBody());
   Block &entry = newFunc.getBody().front();
   entry.insertArgument(0u, i32Ty, loc);
@@ -112,7 +124,7 @@ static void convertFuncSignature(kunir::FuncOp fn) {
   newFunc.walk([&](kunir::ReturnOp r) { returns.push_back(r); });
   for (kunir::ReturnOp r : returns) {
     OpBuilder rb(r);
-    rb.create<func::ReturnOp>(r.getLoc());
+    rb.create<gpu::ReturnOp>(r.getLoc());
     r.erase();
   }
   fn.erase();
@@ -134,7 +146,7 @@ static Value emitStockId(OpBuilder &b, Location loc, Type idxTy) {
 // address computation.  The bare i32 value is in arg[1]; we extend at every
 // use site (cheap, and lets the caller decide).
 static Value getNumStocksI64(OpBuilder &b, Operation *op, Location loc) {
-  Value ns32 = op->getParentOfType<func::FuncOp>()
+  Value ns32 = op->getParentOfType<gpu::GPUFuncOp>()
                    .getBody().front().getArgument(1);
   return b.create<arith::ExtSIOp>(loc, b.getI64Type(), ns32);
 }
@@ -167,7 +179,7 @@ struct TimeLengthPattern : OpConversionPattern<TimeLengthOp> {
   LogicalResult
   matchAndRewrite(TimeLengthOp op, OpAdaptor /*a*/,
                   ConversionPatternRewriter &rewriter) const override {
-    Value tl32 = op->getParentOfType<func::FuncOp>()
+    Value tl32 = op->getParentOfType<gpu::GPUFuncOp>()
                      .getBody().front().getArgument(0);
     rewriter.replaceOpWithNewOp<arith::IndexCastOp>(
         op, rewriter.getIndexType(), tl32);
@@ -236,18 +248,18 @@ struct WindowedTempPattern : OpConversionPattern<WindowedTempOp> {
     int64_t stride;
 
     if (op.isSmem()) {
-      auto fn = op->getParentOfType<func::FuncOp>();
-      auto module = op->getParentOfType<ModuleOp>();
+      auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+      auto gpuModule = op->getParentOfType<gpu::GPUModuleOp>();
       auto tsAttr = getFuncTargetSpec(fn);
       int64_t blockSize = tsAttr ? (tsAttr.getWarpsPerCta() * 32) : 32;
       stride = blockSize;
 
       std::string name =
-          ("__smem_" + fn.getSymName() + "_" +
+          ("__smem_" + fn.getName() + "_" +
            llvm::Twine(smemCounter++)).str();
       {
         OpBuilder::InsertionGuard g(rewriter);
-        Block *modBody = module.getBody();
+        Block *modBody = &gpuModule.getBodyRegion().front();
         rewriter.setInsertionPoint(modBody, modBody->begin());
         rewriter.create<LLVM::GlobalOp>(
             loc, LLVM::LLVMArrayType::get(elemTy, N * blockSize), false,
@@ -454,26 +466,26 @@ struct ConvertKunGpuToLLVMPass
     typeConv.addTargetMaterialization(materialize);
 
     ConversionTarget target(*ctx);
-    target.addLegalDialect<func::FuncDialect, arith::ArithDialect,
-                           scf::SCFDialect, LLVM::LLVMDialect,
-                           gpu::GPUDialect>();
+    target.addLegalDialect<arith::ArithDialect, scf::SCFDialect,
+                           LLVM::LLVMDialect, gpu::GPUDialect>();
     target.addLegalOp<ModuleOp, UnrealizedConversionCastOp>();
     target.addIllegalOp<WindowedTempOp, TsGetOp, TsPutOp,
                         TimeLengthOp, StockIdOp, BlockStockCountOp>();
-    target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
+    // gpu.func is legal only after its signature has been converted from
+    // (...kunir.ts) to (...!llvm.ptr) by the FunctionOpInterface pattern
+    // we register below.
+    target.addDynamicallyLegalOp<gpu::GPUFuncOp>([&](gpu::GPUFuncOp op) {
       return typeConv.isSignatureLegal(op.getFunctionType()) &&
              typeConv.isLegal(&op.getBody());
     });
-    target.addDynamicallyLegalOp<func::ReturnOp>(
-        [&](func::ReturnOp op) { return typeConv.isLegal(op.getOperandTypes()); });
+    // gpu.return is void in our IR — always legal.
 
     WTDescMap descMap;
     int smemCounter = 0;
 
     RewritePatternSet patterns(ctx);
-    populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(patterns,
-                                                                   typeConv);
-    populateReturnOpTypeConversionPattern(patterns, typeConv);
+    populateFunctionOpInterfaceTypeConversionPattern<gpu::GPUFuncOp>(
+        patterns, typeConv);
     patterns.add<TimeLengthPattern, StockIdPattern, BlockStockCountPattern>(
         typeConv, ctx);
     patterns.add<WindowedTempPattern>(typeConv, ctx, descMap, smemCounter);
diff --git a/mlir/lib/KunGpu/Pipelines.cpp b/mlir/lib/KunGpu/Pipelines.cpp
new file mode 100644
index 0000000..63acdb1
--- /dev/null
+++ b/mlir/lib/KunGpu/Pipelines.cpp
@@ -0,0 +1,122 @@
+//===- Pipelines.cpp - kunir → LLVM lowering pipeline --------------------===//
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunGpu/Passes.h"
+#include "KunGpu/Pipelines.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/Passes.h"
+
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
+#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
+#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+
+namespace kungpu {
+
+void buildKunIrToLLVMPipeline(OpPassManager &pm) {
+  // pm's anchor is builtin.module; the kernels live one level down inside
+  // a gpu.module, so kunir.func / gpu.func passes must nest through it.
+
+  // ── 1–2.  Per-kunir.func passes (nested: gpu.module → kunir.func) ────
+  {
+    OpPassManager &gpuModPM = pm.nest<gpu::GPUModuleOp>();
+    gpuModPM.addNestedPass<::kunir::FuncOp>(::kunir::createKunIrToKunGpuPass());
+    gpuModPM.addNestedPass<::kunir::FuncOp>(
+        ::kungpu::createWindowedTempMemoryPlanningPass());
+  }
+
+  // ── 3.  kunir.func → gpu.func + kungpu ops → LLVM (module-level) ─────
+  pm.addPass(::kungpu::createConvertKunGpuToLLVMPass());
+
+  // ── 4.  LICM per gpu.func (nested: gpu.module → gpu.func) ────────────
+  {
+    OpPassManager &gpuModPM = pm.nest<gpu::GPUModuleOp>();
+    gpuModPM.addNestedPass<gpu::GPUFuncOp>(createLoopInvariantCodeMotionPass());
+  }
+
+  // ── 5–6.  Generic cleanup ─────────────────────────────────────────────
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+
+  // ── 7.  scf → cf (control flow) ───────────────────────────────────────
+  pm.addPass(createSCFToControlFlowPass());
+
+  // ── 8.  index / arith / cf → LLVM, in order.  These lower the device-
+  //       side body of gpu.func before gpu-to-nvvm, so the latter only
+  //       has to deal with gpu ops + the gpu.func wrapper.
+  pm.addPass(createConvertIndexToLLVMPass());
+  pm.addPass(createArithToLLVMConversionPass());
+  pm.addPass(createConvertControlFlowToLLVMPass());
+
+  // ── 9.  gpu.thread_id / block_id / block_dim → nvvm intrinsics, plus
+  //       gpu.func → llvm.func (with `nvvm.kernel`).
+  // indexBitwidth = 32 matches our function-signature i32 (no spurious
+  // sext/trunc around the i32 NVVM intrinsics).
+  {
+    ConvertGpuOpsToNVVMOpsOptions gpuOpts;
+    gpuOpts.indexBitwidth = 32;
+    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps(gpuOpts));
+  }
+
+  // ── 10.  func.func → llvm.func (host-side helpers, if any).
+  pm.addPass(createConvertFuncToLLVMPass());
+
+  // ── 11.  Resolve any leftover unrealized_conversion_casts ──────────
+  pm.addPass(createReconcileUnrealizedCastsPass());
+}
+
+namespace {
+
+// Lit-test wrapper: runs the whole pipeline as a single -kunir-to-llvm pass.
+struct KunIrToLLVMPass
+    : PassWrapper<KunIrToLLVMPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(KunIrToLLVMPass)
+  StringRef getArgument()    const override { return "kunir-to-llvm"; }
+  StringRef getDescription() const override {
+    return "Lower kunir.func down to the LLVM dialect (test wrapper)";
+  }
+  void getDependentDialects(DialectRegistry &registry) const override {
+    // Pulls in everything the nested pipeline will create / load.
+    registry.insert<::kungpu::KunGpuDialect, scf::SCFDialect,
+                    arith::ArithDialect, math::MathDialect,
+                    func::FuncDialect, gpu::GPUDialect,
+                    LLVM::LLVMDialect, NVVM::NVVMDialect,
+                    cf::ControlFlowDialect, index::IndexDialect>();
+  }
+
+  void runOnOperation() override {
+    OpPassManager pm("builtin.module");
+    buildKunIrToLLVMPipeline(pm);
+    if (failed(runPipeline(pm, getOperation())))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createKunIrToLLVMPass() {
+  return std::make_unique<KunIrToLLVMPass>();
+}
+
+void registerKunIrToLLVMPass() {
+  PassRegistration<KunIrToLLVMPass>();
+}
+
+} // namespace kungpu
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 35012aa..0e7316c 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -363,4 +363,7 @@ namespace kunir {
 void registerKunIrToKunGpuPass() {
   PassRegistration<LowerKunIrToKunGpuPass>();
 }
+std::unique_ptr<mlir::Pass> createKunIrToKunGpuPass() {
+  return std::make_unique<LowerKunIrToKunGpuPass>();
+}
 } // namespace kunir
diff --git a/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir b/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir
new file mode 100644
index 0000000..9620fe3
--- /dev/null
+++ b/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir
@@ -0,0 +1,60 @@
+// RUN: %kun-opt --kunir-to-llvm %s | %FileCheck %s
+//
+// End-to-end smoke test for the kunir-to-llvm pipeline:
+//   kunir-to-kungpu → memory-planning → convert-kungpu-to-llvm
+//   → LICM → canonicalize → cse → scf-to-cf
+//   → index/arith/cf to-llvm → convert-gpu-to-nvvm (indexBitwidth=32)
+//   → func-to-llvm → reconcile-unrealized-casts.
+//
+// We verify that no kunir/kungpu/scf/gpu *ops* survive in the function
+// body and that the original kunir.func becomes an llvm.func with
+// nvvm.kernel tagging, inside the same gpu.module.
+
+// CHECK-NOT: kunir.{{[a-z_.]+ }}
+// CHECK-NOT: kungpu.{{[a-z_.]+ }}
+// CHECK-NOT: scf.{{[a-z_]+}}
+// CHECK-NOT: gpu.{{[a-z_]+ }}
+
+// CHECK:       gpu.module @kungpu_kernels
+
+// llvm.func with the (i32 time_len, i32 num_stocks, ptr...) signature,
+// tagged as a kernel by convert-gpu-to-nvvm.
+// CHECK-LABEL: llvm.func @test_addsum
+// CHECK-SAME:    i32
+// CHECK-SAME:    i32
+// CHECK-SAME:    !llvm.ptr
+// CHECK-SAME:    !llvm.ptr
+// CHECK-SAME:    !llvm.ptr
+//
+// kunir-func metadata preserved as discardable attributes:
+// CHECK-SAME:    kungpu.input_names = ["a", "b"]
+// CHECK-SAME:    kungpu.output_names = ["sum"]
+// CHECK-SAME:    kungpu.target_spec = #kunir<target_spec{
+// CHECK-SAME:    nvvm.kernel
+
+// gpu.thread_id / block_id / block_dim are now NVVM intrinsics.
+// CHECK:       nvvm.read.ptx.sreg.tid.x
+// CHECK:       nvvm.read.ptx.sreg.ctaid.x
+// CHECK:       nvvm.read.ptx.sreg.ntid.x
+
+// Branch-based control flow from scf-to-cf:
+// CHECK-DAG:   llvm.br
+// CHECK-DAG:   llvm.cond_br
+
+// Lowered arithmetic + load/store from gmem:
+// CHECK-DAG:   llvm.fadd
+// CHECK-DAG:   llvm.getelementptr
+// CHECK-DAG:   llvm.load
+// CHECK-DAG:   llvm.store
+// CHECK:       llvm.return
+
+gpu.module @kungpu_kernels {
+  kunir.func @test_addsum(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"sum"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+}
diff --git a/mlir/test/kungpu/lower_to_llvm.mlir b/mlir/test/kungpu/lower_to_llvm.mlir
index 30db0e9..2b28772 100644
--- a/mlir/test/kungpu/lower_to_llvm.mlir
+++ b/mlir/test/kungpu/lower_to_llvm.mlir
@@ -1,23 +1,31 @@
 // RUN: %kun-opt --convert-kungpu-to-llvm %s | %FileCheck %s
+//
+// All kernels live in a single gpu.module — convert-kungpu-to-llvm rewrites
+// each kunir.func to a gpu.func (kernel) inside that gpu.module, with the
+// signature prepended by (i32 time_len, i32 num_stocks).
+
+gpu.module @kungpu_kernels {
 
 // =====================================================================
-// Smem global emitted by `test_windowed_smem` lands at module scope.
+// Smem global emitted by `test_windowed_smem` lands inside gpu.module.
 // =====================================================================
-// CHECK:       llvm.mlir.global internal @[[SMEM:__smem_test_windowed_smem_[0-9]+]]()
-// CHECK-SAME:  {addr_space = 3 : i32}
-// CHECK-SAME:  !llvm.array<{{[0-9]+}} x f32>
+// CHECK:       gpu.module @kungpu_kernels {
+// CHECK:         llvm.mlir.global internal @[[SMEM:__smem_test_windowed_smem_[0-9]+]]()
+// CHECK-SAME:    {addr_space = 3 : i32}
+// CHECK-SAME:    !llvm.array<{{[0-9]+}} x f32>
 
 
 // =====================================================================
 // Case 1 — gmem-only: signature change, time_length lowering, TxS GEPs.
 // =====================================================================
 //
-// CHECK-LABEL: func.func @test_copy(
+// CHECK-LABEL: gpu.func @test_copy(
 // CHECK-SAME:    %[[TL:[^:]+]]: i32,
 // CHECK-SAME:    %[[NS:[^:]+]]: i32,
 // CHECK-SAME:    %[[IN:[^:]+]]: !llvm.ptr,
 // CHECK-SAME:    %[[OUT:[^:]+]]: !llvm.ptr
-// kunir-func metadata is preserved as discardable attributes on func.func.
+// kernel attribute is set, kunir-func metadata preserved as discardables:
+// CHECK-SAME:    kernel
 // CHECK-SAME:    kungpu.input_names = ["in"]
 // CHECK-SAME:    kungpu.output_names = ["out"]
 // CHECK-SAME:    kungpu.target_spec = #kunir<target_spec{
@@ -53,7 +61,7 @@
 // CHECK:         %[[LIN2:.*]] = arith.addi %[[ROW2]],
 // CHECK:         %[[GEP2:.*]] = llvm.getelementptr %[[OUT]][%[[LIN2]]]
 // CHECK:         llvm.store %[[V]], %[[GEP2]]
-// CHECK:       return
+// CHECK:       gpu.return
 kunir.func @test_copy(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
@@ -75,7 +83,7 @@ kunir.func @test_copy(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
 //          circular put/get (no modulo).
 // =====================================================================
 //
-// CHECK-LABEL: func.func @test_windowed_local
+// CHECK-LABEL: gpu.func @test_windowed_local
 // CHECK-SAME:  i32
 // CHECK-SAME:  i32
 // CHECK-SAME:  !llvm.ptr
@@ -131,7 +139,6 @@ kunir.func @test_windowed_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1
   scf.for %t = %c0 to %tl step %c1 {
     %v  = kungpu.ts.get %in[%off0] : !kunir.ts<f32, inf> -> f32
     kungpu.ts.put %wt, %v : !kunir.ts<f32, 5>, f32
-    // dynamic offset (so we exercise the SSA pos→idx computation)
     %off_idx = arith.subi %t, %c0 : index
     %off_i32 = arith.index_cast %off_idx : index to i32
     %w  = kungpu.ts.get %wt[%off_i32] : !kunir.ts<f32, 5> -> f32
@@ -151,9 +158,8 @@ kunir.func @test_windowed_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1
 // =====================================================================
 //
 // Global has 5*128 = 640 elements (N=5, warps_per_cta=4 → K=128).
-// (already checked at the top: !llvm.array<{{[0-9]+}} x f32>)
 //
-// CHECK-LABEL: func.func @test_windowed_smem
+// CHECK-LABEL: gpu.func @test_windowed_smem
 // CHECK:       %[[RAW:.*]] = llvm.mlir.addressof @[[SMEM]] : !llvm.ptr<3>
 // CHECK:       %[[GEN:.*]] = llvm.addrspacecast %[[RAW]] : !llvm.ptr<3> to !llvm.ptr
 // CHECK:       %[[TID:.*]] = gpu.thread_id  x
@@ -198,7 +204,7 @@ kunir.func @test_windowed_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>
 // Case 4 — stock_id and block_stock_count lowering.
 // =====================================================================
 //
-// CHECK-LABEL: func.func @test_indexing
+// CHECK-LABEL: gpu.func @test_indexing
 // CHECK:       gpu.thread_id  x
 // CHECK-NEXT:  gpu.block_id   x
 // CHECK-NEXT:  gpu.block_dim  x
@@ -214,3 +220,5 @@ kunir.func @test_indexing(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
   %sum = arith.addi %sid, %bsc : index
   kunir.return
 }
+
+}  // gpu.module

From e4271d8122386e2c6c9325d51dd1e3ea1a6b5298 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 6 May 2026 22:54:41 -0700
Subject: [PATCH 07/59] ptx backend

---
 mlir/Tools/kun-opt/CMakeLists.txt |   6 ++
 mlir/Tools/kun-opt/kun-opt.cpp    |   1 +
 mlir/include/KunGpu/PtxBackend.h  |  59 +++++++++++
 mlir/lib/KunGpu/CMakeLists.txt    |  28 +++++
 mlir/lib/KunGpu/Pipelines.cpp     |  14 +--
 mlir/lib/KunGpu/PtxBackend.cpp    | 167 ++++++++++++++++++++++++++++++
 6 files changed, 268 insertions(+), 7 deletions(-)
 create mode 100644 mlir/include/KunGpu/PtxBackend.h
 create mode 100644 mlir/lib/KunGpu/PtxBackend.cpp

diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
index b3c8eee..fc2a2ca 100644
--- a/mlir/Tools/kun-opt/CMakeLists.txt
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -1,4 +1,9 @@
 set(LLVM_LINK_COMPONENTS Support)
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  list(APPEND LLVM_LINK_COMPONENTS
+    NVPTXCodeGen NVPTXAsmParser NVPTXDesc NVPTXInfo
+    Passes Target TargetParser Core IRReader CodeGen MC AsmPrinter)
+endif()
 
 add_llvm_executable(kun-opt kun-opt.cpp)
 
@@ -32,6 +37,7 @@ target_link_libraries(kun-opt PRIVATE
   MLIRIndexToLLVM
   MLIRGPUToNVVMTransforms
   MLIRNVVMDialect
+  MLIRConvertToLLVMPass
   MLIRReconcileUnrealizedCasts
 
   # Core MLIR libraries
diff --git a/mlir/Tools/kun-opt/kun-opt.cpp b/mlir/Tools/kun-opt/kun-opt.cpp
index 1e49143..5117474 100644
--- a/mlir/Tools/kun-opt/kun-opt.cpp
+++ b/mlir/Tools/kun-opt/kun-opt.cpp
@@ -51,6 +51,7 @@ int main(int argc, char **argv) {
   mlir::registerConvertIndexToLLVMPass();
   mlir::registerConvertFuncToLLVMPass();
   mlir::registerConvertGpuOpsToNVVMOpsPass();
+  mlir::registerConvertToLLVMPass();
   mlir::registerReconcileUnrealizedCastsPass();
 
   return mlir::asMainReturnCode(
diff --git a/mlir/include/KunGpu/PtxBackend.h b/mlir/include/KunGpu/PtxBackend.h
new file mode 100644
index 0000000..c446ae8
--- /dev/null
+++ b/mlir/include/KunGpu/PtxBackend.h
@@ -0,0 +1,59 @@
+//===- PtxBackend.h - Compile a kunir module all the way to PTX ---------===//
+//
+// Companion to `Pipelines.h` — runs the kunir-to-llvm dialect pipeline,
+// translates the resulting MLIR `gpu.module` to an `llvm::Module`, applies
+// the standard LLVM optimization pipeline (PassBuilder default
+// per-module pipeline, the same one mlir::makeOptimizingTransformer uses,
+// which is what upstream `gpu-module-to-binary` invokes via
+// `ModuleToObject::optimizeModule`), and finally emits PTX through
+// `NVPTXTargetMachine::addPassesToEmitFile(AssemblyFile)`.
+//
+// This is the single C++ entry point downstream `kunir_to_ptx` callers
+// (host runtime, JIT) should plumb to.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Support/LogicalResult.h"
+
+#include <string>
+
+namespace kungpu {
+
+struct PtxCompileOptions {
+  /// LLVM optimization level (0..3, mapped to OptimizationLevel::O0..O3).
+  unsigned optLevel = 3;
+
+  /// LLVM size level (0..2). 0 disables size opts; rarely needed for GPU.
+  unsigned sizeLevel = 0;
+
+  /// Target triple, SM architecture (e.g. "sm_80") and PTX feature string.
+  /// The defaults are widely supported; set them to the GPU actually targeted.
+  std::string targetTriple = "nvptx64-nvidia-cuda";
+  std::string targetCpu    = "sm_80";
+  std::string targetFeatures = "+ptx80";
+};
+
+/// End-to-end compile a `builtin.module` containing `gpu.module` kernels.
+///
+/// 1. Runs the kunir → LLVM dialect pipeline (see Pipelines.h).
+/// 2. Translates the LLVM-dialect module to llvm::Module via upstream
+///    `mlir::translateModuleToLLVMIR`.
+/// 3. Runs LLVM optimizations: `PassBuilder::buildPerModuleDefaultPipeline`
+///    at the chosen OptimizationLevel — this includes DCE, InstCombine,
+///    CSE, LICM, vectorization, etc.  The TargetMachine is the
+///    NVPTXTargetMachine for the requested SM, so target-specific
+///    pipeline tweaks fire too.
+/// 4. Emits PTX assembly via `TargetMachine::addPassesToEmitFile` with
+///    `CodeGenFileType::AssemblyFile`.
+///
+/// On success, `ptxOut` contains the PTX text.  On failure, returns
+/// `failure()` after reporting diagnostics through MLIR's standard
+/// channels.
+::mlir::LogicalResult compileKunIrToPtx(::mlir::ModuleOp module,
+                                          const PtxCompileOptions &options,
+                                          std::string &ptxOut);
+
+} // namespace kungpu
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index cd39c26..ebfe78b 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -1,9 +1,32 @@
+# PtxBackend.cpp pulls in the NVPTX LLVM backend; it compiles to an
+# empty TU and we don't add the NVPTX-only deps when LLVM was configured
+# without NVPTX in LLVM_TARGETS_TO_BUILD.
+set(_KUN_HAS_NVPTX OFF)
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(_KUN_HAS_NVPTX ON)
+endif()
+
+set(_KUN_KUNGPU_EXTRA_LIBS )
+if(_KUN_HAS_NVPTX)
+  list(APPEND _KUN_KUNGPU_EXTRA_LIBS
+    MLIRTargetLLVMIRExport
+    MLIRBuiltinToLLVMIRTranslation
+    MLIRLLVMToLLVMIRTranslation
+    MLIRNVVMToLLVMIRTranslation
+    MLIRExecutionEngineUtils
+  )
+else()
+  message(STATUS "[KunGpu] NVPTX target not in LLVM_TARGETS_TO_BUILD; "
+                  "kungpu::compileKunIrToPtx will be a stub.")
+endif()
+
 add_mlir_dialect_library(MLIRKunGpuDialect
   KunGpuDialect.cpp
   KunGpuOps.cpp
   KunGpuMemoryPlanning.cpp
   KunGpuToLLVM.cpp
   Pipelines.cpp
+  PtxBackend.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/mlir/include
@@ -32,4 +55,9 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   MLIRIndexToLLVM
   MLIRGPUToNVVMTransforms
   MLIRNVVMDialect
+  ${_KUN_KUNGPU_EXTRA_LIBS}
 )
+
+if(_KUN_HAS_NVPTX)
+  target_compile_definitions(obj.MLIRKunGpuDialect PRIVATE KUN_HAS_NVPTX=1)
+endif()
diff --git a/mlir/lib/KunGpu/Pipelines.cpp b/mlir/lib/KunGpu/Pipelines.cpp
index 63acdb1..6a87db7 100644
--- a/mlir/lib/KunGpu/Pipelines.cpp
+++ b/mlir/lib/KunGpu/Pipelines.cpp
@@ -67,13 +67,13 @@ void buildKunIrToLLVMPipeline(OpPassManager &pm) {
 
   // ── 9.  gpu.thread_id / block_id / block_dim → nvvm intrinsics, plus
   //       gpu.func → llvm.func (with `nvvm.kernel`).
-  // indexBitwidth = 32 matches our function-signature i32 (no spurious
-  // sext/trunc around the i32 NVVM intrinsics).
-  {
-    ConvertGpuOpsToNVVMOpsOptions gpuOpts;
-    gpuOpts.indexBitwidth = 32;
-    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps(gpuOpts));
-  }
+  // We deliberately match the default index bitwidth (64) used by the
+  // earlier arith/cf/index→LLVM passes — mixing 32 and 64 leaves
+  // i32 → index → i64 unrealized_conversion_cast chains that
+  // reconcile-unrealized-casts can't fold.  The downside is a single
+  // sext after each NVVM intrinsic, which LLVM's later DCE/InstCombine
+  // erases.
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
 
   // ── 10.  func.func → llvm.func (host-side helpers, if any).
   pm.addPass(createConvertFuncToLLVMPass());
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
new file mode 100644
index 0000000..db9b212
--- /dev/null
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -0,0 +1,167 @@
+//===- PtxBackend.cpp - Compile a kunir module all the way to PTX ------===//
+
+#include "KunGpu/PtxBackend.h"
+#include "KunGpu/Pipelines.h"
+
+#ifndef KUN_HAS_NVPTX
+
+// LLVM was built without the NVPTX target.  Provide a stub so callers
+// still link, but compiling actual PTX is unavailable.
+
+#include "mlir/IR/BuiltinOps.h"
+
+namespace kungpu {
+::mlir::LogicalResult compileKunIrToPtx(::mlir::ModuleOp module,
+                                          const PtxCompileOptions &,
+                                          std::string &) {
+  return module.emitError(
+      "compileKunIrToPtx: NVPTX target was not enabled in this LLVM build "
+      "(missing 'NVPTX' in LLVM_TARGETS_TO_BUILD).");
+}
+} // namespace kungpu
+
+#else  // KUN_HAS_NVPTX
+
+
+#include "mlir/ExecutionEngine/OptUtils.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Target/LLVMIR/Dialect/All.h"
+#include "mlir/Target/LLVMIR/Export.h"
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Host.h"
+
+using namespace mlir;
+
+namespace kungpu {
+
+namespace {
+
+/// Look up the LLVM target for the given triple, lazily initializing the
+/// NVPTX target & asmprinter once per process.
+static llvm::Expected<const llvm::Target *>
+lookupNvptxTarget(llvm::StringRef triple) {
+  static const bool kInit = [] {
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+    return true;
+  }();
+  (void)kInit;
+
+  std::string err;
+  const llvm::Target *t = llvm::TargetRegistry::lookupTarget(triple.str(), err);
+  if (!t)
+    return llvm::createStringError(llvm::inconvertibleErrorCode(), err);
+  return t;
+}
+
+} // namespace
+
+LogicalResult compileKunIrToPtx(ModuleOp module,
+                                 const PtxCompileOptions &options,
+                                 std::string &ptxOut) {
+  MLIRContext *ctx = module.getContext();
+
+  // ─── Step 1.  Run the kunir → LLVM dialect pipeline ────────────────
+  PassManager pm(ctx);
+  buildKunIrToLLVMPipeline(pm);
+  if (failed(pm.run(module)))
+    return module.emitError(
+        "compileKunIrToPtx: kunir-to-llvm pipeline failed");
+
+  // ─── Step 2.  Translate MLIR LLVM dialect → llvm::Module ──────────
+  // Make sure NVVM (and friends) know how to emit themselves to LLVM IR.
+  DialectRegistry registry;
+  registerAllToLLVMIRTranslations(registry);
+  ctx->appendDialectRegistry(registry);
+
+  llvm::LLVMContext llvmCtx;
+  std::unique_ptr<llvm::Module> llvmModule =
+      translateModuleToLLVMIR(module, llvmCtx);
+  if (!llvmModule)
+    return module.emitError(
+        "compileKunIrToPtx: translation to LLVM IR failed");
+
+  // ─── Step 3.  Build NVPTXTargetMachine ────────────────────────────
+  auto targetOrErr = lookupNvptxTarget(options.targetTriple);
+  if (!targetOrErr) {
+    llvm::handleAllErrors(targetOrErr.takeError(),
+                          [&](const llvm::ErrorInfoBase &eib) {
+                            module.emitError(
+                                "compileKunIrToPtx: NVPTX target lookup: ")
+                                << eib.message();
+                          });
+    return failure();
+  }
+  llvm::TargetOptions opts;
+  std::unique_ptr<llvm::TargetMachine> targetMachine{
+      (*targetOrErr)
+          ->createTargetMachine(llvm::Triple(options.targetTriple),
+                                options.targetCpu, options.targetFeatures,
+                                opts, /*RelocModel=*/std::nullopt,
+                                /*CodeModel=*/std::nullopt,
+                                llvm::CodeGenOptLevel::Aggressive)};
+  if (!targetMachine)
+    return module.emitError(
+        "compileKunIrToPtx: failed to create NVPTXTargetMachine");
+
+  llvmModule->setTargetTriple(llvm::Triple(options.targetTriple));
+  llvmModule->setDataLayout(targetMachine->createDataLayout());
+
+  // ─── Step 4.  Run LLVM PassBuilder default pipeline ───────────────
+  // This is the same entry point upstream `gpu-module-to-binary` uses
+  // (see ModuleToObject::optimizeModule → makeOptimizingTransformer).
+  // It builds the full new-PM per-module pipeline at the requested O level,
+  // which includes mem2reg, SROA, GVN, LICM, instcombine, DCE, vectorise,
+  // unroll, etc., plus NVPTX-specific tweaks (the TargetMachine is passed
+  // to PassBuilder so its pipeline-tuning hooks fire).
+  if (auto err = makeOptimizingTransformer(options.optLevel,
+                                            options.sizeLevel,
+                                            targetMachine.get())(
+          llvmModule.get())) {
+    llvm::handleAllErrors(std::move(err),
+                          [&](const llvm::ErrorInfoBase &eib) {
+                            module.emitError(
+                                "compileKunIrToPtx: LLVM opt pipeline: ")
+                                << eib.message();
+                          });
+    return failure();
+  }
+
+  // ─── Step 5.  Emit PTX (AssemblyFile) via legacy codegen pipeline ─
+  // This is the standard path used by `mlir::ModuleToObject`: the legacy
+  // PassManager is required because `addPassesToEmitFile` is a legacy
+  // codegen API.  The new PM ran in step 4 — codegen still uses legacy.
+  llvm::SmallString<0> ptxBuf;
+  {
+    llvm::raw_svector_ostream stream(ptxBuf);
+    llvm::buffer_ostream bufStream(stream);
+    llvm::legacy::PassManager codegenPM;
+    if (targetMachine->addPassesToEmitFile(
+            codegenPM, bufStream, /*DwoOut=*/nullptr,
+            llvm::CodeGenFileType::AssemblyFile)) {
+      return module.emitError(
+          "compileKunIrToPtx: NVPTXTargetMachine cannot emit assembly");
+    }
+    codegenPM.run(*llvmModule);
+  }
+  ptxOut.assign(ptxBuf.begin(), ptxBuf.end());
+  return success();
+}
+
+} // namespace kungpu
+
+#endif // KUN_HAS_NVPTX

From cbd4517733d4662dce9e854ec64d7725d621f7e3 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 7 May 2026 01:01:01 -0700
Subject: [PATCH 08/59] JIT

---
 .gitignore                             |   1 +
 mlir/CMakeLists.txt                    |  10 +
 mlir/Tools/kun-opt/CMakeLists.txt      |   2 +-
 mlir/include/KunCuda/Runtime.h         |  88 +++++++
 mlir/include/KunGpu/PtxBackend.h       |  46 +++-
 mlir/lib/CMakeLists.txt                |   2 +
 mlir/lib/KunCuda/CMakeLists.txt        |  37 +++
 mlir/lib/KunCuda/Runtime.cpp           | 139 +++++++++++
 mlir/lib/KunGpu/CMakeLists.txt         |  35 +--
 mlir/lib/KunGpu/PtxBackend.cpp         | 212 ++++++++++++++--
 mlir/lib/Python/CMakeLists.txt         |  56 +++++
 mlir/lib/Python/MlirBinding.cpp        | 326 +++++++++++++++++++++++++
 mlir/test/python/test_kun_mlir.py      | 109 +++++++++
 mlir/test/python/test_windowed_temp.py | 152 ++++++++++++
 14 files changed, 1167 insertions(+), 48 deletions(-)
 create mode 100644 mlir/include/KunCuda/Runtime.h
 create mode 100644 mlir/lib/KunCuda/CMakeLists.txt
 create mode 100644 mlir/lib/KunCuda/Runtime.cpp
 create mode 100644 mlir/lib/Python/CMakeLists.txt
 create mode 100644 mlir/lib/Python/MlirBinding.cpp
 create mode 100644 mlir/test/python/test_kun_mlir.py
 create mode 100644 mlir/test/python/test_windowed_temp.py

diff --git a/.gitignore b/.gitignore
index d9c82ef..06b9740 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .vscode/*
+.claude/*
 *.pyc
 build/*
 tests/cpp/generated/*
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 5d7c7ea..b3a1598 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -10,6 +10,16 @@ include_directories(${KUN_MLIR_BINARY_DIR}/include)
 
 add_definitions(${LLVM_DEFINITIONS})
 
+# The MLIR backend requires LLVM to have been built with the NVPTX target
+# — we emit PTX and load cubins.  Fail loudly here so the diagnostic is
+# clear rather than getting cryptic linker errors later.
+if(NOT "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  message(FATAL_ERROR
+    "KunQuant MLIR backend requires LLVM with NVPTX target.  Reconfigure "
+    "your LLVM build with -DLLVM_TARGETS_TO_BUILD=\"X86;NVPTX\" "
+    "(currently: '${LLVM_TARGETS_TO_BUILD}').")
+endif()
+
 # Enable gc-sections so kun-opt pulls in only the code it actually uses
 # from MLIR static libraries, keeping the binary small.
 if(NOT MSVC)
diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
index fc2a2ca..9adc9de 100644
--- a/mlir/Tools/kun-opt/CMakeLists.txt
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(LLVM_LINK_COMPONENTS Support)
 if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
   list(APPEND LLVM_LINK_COMPONENTS
-    NVPTXCodeGen NVPTXAsmParser NVPTXDesc NVPTXInfo
+    NVPTXCodeGen NVPTXDesc NVPTXInfo
     Passes Target TargetParser Core IRReader CodeGen MC AsmPrinter)
 endif()
 
diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
new file mode 100644
index 0000000..9d6f1c4
--- /dev/null
+++ b/mlir/include/KunCuda/Runtime.h
@@ -0,0 +1,88 @@
+//===- Runtime.h - kun_cuda runtime: ExecutableData + Executable -------===//
+//
+// Pure runtime piece, decoupled from the MLIR compiler and the Python
+// binding.  The compiler produces an `ExecutableData` (cubin + metadata),
+// the runtime turns that into a loaded `Executable` (cuModuleLoadData +
+// cuModuleGetFunction) and launches it.
+//
+// This header forward-declares the two opaque CUDA Driver types it
+// stores by pointer (CUmodule / CUfunction) so consumers don't need to
+// pull in <cuda.h>.  These typedefs match cuda.h's verbatim — they have
+// been ABI-stable for two decades.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+extern "C" {
+typedef struct CUmod_st  *CUmodule;
+typedef struct CUfunc_st *CUfunction;
+} // extern "C"
+
+namespace kun_cuda {
+
+/// Everything needed to load + launch a compiled kunir kernel.  No CUDA
+/// types — kept POD so the compiler library can populate it without
+/// depending on cuda.h.
+struct ExecutableData {
+  std::vector<char> cubin;             ///< raw cubin bytes (ELF)
+  std::string kernelName;              ///< symbol name in the cubin
+  std::vector<std::string> inputNames; ///< from kungpu.input_names
+  std::vector<std::string> outputNames;///< from kungpu.output_names
+  int64_t warpsPerCta = 1;             ///< from kungpu.target_spec
+  int64_t vectorSize  = 1;             ///< stocks-per-thread, from target_spec
+};
+
+/// RAII wrapper around a loaded cubin + resolved kernel function.
+/// Construction calls `cuModuleLoadData` + `cuModuleGetFunction` in the
+/// CUDA context current on the calling thread (which must already exist).
+/// Destruction calls `cuModuleUnload`.
+class Executable {
+public:
+  /// Throws std::runtime_error on driver errors or missing CUDA context.
+  /// Takes an rvalue — caller `std::move`s the data in.
+  explicit Executable(ExecutableData &&data);
+  ~Executable();
+
+  // Non-copyable, non-movable — wrap in unique_ptr / shared_ptr if you
+  // need transferable ownership.
+  Executable(const Executable &)            = delete;
+  Executable &operator=(const Executable &) = delete;
+  Executable(Executable &&)                 = delete;
+  Executable &operator=(Executable &&)      = delete;
+
+  const ExecutableData &data() const noexcept { return data_; }
+  const std::string &kernelName() const noexcept { return data_.kernelName; }
+  const std::vector<std::string> &inputNames()  const noexcept { return data_.inputNames; }
+  const std::vector<std::string> &outputNames() const noexcept { return data_.outputNames; }
+  int64_t warpsPerCta() const noexcept { return data_.warpsPerCta; }
+  int64_t vectorSize()  const noexcept { return data_.vectorSize; }
+
+  /// Launch the kernel.  `timeLength` / `numStocks` describe the kernel
+  /// invocation as a whole — the caller is responsible for verifying all
+  /// device buffers are sized accordingly (TS layout: `(t, s)` at
+  /// `ptr + (t*numStocks + s) * sizeof(T)`).
+  ///
+  /// `args` must supply every name in `inputNames ++ outputNames`; order
+  /// doesn't matter and unknown keys are ignored.  Grid configuration:
+  ///
+  ///   block_x = warps_per_cta * 32
+  ///   grid_x  = ceil_div(numStocks, block_x * vector_size)
+  ///
+  /// Synchronous on the default stream.  Throws std::runtime_error on
+  /// validation or driver errors.
+  void launch(int64_t timeLength, int64_t numStocks,
+              const std::vector<std::pair<std::string, uintptr_t>> &args);
+
+private:
+  ExecutableData data_;
+  CUmodule cuModule_ = nullptr;
+  CUfunction cuFunc_ = nullptr;
+};
+
+} // namespace kun_cuda
diff --git a/mlir/include/KunGpu/PtxBackend.h b/mlir/include/KunGpu/PtxBackend.h
index c446ae8..4b26bea 100644
--- a/mlir/include/KunGpu/PtxBackend.h
+++ b/mlir/include/KunGpu/PtxBackend.h
@@ -15,10 +15,13 @@
 
 #pragma once
 
+#include "KunCuda/Runtime.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
 
 #include <string>
+#include <vector>
 
 namespace kungpu {
 
@@ -33,7 +36,9 @@ struct PtxCompileOptions {
   /// caller should set it to whatever GPU it actually targets.
   std::string targetTriple = "nvptx64-nvidia-cuda";
   std::string targetCpu    = "sm_80";
-  std::string targetFeatures = "+ptx80";
+  /// Empty by default — let LLVM pick a PTX version compatible with the
+  /// chosen `targetCpu` (sm_80 → ptx70 etc., sm_120 → ptx87 etc.).
+  std::string targetFeatures;
 };
 
 /// End-to-end compile a `builtin.module` containing `gpu.module` kernels.
@@ -56,4 +61,43 @@ ::mlir::LogicalResult compileKunIrToPtx(::mlir::ModuleOp module,
                                           const PtxCompileOptions &options,
                                           std::string &ptxOut);
 
+struct PtxToCubinOptions {
+  /// SM architecture to assemble for, e.g. "sm_80".
+  std::string gpuArch = "sm_80";
+  /// PTX ISA version hint for ptxas.  NOTE: compilePtxToCubin does not
+  /// read this field yet, so ptxas always uses its default.
+  std::string ptxasVersion;
+  /// Extra arguments forwarded verbatim to ptxas (e.g. {"-O3"}).
+  std::vector<std::string> extraArgs;
+  /// Optional override for the ptxas binary path.  When empty or missing
+  /// on disk we search PATH, CUDA_HOME / CUDA_PATH / CUDA_TOOLKIT_PATH and
+  /// /usr/local/cuda/bin (modelled on upstream NVPTXSerializer's search).
+  std::string ptxasPath;
+};
+
+/// Assemble PTX text into a CUBIN binary.  This is the same operation
+/// upstream `NVPTXSerializer::compileToBinary` performs internally —
+/// shell out to `ptxas` — exposed as a standalone helper because the
+/// upstream class isn't part of the public C++ API.
+///
+/// On success, `cubinOut` contains the raw CUBIN bytes.
+::mlir::LogicalResult compilePtxToCubin(::llvm::StringRef ptx,
+                                          const PtxToCubinOptions &options,
+                                          std::vector<char> &cubinOut,
+                                          std::string &errorMsg);
+
+/// All-in-one: run the kunir → LLVM dialect pipeline, translate to LLVM
+/// IR, optimize, emit PTX, assemble to CUBIN, and pull the kernel
+/// metadata (name + I/O argument names + target-spec fields) off the
+/// lowered function so callers can hand the result to
+/// `kun_cuda::Executable` without re-walking the IR.
+///
+/// The module is mutated in-place by the pipeline (same as
+/// `compileKunIrToPtx`).
+::mlir::LogicalResult
+compileKunIrToExecutable(::mlir::ModuleOp module,
+                          const PtxCompileOptions &ptxOpts,
+                          const PtxToCubinOptions &cubinOpts,
+                          ::kun_cuda::ExecutableData &out);
+
 } // namespace kungpu
diff --git a/mlir/lib/CMakeLists.txt b/mlir/lib/CMakeLists.txt
index 66b4ff9..f2c6f50 100644
--- a/mlir/lib/CMakeLists.txt
+++ b/mlir/lib/CMakeLists.txt
@@ -1,2 +1,4 @@
 add_subdirectory(KunIr)
+add_subdirectory(KunCuda)
 add_subdirectory(KunGpu)
+add_subdirectory(Python)
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
new file mode 100644
index 0000000..7e1b995
--- /dev/null
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -0,0 +1,37 @@
+# libKunCudaRuntime.so — pure CUDA runtime, decoupled from the MLIR
+# compiler library and the Python binding.  Holds the ExecutableData
+# struct, the Executable class (cuModuleLoadData / cuLaunchKernel) and
+# nothing else.
+
+add_library(KunCudaRuntime SHARED Runtime.cpp)
+
+# Project-wide compile flags set -fvisibility=hidden + inlines-hidden to
+# minimise the size of MLIR static libs.  This shared runtime needs to
+# export its public class methods so downstream .so's (kun_mlir, host
+# runners, …) can resolve them at load time.
+#
+# We also put the .so next to the kun_mlir Python module, mirroring the
+# existing project pattern (INSTALL_RPATH=$ORIGIN at the top level): all
+# co-distributed shared libs live in one directory and find each other
+# as siblings.
+set_target_properties(KunCudaRuntime PROPERTIES
+    CXX_VISIBILITY_PRESET default
+    VISIBILITY_INLINES_HIDDEN OFF
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/mlir/python")
+
+target_include_directories(KunCudaRuntime PUBLIC
+    "${PROJECT_SOURCE_DIR}/mlir/include")
+
+# CUDA Driver API (cuda.h + libcuda stub).  Resolved via CMake's
+# standard FindCUDAToolkit — pass -DCUDAToolkit_ROOT=<path> or set
+# $CUDA_PATH if the toolkit is not in a default search location.
+# The stub is only used at link time; the runtime loader resolves
+# libcuda.so.1 from the NVIDIA driver.  This dep is PRIVATE:
+# downstream consumers see only Runtime.h, which never includes
+# <cuda.h>.
+find_package(CUDAToolkit REQUIRED)
+target_link_libraries(KunCudaRuntime PRIVATE CUDA::cuda_driver)
+
+# Unlike the Python module, this library leaves no symbols undefined:
+# it uses no CPython API, and the CUDA stub linked above resolves every
+# libcuda symbol.  The global -Wl,-z,defs check passes; nothing to strip.
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
new file mode 100644
index 0000000..961e058
--- /dev/null
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -0,0 +1,171 @@
+//===- Runtime.cpp - kun_cuda::Executable implementation ---------------===//
+
+#include "KunCuda/Runtime.h"
+
+#include <cuda.h>
+
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+
+namespace kun_cuda {
+
+namespace {
+
+/// Throw std::runtime_error("<what>: <driver message>") unless `r` is
+/// CUDA_SUCCESS.
+void checkCu(CUresult r, const char *what) {
+  if (r == CUDA_SUCCESS)
+    return;
+  const char *s = nullptr;
+  cuGetErrorString(r, &s);
+  throw std::runtime_error(std::string(what) + ": " +
+                            (s ? s : "unknown CUDA error"));
+}
+
+/// Join `v` with ", " for use in diagnostics.
+std::string joinNames(const std::vector<std::string> &v) {
+  std::string r;
+  for (size_t i = 0; i < v.size(); ++i) {
+    if (i)
+      r += ", ";
+    r += v[i];
+  }
+  return r;
+}
+
+} // namespace
+
+Executable::Executable(ExecutableData &&data) : data_(std::move(data)) {
+  // Require a context to already be current on the calling thread —
+  // the caller's job to set one up (e.g. by allocating any device memory
+  // through cupy / cudaMalloc).
+  CUcontext cur = nullptr;
+  checkCu(cuCtxGetCurrent(&cur), "cuCtxGetCurrent");
+  if (!cur) {
+    throw std::runtime_error(
+        "kun_cuda::Executable: no current CUDA context.  Initialise the "
+        "driver first (e.g. allocate any device memory via cupy or "
+        "cudaMalloc) before constructing an Executable.");
+  }
+  // An empty vector's data() may be null; reject it up front with a clear
+  // message instead of handing the driver an image it cannot parse.
+  if (data_.cubin.empty())
+    throw std::runtime_error("kun_cuda::Executable: cubin is empty");
+  checkCu(cuModuleLoadData(&cuModule_, data_.cubin.data()),
+           "cuModuleLoadData");
+  // ~Executable() does not run when the constructor throws, so unload the
+  // module by hand if the kernel symbol cannot be resolved.
+  CUresult getFn =
+      cuModuleGetFunction(&cuFunc_, cuModule_, data_.kernelName.c_str());
+  if (getFn != CUDA_SUCCESS) {
+    cuModuleUnload(cuModule_);
+    cuModule_ = nullptr;
+    checkCu(getFn, "cuModuleGetFunction");
+  }
+}
+
+Executable::~Executable() {
+  // Best-effort unload; we deliberately don't propagate driver errors out
+  // of a destructor.
+  if (cuModule_)
+    cuModuleUnload(cuModule_);
+}
+
+void Executable::launch(
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args) {
+  // 1.  Resolve full ordered argument list (inputs first, then outputs).
+  std::vector<std::string> ordered;
+  ordered.reserve(data_.inputNames.size() + data_.outputNames.size());
+  for (auto &n : data_.inputNames)
+    ordered.push_back(n);
+  for (auto &n : data_.outputNames)
+    ordered.push_back(n);
+  if (ordered.empty())
+    throw std::runtime_error("kun_cuda::launch: kernel has no I/O args");
+
+  // 2.  Resolve each name to its device pointer — list is small, linear
+  //     scan is fine.
+  auto findArg = [&](const std::string &n) -> const uintptr_t * {
+    for (auto &kv : args)
+      if (kv.first == n)
+        return &kv.second;
+    return nullptr;
+  };
+
+  std::vector<uintptr_t> resolved;
+  resolved.reserve(ordered.size());
+  for (auto &n : ordered) {
+    auto *a = findArg(n);
+    if (!a) {
+      throw std::runtime_error("kun_cuda::launch: missing argument '" + n +
+                                "' (kernel expects: " + joinNames(ordered) +
+                                ")");
+    }
+    resolved.push_back(*a);
+  }
+
+  // 3.  Caller is responsible for shape consistency; we only check that
+  //     (T, S) fit in i32 since the kernel signature uses i32 i32.
+  if (timeLength > std::numeric_limits<int32_t>::max() ||
+      numStocks  > std::numeric_limits<int32_t>::max() ||
+      timeLength < 0 || numStocks < 0) {
+    throw std::runtime_error(
+        "kun_cuda::launch: time_length / num_stocks out of i32 range "
+        "(kernel signature uses i32, i32)");
+  }
+
+  // 4.  Validate the launch geometry before narrowing it: a non-positive
+  //     or oversized warps_per_cta would wrap in the unsigned cast, and a
+  //     non-positive vector_size would make the ceil-div below divide by
+  //     zero (or by a huge wrapped value).
+  if (data_.warpsPerCta <= 0 ||
+      data_.warpsPerCta > std::numeric_limits<unsigned>::max() / 32)
+    throw std::runtime_error(
+        "kun_cuda::launch: warps_per_cta out of range (got " +
+        std::to_string(data_.warpsPerCta) + ")");
+  if (data_.vectorSize <= 0 ||
+      data_.vectorSize > std::numeric_limits<int32_t>::max())
+    throw std::runtime_error(
+        "kun_cuda::launch: vector_size out of range (got " +
+        std::to_string(data_.vectorSize) + ")");
+
+  // Zero stocks means zero blocks, which cuLaunchKernel rejects; there is
+  // nothing to compute, so succeed without touching the device.
+  if (numStocks == 0)
+    return;
+
+  // 5.  Build kernel argv: [i32 time_len, i32 num_stocks, ptr0, ptr1, ...]
+  int32_t timeLenI32   = static_cast<int32_t>(timeLength);
+  int32_t numStocksI32 = static_cast<int32_t>(numStocks);
+  std::vector<CUdeviceptr> ptrs(resolved.size());
+  for (size_t i = 0; i < resolved.size(); ++i)
+    ptrs[i] = static_cast<CUdeviceptr>(resolved[i]);
+
+  std::vector<void *> argPtrs;
+  argPtrs.reserve(2 + ptrs.size());
+  argPtrs.push_back(&timeLenI32);
+  argPtrs.push_back(&numStocksI32);
+  for (auto &p : ptrs)
+    argPtrs.push_back(&p);
+
+  // 6.  block / grid.
+  unsigned blockX = static_cast<unsigned>(data_.warpsPerCta * 32);
+  uint64_t stocksPerBlock =
+      static_cast<uint64_t>(blockX) * static_cast<uint64_t>(data_.vectorSize);
+  unsigned gridX = static_cast<unsigned>(
+      (static_cast<uint64_t>(numStocks) + stocksPerBlock - 1) / stocksPerBlock);
+
+  // sharedMemBytes = 0 — shared memory is static (declared as
+  // `llvm.mlir.global addr_space=3` and allocated by ptxas into the
+  // cubin's `.shared` section); the dynamic-smem launch parameter does
+  // not apply.
+  checkCu(cuLaunchKernel(cuFunc_, gridX, 1, 1, blockX, 1, 1,
+                           /*sharedMemBytes=*/0, /*stream=*/nullptr,
+                           argPtrs.data(), nullptr),
+           "cuLaunchKernel");
+  checkCu(cuCtxSynchronize(), "cuCtxSynchronize");
+}
+
+} // namespace kun_cuda
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index ebfe78b..e8a35e3 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -1,25 +1,3 @@
-# PtxBackend.cpp pulls in the NVPTX LLVM backend; it compiles to an
-# empty TU and we don't add the NVPTX-only deps when LLVM was configured
-# without NVPTX in LLVM_TARGETS_TO_BUILD.
-set(_KUN_HAS_NVPTX OFF)
-if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
-  set(_KUN_HAS_NVPTX ON)
-endif()
-
-set(_KUN_KUNGPU_EXTRA_LIBS )
-if(_KUN_HAS_NVPTX)
-  list(APPEND _KUN_KUNGPU_EXTRA_LIBS
-    MLIRTargetLLVMIRExport
-    MLIRBuiltinToLLVMIRTranslation
-    MLIRLLVMToLLVMIRTranslation
-    MLIRNVVMToLLVMIRTranslation
-    MLIRExecutionEngineUtils
-  )
-else()
-  message(STATUS "[KunGpu] NVPTX target not in LLVM_TARGETS_TO_BUILD; "
-                  "kungpu::compileKunIrToPtx will be a stub.")
-endif()
-
 add_mlir_dialect_library(MLIRKunGpuDialect
   KunGpuDialect.cpp
   KunGpuOps.cpp
@@ -55,9 +33,12 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   MLIRIndexToLLVM
   MLIRGPUToNVVMTransforms
   MLIRNVVMDialect
-  ${_KUN_KUNGPU_EXTRA_LIBS}
-)
 
-if(_KUN_HAS_NVPTX)
-  target_compile_definitions(obj.MLIRKunGpuDialect PRIVATE KUN_HAS_NVPTX=1)
-endif()
+  # LLVM IR translation + PTX emission
+  MLIRTargetLLVMIRExport
+  MLIRBuiltinToLLVMIRTranslation
+  MLIRLLVMToLLVMIRTranslation
+  MLIRNVVMToLLVMIRTranslation
+  MLIRGPUToLLVMIRTranslation
+  MLIRExecutionEngineUtils
+)
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
index db9b212..0cb3dcb 100644
--- a/mlir/lib/KunGpu/PtxBackend.cpp
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -1,33 +1,153 @@
 //===- PtxBackend.cpp - Compile a kunir module all the way to PTX ------===//
 
 #include "KunGpu/PtxBackend.h"
+#include "KunGpu/KunGpuUtils.h"
 #include "KunGpu/Pipelines.h"
+#include "KunIr/KunIrAttrs.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 
-#ifndef KUN_HAS_NVPTX
-
-// LLVM was built without the NVPTX target.  Provide a stub so callers
-// still link, but compiling actual PTX is unavailable.
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
 
-#include "mlir/IR/BuiltinOps.h"
+#include <cstdlib>
 
 namespace kungpu {
-::mlir::LogicalResult compileKunIrToPtx(::mlir::ModuleOp module,
-                                          const PtxCompileOptions &,
-                                          std::string &) {
-  return module.emitError(
-      "compileKunIrToPtx: NVPTX target was not enabled in this LLVM build "
-      "(missing 'NVPTX' in LLVM_TARGETS_TO_BUILD).");
+
+namespace {
+
+/// Search for `ptxas` in the user-provided override, then PATH, then
+/// CUDA_HOME / CUDA_PATH / standard CUDA install locations.  Mirrors the
+/// search the upstream NVPTXSerializer does.
+static llvm::ErrorOr<std::string> findPtxas(::llvm::StringRef override) {
+  using namespace llvm::sys;
+  if (!override.empty() && fs::exists(override))
+    return std::string(override);
+  if (auto p = findProgramByName("ptxas"))
+    return p;
+  for (const char *envName : {"CUDA_HOME", "CUDA_PATH", "CUDA_TOOLKIT_PATH"}) {
+    if (const char *envVal = std::getenv(envName)) {
+      llvm::SmallString<256> p(envVal);
+      path::append(p, "bin", "ptxas");
+      if (fs::exists(p))
+        return std::string(p);
+    }
+  }
+  if (fs::exists("/usr/local/cuda/bin/ptxas"))
+    return std::string("/usr/local/cuda/bin/ptxas");
+  return std::make_error_code(std::errc::no_such_file_or_directory);
 }
-} // namespace kungpu
 
-#else  // KUN_HAS_NVPTX
+} // namespace
+
+/// Assemble `ptx` into a cubin by writing it to a temporary file and
+/// running `ptxas` on it.  On failure returns failure() and describes
+/// the problem (including the ptxas log, if any) in `errorMsg`.
+::mlir::LogicalResult compilePtxToCubin(::llvm::StringRef ptx,
+                                          const PtxToCubinOptions &opts,
+                                          std::vector<char> &cubinOut,
+                                          std::string &errorMsg) {
+  using namespace llvm;
 
+  auto ptxasOrErr = findPtxas(opts.ptxasPath);
+  if (!ptxasOrErr) {
+    errorMsg = "compilePtxToCubin: ptxas not found "
+                "(looked in CUDA_HOME / CUDA_PATH / PATH / "
+                "/usr/local/cuda/bin); set ptxas_path or CUDA_HOME.";
+    return ::mlir::failure();
+  }
 
+  // Write PTX to a temp file.
+  SmallString<128> ptxPath, cubinPath, logPath;
+  if (auto ec = sys::fs::createTemporaryFile("kun-ptx", "ptx", ptxPath)) {
+    errorMsg = "compilePtxToCubin: createTemporaryFile(ptx): " + ec.message();
+    return ::mlir::failure();
+  }
+  if (auto ec = sys::fs::createTemporaryFile("kun-cubin", "cubin", cubinPath)) {
+    sys::fs::remove(ptxPath);
+    errorMsg = "compilePtxToCubin: createTemporaryFile(cubin): " + ec.message();
+    return ::mlir::failure();
+  }
+  if (auto ec = sys::fs::createTemporaryFile("kun-ptxlog", "log", logPath)) {
+    sys::fs::remove(ptxPath); sys::fs::remove(cubinPath);
+    errorMsg = "compilePtxToCubin: createTemporaryFile(log): " + ec.message();
+    return ::mlir::failure();
+  }
+
+  // Auto-cleanup.
+  struct CleanupOnExit {
+    SmallVectorImpl<char> &p; ~CleanupOnExit() { sys::fs::remove(p); }
+  };
+  CleanupOnExit c1{ptxPath}, c2{cubinPath}, c3{logPath};
+
+  {
+    std::error_code ec;
+    raw_fd_ostream os(ptxPath, ec, sys::fs::OF_None);
+    if (ec) {
+      errorMsg = "compilePtxToCubin: writing PTX: " + ec.message();
+      return ::mlir::failure();
+    }
+    os << ptx;
+    // Close explicitly and consume any write error: ~raw_fd_ostream calls
+    // report_fatal_error on an unhandled error (e.g. disk full), which
+    // would abort the host process instead of failing this call.
+    os.close();
+    if (os.has_error()) {
+      errorMsg = "compilePtxToCubin: writing PTX: " + os.error().message();
+      os.clear_error();
+      return ::mlir::failure();
+    }
+  }
+
+  // Build argv:
+  //   ptxas --gpu-name=<sm_xx> -o <cubin> <ptx> [extra...]
+  std::string gpuArg = "--gpu-name=" + opts.gpuArch;
+  std::string outArg = "-o";
+  SmallVector<StringRef> argv = {*ptxasOrErr, gpuArg, outArg, cubinPath, ptxPath};
+  for (const auto &a : opts.extraArgs) argv.push_back(a);
+
+  std::string errBuf;
+  std::optional<StringRef> redirects[] = {std::nullopt,        // stdin
+                                            StringRef(logPath),  // stdout
+                                            StringRef(logPath)}; // stderr
+  int rc = sys::ExecuteAndWait(*ptxasOrErr, argv, /*Env=*/std::nullopt,
+                                 redirects, /*SecondsToWait=*/0,
+                                 /*MemoryLimit=*/0, &errBuf);
+  if (rc != 0) {
+    auto logBuf = MemoryBuffer::getFile(logPath);
+    errorMsg = "compilePtxToCubin: ptxas failed (exit " + std::to_string(rc) + ")";
+    if (!errBuf.empty()) errorMsg += ": " + errBuf;
+    if (logBuf && (*logBuf)->getBufferSize() > 0) {
+      errorMsg += "\n--- ptxas log ---\n";
+      errorMsg += (*logBuf)->getBuffer().str();
+    }
+    return ::mlir::failure();
+  }
+
+  auto cubinBuf = MemoryBuffer::getFile(cubinPath);
+  if (!cubinBuf) {
+    errorMsg = "compilePtxToCubin: cannot read cubin: " +
+                  cubinBuf.getError().message();
+    return ::mlir::failure();
+  }
+  StringRef bytes = (*cubinBuf)->getBuffer();
+  cubinOut.assign(bytes.begin(), bytes.end());
+  return ::mlir::success();
+}
+
+} // namespace kungpu
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Pass/PassManager.h"
-#include "mlir/Target/LLVMIR/Dialect/All.h"
+#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 
 #include "llvm/IR/LLVMContext.h"
@@ -83,14 +191,30 @@ LogicalResult compileKunIrToPtx(ModuleOp module,
         "compileKunIrToPtx: kunir-to-llvm pipeline failed");
 
   // ─── Step 2.  Translate MLIR LLVM dialect → llvm::Module ──────────
-  // Make sure NVVM (and friends) know how to emit themselves to LLVM IR.
+  // Register only the translations we actually need (builtin + LLVM +
+  // NVVM + GPU); the upstream `registerAllToLLVMIRTranslations` would
+  // pull in ArmSVE / SPIR-V / etc. and force us to link them all.
   DialectRegistry registry;
-  registerAllToLLVMIRTranslations(registry);
+  registerBuiltinDialectTranslation(registry);
+  registerLLVMDialectTranslation(registry);
+  registerNVVMDialectTranslation(registry);
+  registerGPUDialectTranslation(registry);
   ctx->appendDialectRegistry(registry);
 
+  // Mirror upstream `gpu-module-to-binary` / NVPTXSerializer: translate
+  // the gpu.module (the kernel container) rather than the outer
+  // builtin.module — only the gpu.module's body is meant to become LLVM
+  // IR.  We just take the first gpu.module; multi-module support can
+  // come later.
+  gpu::GPUModuleOp gpuMod;
+  module.walk([&](gpu::GPUModuleOp m) { gpuMod = m; return WalkResult::interrupt(); });
+  if (!gpuMod)
+    return module.emitError(
+        "compileKunIrToPtx: no gpu.module found after lowering");
+
   llvm::LLVMContext llvmCtx;
   std::unique_ptr<llvm::Module> llvmModule =
-      translateModuleToLLVMIR(module, llvmCtx);
+      translateModuleToLLVMIR(gpuMod, llvmCtx);
   if (!llvmModule)
     return module.emitError(
         "compileKunIrToPtx: translation to LLVM IR failed");
@@ -162,6 +286,60 @@ LogicalResult compileKunIrToPtx(ModuleOp module,
   return success();
 }
 
-} // namespace kungpu
+//===----------------------------------------------------------------------===//
+// All-in-one: kunir → cubin + metadata
+//===----------------------------------------------------------------------===//
+
+LogicalResult compileKunIrToExecutable(ModuleOp module,
+                                        const PtxCompileOptions &ptxOpts,
+                                        const PtxToCubinOptions &cubinOpts,
+                                        ::kun_cuda::ExecutableData &out) {
+  // 1.  Run the kunir → LLVM dialect pipeline + emit PTX.  This mutates
+  //     `module` in place so the discardable kunir metadata ends up on
+  //     the lowered llvm.func.
+  std::string ptx;
+  if (failed(compileKunIrToPtx(module, ptxOpts, ptx)))
+    return failure();
+
+  // 2.  Find the lowered kernel function (the one carrying our
+  //     kungpu.* discardable attributes) and pull metadata off it.
+  LLVM::LLVMFuncOp kernel;
+  module.walk([&](LLVM::LLVMFuncOp f) {
+    if (f->hasAttr(kFuncTargetSpecAttr)) {
+      kernel = f;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  if (!kernel)
+    return module.emitError(
+        "compileKunIrToExecutable: cannot find a llvm.func with kungpu "
+        "metadata in the lowered module");
+
+  // Collect into a fresh local so `out` is never left half-populated on
+  // failure and a reused `out` does not accumulate stale names.
+  ::kun_cuda::ExecutableData result;
+  result.kernelName = kernel.getSymName().str();
+  if (auto inNames = getFuncInputNames(kernel)) {
+    for (auto a : inNames)
+      result.inputNames.push_back(llvm::cast<StringAttr>(a).str());
+  }
+  if (auto outNames = getFuncOutputNames(kernel)) {
+    for (auto a : outNames)
+      result.outputNames.push_back(llvm::cast<StringAttr>(a).str());
+  }
+  if (auto ts = getFuncTargetSpec(kernel)) {
+    result.warpsPerCta = ts.getWarpsPerCta();
+    result.vectorSize  = ts.getVectorSize();
+  }
 
-#endif // KUN_HAS_NVPTX
+  // 3.  Assemble PTX → CUBIN.
+  std::string err;
+  if (failed(compilePtxToCubin(ptx, cubinOpts, result.cubin, err)))
+    return module.emitError("compileKunIrToExecutable: ") << err;
+
+  out = std::move(result);
+  return success();
+}
+
+} // namespace kungpu
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
new file mode 100644
index 0000000..8901448
--- /dev/null
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -0,0 +1,56 @@
+# Python binding module for the kunir → PTX → CUBIN → launch flow.
+#
+# pybind11 is added at the top-level KunQuant CMakeLists, so the
+# `pybind11_add_module` macro is already in scope.
+
+# pybind11 modules have undefined symbols (PyObject_*, PyExc_*, …) that
+# the Python interpreter resolves at module-load time.  MLIR's
+# HandleLLVMOptions adds `-Wl,-z,defs` globally, which is incompatible
+# with that policy.  Strip it locally for this subdirectory.
+string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
+string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}")
+
+pybind11_add_module(kun_mlir SHARED MlirBinding.cpp)
+
+# Co-locate the binding with libKunCudaRuntime.so so $ORIGIN
+# (CMAKE_INSTALL_RPATH set at top level) resolves dependencies as
+# siblings — same pattern as KunRunner ↔ KunRuntime.
+set_target_properties(kun_mlir PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/mlir/python")
+
+target_link_libraries(kun_mlir PRIVATE
+  # Compiler side
+  MLIRKunIrDialect
+  MLIRKunGpuDialect
+  MLIRKunIrToKunGpu
+
+  MLIRIR
+  MLIRParser
+  MLIRPass
+  MLIRSupport
+  MLIRTransforms
+  MLIRTransformUtils
+
+  MLIRArithDialect
+  MLIRControlFlowDialect
+  MLIRFuncDialect
+  MLIRGPUDialect
+  MLIRIndexDialect
+  MLIRLLVMDialect
+  MLIRMathDialect
+  MLIRNVVMDialect
+  MLIRSCFDialect
+
+  # Pulled in by the kunir-to-llvm pipeline that runs inside lower_to_ptx.
+  MLIRSCFToControlFlow
+  MLIRControlFlowToLLVM
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
+  MLIRIndexToLLVM
+  MLIRGPUToNVVMTransforms
+  MLIRReconcileUnrealizedCasts
+
+  # Runtime side — owns cuda.h + libcuda; we just hand it ExecutableData
+  # and call launch().
+  KunCudaRuntime
+)
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
new file mode 100644
index 0000000..c3b7d6a
--- /dev/null
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -0,0 +1,326 @@
+//===- MlirBinding.cpp - Python bindings for the kunir → PTX flow ------===//
+//
+// Exposes:
+//   kun_mlir.parse(text)            → ModuleOp     (loads MLIR text)
+//   ModuleOp.to_string() / __str__  → str          (dumps the module)
+//   kun_mlir.lower_to_ptx(mod, …)   → str          (kunir → PTX)
+//   kun_mlir.ptx_to_cubin(ptx, …)   → bytes        (PTX → CUBIN via ptxas)
+//   kun_mlir.compile(mod, …)        → Executable   (kunir → loadable kernel)
+//   Executable.launch({name: cupy}) → None         (cuLaunchKernel + sync)
+//
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "mlir/Parser/Parser.h"
+#include "mlir/Support/LLVM.h"
+
+// Dialect registrations
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+
+#include "KunCuda/Runtime.h"
+#include "KunGpu/KunGpuDialect.h"
+#include "KunGpu/PtxBackend.h"
+#include "KunIr/KunIrDialect.h"
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace py = pybind11;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// MLIR module wrapper
+//===----------------------------------------------------------------------===//
+
+class PyModule {
+public:
+  PyModule()
+      : ctx(std::make_unique<mlir::MLIRContext>(makeRegistry(),
+                                                  mlir::MLIRContext::Threading::DISABLED)) {
+    ctx->loadAllAvailableDialects();
+  }
+
+  static mlir::DialectRegistry makeRegistry() {
+    mlir::DialectRegistry registry;
+    registry.insert<mlir::arith::ArithDialect>();
+    registry.insert<mlir::cf::ControlFlowDialect>();
+    registry.insert<mlir::func::FuncDialect>();
+    registry.insert<mlir::gpu::GPUDialect>();
+    registry.insert<mlir::index::IndexDialect>();
+    registry.insert<mlir::LLVM::LLVMDialect>();
+    registry.insert<mlir::math::MathDialect>();
+    registry.insert<mlir::NVVM::NVVMDialect>();
+    registry.insert<mlir::scf::SCFDialect>();
+    registry.insert<kunir::KunIrDialect>();
+    registry.insert<kungpu::KunGpuDialect>();
+    return registry;
+  }
+
+  static std::unique_ptr<PyModule> parse(const std::string &text) {
+    auto pm = std::make_unique<PyModule>();
+    pm->module = mlir::parseSourceString<mlir::ModuleOp>(text, pm->ctx.get());
+    if (!pm->module)
+      throw std::runtime_error("kun_mlir.parse: failed to parse MLIR text");
+    return pm;
+  }
+
+  std::string toString() const {
+    std::string out;
+    llvm::raw_string_ostream os(out);
+    module.get().print(os);
+    os.flush();
+    return out;
+  }
+
+  std::unique_ptr<mlir::MLIRContext> ctx;
+  mlir::OwningOpRef<mlir::ModuleOp> module;
+};
+
+//===----------------------------------------------------------------------===//
+// One-shot helpers
+//===----------------------------------------------------------------------===//
+
+static std::string pyLowerToPtx(PyModule &pm, const std::string &targetCpu,
+                                  const std::string &targetTriple,
+                                  const std::string &targetFeatures,
+                                  unsigned optLevel,
+                                  unsigned sizeLevel) {
+  kungpu::PtxCompileOptions opts;
+  if (!targetCpu.empty())      opts.targetCpu      = targetCpu;
+  if (!targetTriple.empty())   opts.targetTriple   = targetTriple;
+  if (!targetFeatures.empty()) opts.targetFeatures = targetFeatures;
+  opts.optLevel  = optLevel;
+  opts.sizeLevel = sizeLevel;
+
+  std::string ptx;
+  if (failed(kungpu::compileKunIrToPtx(pm.module.get(), opts, ptx)))
+    throw std::runtime_error("kun_mlir.lower_to_ptx failed");
+  return ptx;
+}
+
+static py::bytes pyPtxToCubin(const std::string &ptx,
+                                const std::string &gpuArch,
+                                const std::vector<std::string> &extraArgs,
+                                const std::string &ptxasPath) {
+  kungpu::PtxToCubinOptions opts;
+  if (!gpuArch.empty())   opts.gpuArch   = gpuArch;
+  if (!ptxasPath.empty()) opts.ptxasPath = ptxasPath;
+  opts.extraArgs = extraArgs;
+
+  std::vector<char> cubin;
+  std::string errMsg;
+  if (failed(kungpu::compilePtxToCubin(ptx, opts, cubin, errMsg)))
+    throw std::runtime_error(errMsg.empty() ? "kun_mlir.ptx_to_cubin failed"
+                                              : errMsg);
+  return py::bytes(cubin.data(), cubin.size());
+}
+
+//===----------------------------------------------------------------------===//
+// pybind glue: read CAI dict → CudaArrayInfo, build (name, ptr) arg list
+//===----------------------------------------------------------------------===//
+
+/// Read CAI from one Python GPU array.  Validates dtype, ndim and the dense
+/// row-major layout; shape is returned for cross-array consistency checks.
+struct CudaArrayInfo {
+  uintptr_t ptr;        ///< device pointer taken from CAI data[0]
+  int64_t timeLength;   ///< shape[0]
+  int64_t numStocks;    ///< shape[1]
+};
+
+static CudaArrayInfo readCudaArray(py::handle obj,
+                                     const std::string &paramName) {
+  if (!py::hasattr(obj, "__cuda_array_interface__"))
+    throw std::runtime_error(
+        "'" + paramName +
+        "' has no __cuda_array_interface__ — pass a CuPy ndarray (or any "
+        "GPU array implementing CAI).");
+  py::dict cai = obj.attr("__cuda_array_interface__").cast<py::dict>();
+  uintptr_t ptr = cai["data"].cast<py::tuple>()[0].cast<uintptr_t>();
+  auto shape = cai["shape"].cast<std::vector<int64_t>>();
+  if (shape.size() != 2)
+    throw std::runtime_error("'" + paramName + "' must be 2-D (got " +
+                             std::to_string(shape.size()) + "-D)");
+
+  std::string typestr = cai["typestr"].cast<std::string>();
+  if (typestr != "<f4" && typestr != "|f4" && typestr != "=f4")
+    throw std::runtime_error("'" + paramName +
+                             "' must be float32 little-endian (typestr "
+                             "'<f4'); got '" +
+                             typestr + "'");
+
+  // CAI strides are in bytes; absent/None means C-contiguous.  The kernel
+  // addresses gmem as t * num_stocks + s, so reject any other layout.
+  if (cai.contains("strides") && !cai["strides"].is_none()) {
+    auto st = cai["strides"].cast<std::vector<int64_t>>();
+    if (st.size() != 2 || (shape[1] > 1 && st[1] != 4) ||
+        (shape[0] > 1 && st[0] != 4 * shape[1]))
+      throw std::runtime_error("'" + paramName + "' must be C-contiguous");
+  }
+  return CudaArrayInfo{ptr, shape[0], shape[1]};
+}
+
+/// Walk the user's {name → cuda_array} dict, validate that every named
+/// arg is present and that all arrays share the same (timeLength,
+/// numStocks).  Returns the common (T, S) plus a flat list of (name, ptr)
+/// pairs.  Anything binding-side (CAI parsing, dtype/ndim/shape checks)
+/// happens here so the runtime stays a thin launcher.
+struct CollectedArgs {
+  int64_t timeLength;
+  int64_t numStocks;
+  std::vector<std::pair<std::string, uintptr_t>> args;
+};
+
+static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
+                                   py::dict pyArgs) {
+  std::vector<std::string> ordered;
+  ordered.reserve(exe.inputNames().size() + exe.outputNames().size());
+  for (auto &n : exe.inputNames())  ordered.push_back(n);
+  for (auto &n : exe.outputNames()) ordered.push_back(n);
+  if (ordered.empty())
+    throw std::runtime_error("launch: kernel has no I/O arguments");
+
+  CollectedArgs out;
+  out.args.reserve(ordered.size());
+
+  bool first = true;
+  for (const std::string &name : ordered) {
+    py::object key = py::str(name);
+    if (!pyArgs.contains(key)) {
+      std::string expected;
+      for (size_t i = 0; i < ordered.size(); ++i) {
+        if (i) expected += ", ";
+        expected += ordered[i];
+      }
+      throw std::runtime_error("launch: missing argument '" + name +
+                                "' (kernel expects: " + expected + ")");
+    }
+    CudaArrayInfo info = readCudaArray(pyArgs[key], name);
+    if (first) {
+      out.timeLength = info.timeLength;
+      out.numStocks  = info.numStocks;
+      first = false;
+    } else if (info.timeLength != out.timeLength ||
+                 info.numStocks  != out.numStocks) {
+      std::stringstream ss;
+      ss << "launch: shape mismatch on '" << name << "': expected ("
+         << out.timeLength << ", " << out.numStocks
+         << ") matching the first array, got ("
+         << info.timeLength << ", " << info.numStocks << ")";
+      throw std::runtime_error(ss.str());
+    }
+    out.args.emplace_back(name, info.ptr);
+  }
+  return out;
+}
+
+/// Compile `pm`'s module into a kun_cuda::Executable; throws on failure.
+static std::unique_ptr<kun_cuda::Executable>
+pyCompile(PyModule &pm, const std::string &targetCpu,
+          const std::string &targetTriple, const std::string &targetFeatures,
+          unsigned optLevel, unsigned sizeLevel, const std::string &ptxasPath) {
+  kungpu::PtxCompileOptions popts;
+  if (!targetCpu.empty())      popts.targetCpu      = targetCpu;
+  if (!targetTriple.empty())   popts.targetTriple   = targetTriple;
+  if (!targetFeatures.empty()) popts.targetFeatures = targetFeatures;
+  popts.optLevel  = optLevel;
+  popts.sizeLevel = sizeLevel;
+
+  kungpu::PtxToCubinOptions copts;
+  copts.gpuArch   = targetCpu.empty() ? "sm_80" : targetCpu;
+  if (!ptxasPath.empty()) copts.ptxasPath = ptxasPath; // "" keeps the default
+
+  kun_cuda::ExecutableData data;
+  if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), popts, copts,
+                                                data)))
+    throw std::runtime_error("kun_mlir.compile failed");
+  return std::make_unique<kun_cuda::Executable>(std::move(data));
+}
+
+} // namespace
+
+PYBIND11_MODULE(kun_mlir, m) {
+  m.doc() = "Bindings for the KunQuant MLIR compiler (kunir → PTX → CUBIN "
+             "→ launch).";
+
+  py::class_<PyModule>(m, "ModuleOp")
+      .def("to_string", &PyModule::toString,
+            "Return the textual MLIR form of the module.")
+      .def("__str__",  &PyModule::toString)
+      .def("__repr__", [](const PyModule &m) {
+        return "<kun_mlir.ModuleOp>\n" + m.toString();
+      });
+
+  m.def("parse", &PyModule::parse, py::arg("text"),
+         "Parse an MLIR text fragment into a ModuleOp.");
+
+  m.def("lower_to_ptx", &pyLowerToPtx,
+         py::arg("module"),
+         py::arg("target_cpu")     = "sm_80",
+         py::arg("target_triple")  = "nvptx64-nvidia-cuda",
+         py::arg("target_features") = "",
+         py::arg("opt_level")      = 3u,
+         py::arg("size_level")     = 0u,
+         "Lower kunir → PTX text.  Returns a Python str.");
+
+  m.def("ptx_to_cubin", &pyPtxToCubin,
+         py::arg("ptx"),
+         py::arg("gpu_arch")   = "sm_80",
+         py::arg("extra_args") = std::vector<std::string>{},
+         py::arg("ptxas_path") = "",
+         "Assemble PTX → CUBIN via ptxas.  Returns bytes.");
+
+  py::class_<kun_cuda::Executable>(m, "Executable")
+      .def_property_readonly("kernel_name",   &kun_cuda::Executable::kernelName)
+      .def_property_readonly("input_names",   &kun_cuda::Executable::inputNames)
+      .def_property_readonly("output_names",  &kun_cuda::Executable::outputNames)
+      .def_property_readonly("warps_per_cta", &kun_cuda::Executable::warpsPerCta)
+      .def_property_readonly("vector_size",   &kun_cuda::Executable::vectorSize)
+      .def_property_readonly("cubin",
+            [](const kun_cuda::Executable &e) {
+              const auto &b = e.data().cubin;
+              return py::bytes(b.data(), b.size());
+            })
+      .def("launch",
+            [](kun_cuda::Executable &e, py::dict pyArgs) {
+              auto c = collectArgs(e, pyArgs);
+              e.launch(c.timeLength, c.numStocks, c.args);
+            },
+            py::arg("args"),
+            "Launch the kernel.  `args` is a {name → cupy_array} dict; "
+            "names must match input_names ++ output_names.  All arrays "
+            "must be float32, 2-D, shape (time_length, num_stocks) — TS "
+            "layout — and reside on the GPU.");
+
+  m.def("compile", &pyCompile,
+         py::arg("module"),
+         py::arg("target_cpu")     = "sm_80",
+         py::arg("target_triple")  = "nvptx64-nvidia-cuda",
+         py::arg("target_features") = "",
+         py::arg("opt_level")      = 3u,
+         py::arg("size_level")     = 0u,
+         py::arg("ptxas_path")     = "",
+         "Compile a kunir module all the way to a loaded Executable "
+         "(kunir → LLVM dialect → LLVM IR → PTX → CUBIN → cuModuleLoad).");
+}
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
new file mode 100644
index 0000000..d4e3300
--- /dev/null
+++ b/mlir/test/python/test_kun_mlir.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""End-to-end test for the `kun_mlir` Python bindings.
+
+  parse → to_string → lower_to_ptx → ptx_to_cubin → compile → launch
+
+Usage:
+    PATH=$CUDA_BIN:$PATH PYTHONPATH=<build>/mlir/python \
+        kun python test_kun_mlir.py [--target sm_120]
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+import textwrap
+
+
+SAMPLE_KUNIR = textwrap.dedent("""
+gpu.module @kungpu_kernels {
+  kunir.func @test_addsum(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"sum"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+}
+""").strip()
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default="sm_120",
+                     help="GPU compute capability (e.g. sm_120, sm_90, sm_80)")
+    ap.add_argument("-T", "--time-length", type=int, default=64)
+    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    args = ap.parse_args()
+
+    import kun_mlir
+    import cupy as cp
+    import numpy as np
+
+    # Force-initialise the CUDA driver + create the primary context now,
+    # so subsequent kun_mlir.compile() / Executable.launch() find one.
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    print(f"=== parse + to_string ===")
+    mod = kun_mlir.parse(SAMPLE_KUNIR)
+    text = mod.to_string()
+    assert "kunir.func @test_addsum" in text, "module text missing kunir.func"
+    print("ok — module round-trips through parse/to_string")
+
+    print()
+    print(f"=== lower_to_ptx (target={args.target}, O3) ===")
+    ptx = kun_mlir.lower_to_ptx(mod, target_cpu=args.target, opt_level=3)
+    assert "test_addsum" in ptx
+    print(f"ok — produced {len(ptx)} bytes of PTX text")
+
+    print()
+    print(f"=== ptx_to_cubin ({args.target}) ===")
+    cubin = kun_mlir.ptx_to_cubin(ptx, gpu_arch=args.target)
+    assert isinstance(cubin, bytes) and cubin[:4] == b"\x7fELF"
+    print(f"ok — produced {len(cubin)} bytes of CUBIN (ELF magic verified)")
+
+    print()
+    print(f"=== compile (all-in-one) ===")
+    # `mod` was already mutated by lower_to_ptx above; re-parse so compile()
+    # gets a fresh kunir.func module.
+    mod2 = kun_mlir.parse(SAMPLE_KUNIR)
+    exe = kun_mlir.compile(mod2, target_cpu=args.target, opt_level=3)
+    print(f"  kernel_name   = {exe.kernel_name}")
+    print(f"  input_names   = {exe.input_names}")
+    print(f"  output_names  = {exe.output_names}")
+    print(f"  warps_per_cta = {exe.warps_per_cta}")
+    print(f"  vector_size   = {exe.vector_size}")
+    print(f"  cubin bytes   = {len(exe.cubin)}")
+    assert exe.kernel_name == "test_addsum"
+    assert exe.input_names  == ["a", "b"]
+    assert exe.output_names == ["sum"]
+    assert exe.warps_per_cta == 4
+    assert exe.vector_size   == 1
+
+    print()
+    print(f"=== launch ({args.time_length} × {args.num_stocks}) ===")
+    T, S = args.time_length, args.num_stocks
+    rng = np.random.default_rng(0)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    a   = cp.asarray(a_h)
+    b   = cp.asarray(b_h)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    exe.launch({"a": a, "b": b, "sum": out})
+    cp.cuda.runtime.deviceSynchronize()
+    out_h = cp.asnumpy(out)
+    expected = a_h + b_h
+    if not np.allclose(out_h, expected, atol=1e-5):
+        diff = np.abs(out_h - expected)
+        print(f"FAIL — max abs diff {diff.max()}, "
+                f"argmax @ {np.unravel_index(diff.argmax(), diff.shape)}",
+                file=sys.stderr)
+        return 1
+    print(f"ok — output matches a + b on every (t, s) cell")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
new file mode 100644
index 0000000..2904d50
--- /dev/null
+++ b/mlir/test/python/test_windowed_temp.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""End-to-end test for the windowed_temp lowering across both placements
+the memory-planning pass can choose:
+
+    sum_window(a, b, N)[t][s] = sum_{i=0}^{N-1} ( a[t-i][s] + b[t-i][s] )
+
+is compiled twice — once with a small N (fits in shared memory) and once
+with a large N (spills to local memory) — and each run is checked
+against a numpy reference.  Memory-planning's per-block budget for our
+target_spec is
+
+    bytes / windowed_temp = N * (warps_per_cta * 32) * vector_size * 4
+                          = N * 128 * 4         (warps_per_cta = 4)
+                          = 512 * N
+
+so with smem_size = 49152 the cutoff is N ≤ 96 → smem, N > 96 → local.
+We pick N = 5 and N = 200 to bracket that.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+import textwrap
+
+import numpy as np
+
+
+def build_ir(N: int, warps_per_cta: int = 4, smem_size: int = 49152) -> str:
+    """A minimal kunir program that computes a rolling sum of (a + b)."""
+    return textwrap.dedent(f"""
+gpu.module @kungpu_kernels {{
+  kunir.func @sum_window(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {{%a = "a", %b = "b"}}
+      outputs {{"out"}}
+      target {{occupancy = 1, warps_per_cta = {warps_per_cta}, smem_size = {smem_size}, vector_size = 1}}
+      -> !kunir.ts<f32, 1> {{
+    %c = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    %w = kunir.windowed_output %c [length = {N}] : !kunir.ts<f32, 1> -> !kunir.ts<f32, {N}>
+    %total = kunir.for_each_back_window
+        (%w : !kunir.ts<f32, {N}>) [window = {N}]
+        (%cur : !kunir.ts<f32, 1>)
+        -> (!kunir.ts<f32, 1>) {{
+      %s = kunir.reduce_add %cur : !kunir.ts<f32, 1>
+      kunir.yield %s : !kunir.ts<f32, 1>
+    }}
+    kunir.return %total : !kunir.ts<f32, 1>
+  }}
+}}
+""").strip()
+
+
+def reference_sum_window(a: np.ndarray, b: np.ndarray, N: int) -> np.ndarray:
+    """CPU rolling-sum of (a + b) with window length N along axis 0.
+    Output for t < N-1 is undefined; we fill nan there and skip it
+    when comparing.
+    """
+    c = a + b
+    T, S = c.shape
+    out = np.empty((T, S), dtype=np.float32)
+    out[:N - 1] = np.nan
+    cumsum = np.cumsum(c, axis=0, dtype=np.float64)  # higher-precision ref
+    out[N - 1] = cumsum[N - 1]
+    if T > N:
+        out[N:] = (cumsum[N:] - cumsum[:-N])
+    return out
+
+
+def assert_planning(N: int, warps_per_cta: int, smem_size: int,
+                     expected: str) -> None:
+    """Sanity-check our N choices against the memory-planning formula
+    so the test self-documents which placement it exercises."""
+    bytes_per_buf = N * warps_per_cta * 32 * 1 * 4   # vector_size=1, f32
+    fits_smem = bytes_per_buf <= smem_size
+    actual = "smem" if fits_smem else "local"
+    if actual != expected:
+        raise AssertionError(
+            f"N={N} ({bytes_per_buf} bytes) would land in '{actual}', "
+            f"but the test wanted '{expected}' (smem budget {smem_size}).")
+
+
+def run_one(N: int, expected_placement: str, target: str,
+              warps_per_cta: int = 4, smem_size: int = 49152,
+              T: int = 64, S: int = 2048) -> int:
+    import kun_mlir
+    import cupy as cp
+
+    print(f"=== N = {N}  ({expected_placement} temp buffer) ===")
+    assert_planning(N, warps_per_cta, smem_size, expected_placement)
+
+    ir = build_ir(N, warps_per_cta=warps_per_cta, smem_size=smem_size)
+    mod = kun_mlir.parse(ir)
+    exe = kun_mlir.compile(mod, target_cpu=target, opt_level=3)
+    print(f"  kernel={exe.kernel_name}  warps_per_cta={exe.warps_per_cta}  "
+           f"vector_size={exe.vector_size}  cubin={len(exe.cubin)} bytes")
+
+    # Random input.  T must be > N so we have at least one valid window.
+    if T <= N:
+        T = N + 32
+    rng = np.random.default_rng(0)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    a   = cp.asarray(a_h)
+    b   = cp.asarray(b_h)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    exe.launch({"a": a, "b": b, "out": out})
+    cp.cuda.runtime.deviceSynchronize()
+    out_h = cp.asnumpy(out)
+
+    expected = reference_sum_window(a_h, b_h, N)
+
+    # Only the t >= N-1 region is well-defined.
+    diff = np.abs(out_h[N - 1:] - expected[N - 1:])
+    max_abs = float(diff.max())
+    # Tolerance scales with N: each output is a sum of N IID N(0,1)
+    # samples, so its magnitude is ~sqrt(N), and float32 ULP-style error
+    # accumulates roughly like N * eps.
+    atol = max(1e-3, 5e-7 * N)
+    if max_abs > atol:
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL: max |Δ| = {max_abs:.3e} > {atol:.0e} at "
+               f"{idx} (out_h={out_h[N-1:][idx]:.6g} vs "
+               f"expected={expected[N-1:][idx]:.6g})", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e}, "
+           f"shape={(T - N + 1, S)} validated cells)")
+    return 0
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("-T", "--time-length", type=int, default=64)
+    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    args = ap.parse_args()
+
+    import cupy as cp
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    rc = 0
+    rc |= run_one(N=5,   expected_placement="smem",
+                   target=args.target, T=args.time_length, S=args.num_stocks)
+    print()
+    rc |= run_one(N=200, expected_placement="local",
+                   target=args.target,
+                   T=max(args.time_length, 256), S=args.num_stocks)
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 443b85ed6e3185ac7d677585915817065a4791c6 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 7 May 2026 01:38:12 -0700
Subject: [PATCH 09/59] fix unaligned stocks

---
 mlir/lib/KunGpu/KunGpuToLLVM.cpp    | 89 +++++++++++++++++++++++++----
 mlir/test/kungpu/lower_to_llvm.mlir | 21 ++++++-
 mlir/test/python/test_kun_mlir.py   | 56 +++++++++++-------
 3 files changed, 131 insertions(+), 35 deletions(-)

diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 59a7400..175a4aa 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -85,14 +85,48 @@ struct WTDesc {
 };
 using WTDescMap = llvm::DenseMap<Value, WTDesc>;
 
+//===----------------------------------------------------------------------===//
+// Helper: stock_id = blockIdx.x * blockDim.x + threadIdx.x  (index-typed)
+// Defined here so phase 1 (`convertFuncSignature` below) can reuse it
+// for the active-thread guard, in addition to the conversion patterns.
+//===----------------------------------------------------------------------===//
+
+static Value emitStockId(OpBuilder &b, Location loc, Type idxTy) {
+  Value tid  = b.create<gpu::ThreadIdOp>(loc, idxTy, gpu::Dimension::x);
+  Value bid  = b.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::x);
+  Value bdim = b.create<gpu::BlockDimOp>(loc, idxTy, gpu::Dimension::x);
+  return b.create<arith::AddIOp>(
+      loc, b.create<arith::MulIOp>(loc, bid, bdim), tid);
+}
+
 //===----------------------------------------------------------------------===//
 // Phase 1: kunir.func → func.func (signature only)
 //===----------------------------------------------------------------------===//
 
-static void convertFuncSignature(kunir::FuncOp fn) {
-  auto *ctx = fn.getContext();
+static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
+  auto *ctx  = fn.getContext();
   Location loc = fn.getLoc();
   auto i32Ty = IntegerType::get(ctx, 32);
+  auto idxTy = IndexType::get(ctx);
+
+  // We only support vector_size = 1 right now.  When vector_size > 1 a
+  // single thread handles `vector_size` consecutive stocks; if those
+  // straddle the num_stocks boundary, the kernel either has to:
+  //   - clamp the lane index to min(base + k, num_stocks - 1) on
+  //     every gmem load (safe re-read), and per-lane predicate the
+  //     gmem stores to skip the out-of-range cells;
+  //   - or refuse non-aligned num_stocks at launch time.
+  // TODO(vector_size>1): implement the clamp scheme above and remove
+  // this check.  See discussion in KunGpuToLLVM history for why
+  // PTX vector loads can't mask individual lanes.
+  auto tsAttr = fn.getTargetSpecAttr();
+  int64_t vectorSize = tsAttr ? tsAttr.getVectorSize() : 1;
+  if (vectorSize != 1) {
+    return fn.emitError("convert-kungpu-to-llvm: vector_size = ")
+            << vectorSize << " not yet supported (only vector_size = 1). "
+            << "TODO: implement clamp + per-lane store predicate for the "
+            << "tail block.";
+  }
 
   FunctionType oldFT = fn.getFunctionTypeTyped();
   SmallVector<Type> newArgTypes = {i32Ty, i32Ty};
@@ -128,20 +162,52 @@ static void convertFuncSignature(kunir::FuncOp fn) {
     r.erase();
   }
   fn.erase();
+
+  // ── Tail-block guard ────────────────────────────────────────────────
+  // grid_x is sized as ceil(num_stocks / block_x), so the last block
+  // contains threads with stock_id ≥ num_stocks.  Without a guard those
+  // threads do gmem GEPs at out-of-bounds addresses (UB).  Compute
+  // stock_id at the top of the kernel and wrap the original body in
+  // `scf.if (stock_id < num_stocks)`.  Inactive threads fall through to
+  // gpu.return without touching gmem; their smem column is sized to the
+  // block (not num_stocks), so leaving it uninitialised is safe.
+  //
+  // For vector_size = 1 this is the entire fix; vector_size > 1 is
+  // gated above.
+  Operation *gpuRet = entry.getTerminator();
+  Operation *origFirst = entry.empty() ? nullptr : &entry.front();
+  if (!origFirst || origFirst == gpuRet) {
+    // Empty body — nothing to guard.
+    return success();
+  }
+
+  OpBuilder pb(ctx);
+  pb.setInsertionPointToStart(&entry);
+  Value sidIdx = emitStockId(pb, loc, idxTy);
+  Value sidI32 = pb.create<arith::IndexCastOp>(loc, i32Ty, sidIdx);
+  Value numStocks = entry.getArgument(1); // i32
+  Value active = pb.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt,
+                                            sidI32, numStocks);
+  auto ifOp = pb.create<scf::IfOp>(loc, /*resultTypes=*/TypeRange{},
+                                     active, /*withElseRegion=*/false);
+
+  // Move all original ops (everything between the prologue we just
+  // inserted and the gpu.return) into the scf.if's then-region, before
+  // its implicit scf.yield.
+  Block &thenBlk = ifOp.getThenRegion().front();
+  Operation *thenYield = thenBlk.getTerminator();
+  thenBlk.getOperations().splice(thenYield->getIterator(),
+                                   entry.getOperations(),
+                                   origFirst->getIterator(),
+                                   gpuRet->getIterator());
+
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
 // Helpers used inside conversion patterns
 //===----------------------------------------------------------------------===//
 
-static Value emitStockId(OpBuilder &b, Location loc, Type idxTy) {
-  Value tid  = b.create<gpu::ThreadIdOp>(loc, idxTy, gpu::Dimension::x);
-  Value bid  = b.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::x);
-  Value bdim = b.create<gpu::BlockDimOp>(loc, idxTy, gpu::Dimension::x);
-  return b.create<arith::AddIOp>(
-      loc, b.create<arith::MulIOp>(loc, bid, bdim), tid);
-}
-
 // Read num_stocks (i32 func arg[1]) sign-extended to i64 for the linear gmem
 // address computation.  The bare i32 value is in arg[1]; we extend at every
 // use site (cheap, and lets the caller decide).
@@ -449,7 +515,8 @@ struct ConvertKunGpuToLLVMPass
       SmallVector<kunir::FuncOp> kfns;
       module.walk([&](kunir::FuncOp fn) { kfns.push_back(fn); });
       for (kunir::FuncOp fn : kfns)
-        convertFuncSignature(fn);
+        if (failed(convertFuncSignature(fn)))
+          return signalPassFailure();
     }
 
     // ── Phase 2 ────────────────────────────────────────────────────────
diff --git a/mlir/test/kungpu/lower_to_llvm.mlir b/mlir/test/kungpu/lower_to_llvm.mlir
index 2b28772..0107ec4 100644
--- a/mlir/test/kungpu/lower_to_llvm.mlir
+++ b/mlir/test/kungpu/lower_to_llvm.mlir
@@ -30,11 +30,24 @@ gpu.module @kungpu_kernels {
 // CHECK-SAME:    kungpu.output_names = ["out"]
 // CHECK-SAME:    kungpu.target_spec = #kunir<target_spec{
 //
+// ── Active-thread guard prologue ──────────────────────────────────────
+// Computes stock_id = bid*bdim + tid, compares with %num_stocks, then
+// wraps the original kernel body in scf.if so threads with
+// stock_id ≥ num_stocks fall straight through to gpu.return.
+// CHECK:       %[[TID:.*]]  = gpu.thread_id  x
+// CHECK:       %[[BID:.*]]  = gpu.block_id   x
+// CHECK:       %[[BDIM:.*]] = gpu.block_dim  x
+// CHECK:       %[[BTB:.*]]  = arith.muli %[[BID]], %[[BDIM]]
+// CHECK:       %[[SID:.*]]  = arith.addi %[[BTB]], %[[TID]]
+// CHECK:       %[[SIDI:.*]] = arith.index_cast %[[SID]] : index to i32
+// CHECK:       %[[ACTIVE:.*]] = arith.cmpi slt, %[[SIDI]], %[[NS]] : i32
+// CHECK:       scf.if %[[ACTIVE]] {
+//
 // time_length → arith.index_cast of arg0 (i32 → index)
-// CHECK:       %[[TLIDX:.*]] = arith.index_cast %[[TL]] : i32 to index
-// CHECK:       %[[OFFCST:.*]] = arith.constant 0 : i32
+// CHECK:         %[[TLIDX:.*]] = arith.index_cast %[[TL]] : i32 to index
+// CHECK:         %[[OFFCST:.*]] = arith.constant 0 : i32
 //
-// CHECK:       scf.for %[[T:.*]] = %{{.*}} to %[[TLIDX]] step %{{.*}}
+// CHECK:         scf.for %[[T:.*]] = %{{.*}} to %[[TLIDX]] step %{{.*}}
 //
 // ── ts.get on global %in at offset 0 ───────────────────────────────────
 // effective time = t − 0; stock_id = bid*bdim + tid; lin = effT*ns + sid.
@@ -61,6 +74,8 @@ gpu.module @kungpu_kernels {
 // CHECK:         %[[LIN2:.*]] = arith.addi %[[ROW2]],
 // CHECK:         %[[GEP2:.*]] = llvm.getelementptr %[[OUT]][%[[LIN2]]]
 // CHECK:         llvm.store %[[V]], %[[GEP2]]
+// scf.if + gpu.return: inactive threads (sid ≥ ns) skip the body and
+// arrive at gpu.return directly.
 // CHECK:       gpu.return
 kunir.func @test_copy(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index d4e3300..fc0cf2d 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -81,28 +81,42 @@ def main() -> int:
     assert exe.warps_per_cta == 4
     assert exe.vector_size   == 1
 
-    print()
-    print(f"=== launch ({args.time_length} × {args.num_stocks}) ===")
-    T, S = args.time_length, args.num_stocks
+    # Run the kernel for two num_stocks values:
+    #  - one that's a multiple of (warps_per_cta * 32 * vector_size) — no
+    #    tail block;
+    #  - one that isn't — exercises the active-thread guard inserted by
+    #    convert-kungpu-to-llvm phase 1.
+    block_x = exe.warps_per_cta * 32 * exe.vector_size
     rng = np.random.default_rng(0)
-    a_h = rng.standard_normal((T, S), dtype=np.float32)
-    b_h = rng.standard_normal((T, S), dtype=np.float32)
-    a   = cp.asarray(a_h)
-    b   = cp.asarray(b_h)
-    out = cp.zeros((T, S), dtype=cp.float32)
-
-    exe.launch({"a": a, "b": b, "sum": out})
-    cp.cuda.runtime.deviceSynchronize()
-    out_h = cp.asnumpy(out)
-    expected = a_h + b_h
-    if not np.allclose(out_h, expected, atol=1e-5):
-        diff = np.abs(out_h - expected)
-        print(f"FAIL — max abs diff {diff.max()}, "
-                f"argmax @ {np.unravel_index(diff.argmax(), diff.shape)}",
-                file=sys.stderr)
-        return 1
-    print(f"ok — output matches a + b on every (t, s) cell")
-    return 0
+    rc = 0
+    for label, S in [("aligned", args.num_stocks),
+                      ("unaligned (tail block)",
+                       args.num_stocks + (block_x // 2 + 7))]:
+        T = args.time_length
+        print()
+        is_aligned = (S % block_x == 0)
+        print(f"=== launch ({T} × {S}) — {label}, "
+               f"S % {block_x} = {S % block_x}, "
+               f"aligned={is_aligned} ===")
+        a_h = rng.standard_normal((T, S), dtype=np.float32)
+        b_h = rng.standard_normal((T, S), dtype=np.float32)
+        a   = cp.asarray(a_h)
+        b   = cp.asarray(b_h)
+        out = cp.zeros((T, S), dtype=cp.float32)
+        exe.launch({"a": a, "b": b, "sum": out})
+        cp.cuda.runtime.deviceSynchronize()
+        out_h = cp.asnumpy(out)
+        expected = a_h + b_h
+        if not np.allclose(out_h, expected, atol=1e-5):
+            diff = np.abs(out_h - expected)
+            print(f"  FAIL — max abs diff {diff.max()}, "
+                    f"argmax @ {np.unravel_index(diff.argmax(), diff.shape)}",
+                    file=sys.stderr)
+            rc = 1
+        else:
+            print(f"  ok — output matches a + b on every (t, s) cell "
+                   f"({T*S} cells)")
+    return rc
 
 
 if __name__ == "__main__":

From 82177f8ff891748eca37aa09dedefaebe5bef9e1 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 7 May 2026 20:58:34 -0700
Subject: [PATCH 10/59] multi-kernel

---
 mlir/include/KunCuda/Runtime.h         | 154 +++++--
 mlir/include/KunGpu/PtxBackend.h       |  19 +-
 mlir/lib/KunCuda/Runtime.cpp           | 543 ++++++++++++++++++++++---
 mlir/lib/KunGpu/PtxBackend.cpp         |  84 ++--
 mlir/lib/Python/MlirBinding.cpp        |  59 ++-
 mlir/test/python/test_kun_mlir.py      |  27 +-
 mlir/test/python/test_multi_kernel.py  | 139 +++++++
 mlir/test/python/test_windowed_temp.py |   7 +-
 8 files changed, 884 insertions(+), 148 deletions(-)
 create mode 100644 mlir/test/python/test_multi_kernel.py

diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index 9d6f1c4..b8b8ee9 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -1,20 +1,41 @@
 //===- Runtime.h - kun_cuda runtime: ExecutableData + Executable -------===//
 //
 // Pure runtime piece, decoupled from the MLIR compiler and the Python
-// binding.  The compiler produces an `ExecutableData` (cubin + metadata),
-// the runtime turns that into a loaded `Executable` (cuModuleLoadData +
-// cuModuleGetFunction) and launches it.
+// binding.  The compiler produces an `ExecutableData` (one cubin holding
+// N kernels + per-kernel I/O *names* + the user's graph_inputs /
+// graph_outputs lists).  The `Executable` ctor turns that into a loaded
+// kernel set plus a fully resolved schedule:
 //
-// This header forward-declares the two opaque CUDA Driver types it
-// stores by pointer (CUmodule / CUfunction) so consumers don't need to
-// pull in <cuda.h>.  These typedefs match cuda.h's verbatim — they have
-// been ABI-stable for two decades.
+//   names → buffer indices  ──→  topo sort  ──→  slot plan
+//
+// This split keeps the *compiler* concerned only with what's in the
+// cubin, and lets the *runtime* own everything that's really a graph
+// concern (dependency analysis, schedule, memory plan).  When we add
+// CUDA-graph support later, all the input it needs already lives in the
+// runtime: per-kernel buffer indices, the producer-kernel-of-each-buffer
+// map, and the intermediate slot mapping.
+//
+// Buffer-table layout (assigned at Executable-construction time):
+//   indices [0 .. numGraphInputs)             → graph inputs
+//   indices [numGraphInputs .. firstInter)    → graph outputs
+//   indices [firstInter .. numBuffers)        → intermediates
+//
+// Memory planning:
+//   Intermediates share a pre-allocated slot pool sized to
+//   `peakIntermediateSlots`.  Slot reuse is computed by refcount + LIFO
+//   free pool over the topo-sorted schedule.  Slots are allocated lazily
+//   on the first launch (and re-allocated if `(timeLength, numStocks)`
+//   changes), then reused across subsequent launches with the same shape.
+//
+// This header forward-declares the two opaque CUDA Driver types so
+// consumers don't need to pull in <cuda.h>.
 //
 //===----------------------------------------------------------------------===//
 
 #pragma once
 
 #include <cstdint>
+#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
@@ -26,26 +47,62 @@ typedef struct CUfunc_st *CUfunction;
 
 namespace kun_cuda {
 
-/// Everything needed to load + launch a compiled kunir kernel.  No CUDA
-/// types — kept POD so the compiler library can populate it without
-/// depending on cuda.h.
+/// Internal: the resolved schedule + memory plan.  Forward-declared so
+/// the public header doesn't have to expose buffer-index tables,
+/// producer maps, etc.  Fully defined in Runtime.cpp.
+struct GraphPlan;
+
+//===----------------------------------------------------------------------===//
+// Compile-time output (all names — runtime resolves them to indices)
+//===----------------------------------------------------------------------===//
+
+/// Per-kernel metadata, in name form.  This is what the compiler can
+/// produce by walking a single lowered llvm.func — no graph topology
+/// reasoning required.
+struct KernelMeta {
+  std::string kernelName;                    ///< symbol in the cubin
+  std::vector<std::string> inputNames;       ///< kungpu.input_names, in argv order
+  std::vector<std::string> outputNames;      ///< kungpu.output_names, in argv order
+};
+
+/// What the compiler hands the runtime: a cubin + the kernels it
+/// contains, declared purely by name.  `graphInputs` / `graphOutputs`
+/// are user-supplied: they pick which named buffers cross the
+/// graph-runtime boundary; everything else a kernel produces is treated
+/// as an intermediate.
 struct ExecutableData {
-  std::vector<char> cubin;             ///< raw cubin bytes (ELF)
-  std::string kernelName;              ///< symbol name in the cubin
-  std::vector<std::string> inputNames; ///< from kungpu.input_names
-  std::vector<std::string> outputNames;///< from kungpu.output_names
-  int64_t warpsPerCta = 1;             ///< from kungpu.target_spec
-  int64_t vectorSize  = 1;             ///< stocks-per-thread, from target_spec
+  std::vector<char> cubin;
+  int64_t warpsPerCta = 1;          ///< from kungpu.target_spec (graph-wide)
+  int64_t vectorSize  = 1;          ///< from kungpu.target_spec (graph-wide)
+  std::vector<KernelMeta> kernels;  ///< unordered set; runtime topo-sorts
+  std::vector<std::string> graphInputs;
+  std::vector<std::string> graphOutputs;
 };
 
-/// RAII wrapper around a loaded cubin + resolved kernel function.
-/// Construction calls `cuModuleLoadData` + `cuModuleGetFunction` on the
-/// CUDA primary context of the calling thread (which must already exist).
-/// Destruction calls `cuModuleUnload`.
+//===----------------------------------------------------------------------===//
+// Executable
+//===----------------------------------------------------------------------===//
+
+/// RAII wrapper around a loaded cubin + the resolved graph plan.
+///
+/// Construction:
+///   1. Resolve names → buffer indices (graphInputs first, graphOutputs
+///      next, intermediates last).
+///   2. Build per-kernel int-index I/O lists and a producer-of-each-buffer
+///      table; every consumed name must be a graph input or be produced.
+///   3. Validate the graph (single producer; every graph output is
+///      produced; no graph input is also produced by a kernel).
+///   4. Kahn topo sort over kernel-to-kernel edges.
+///   5. Slot plan via refcount + LIFO free pool.
+///   6. cuModuleLoadData + cuModuleGetFunction × N on the calling
+///      thread's primary CUDA context (which must already exist).
+///
+/// Destruction calls `cuModuleUnload` and frees the slot pool.
 class Executable {
 public:
-  /// Throws std::runtime_error on driver errors or missing CUDA context.
-  /// Takes an rvalue — caller `std::move`s the data in.
+  /// Throws std::runtime_error on driver errors, missing CUDA context,
+  /// or graph-validation failures.  Takes an rvalue — caller `std::move`s
+  /// the data in.
   explicit Executable(ExecutableData &&data);
   ~Executable();
 
@@ -56,33 +113,62 @@ class Executable {
   Executable(Executable &&)                 = delete;
   Executable &operator=(Executable &&)      = delete;
 
+  // ── Accessors (compile-time data) ─────────────────────────────────
   const ExecutableData &data() const noexcept { return data_; }
-  const std::string &kernelName() const noexcept { return data_.kernelName; }
-  const std::vector<std::string> &inputNames()  const noexcept { return data_.inputNames; }
-  const std::vector<std::string> &outputNames() const noexcept { return data_.outputNames; }
+  const std::vector<std::string> &graphInputs()  const noexcept { return data_.graphInputs; }
+  const std::vector<std::string> &graphOutputs() const noexcept { return data_.graphOutputs; }
   int64_t warpsPerCta() const noexcept { return data_.warpsPerCta; }
   int64_t vectorSize()  const noexcept { return data_.vectorSize; }
+  size_t  numKernels()  const noexcept { return data_.kernels.size(); }
+
+  // ── Accessors (runtime-resolved plan) ─────────────────────────────
+  // Defined out-of-line so the header doesn't need GraphPlan's layout.
 
-  /// Launch the kernel.  `timeLength` / `numStocks` describe the kernel
-  /// invocation as a whole — the caller is responsible for verifying all
-  /// device buffers are sized accordingly (TS layout: `(t, s)` at
-  /// `ptr + (t*numStocks + s) * sizeof(T)`).
+  /// Topo-sorted indices into `data().kernels` — the order the runtime
+  /// launches kernels on the single CUDA stream.
+  const std::vector<int> &launchOrder() const noexcept;
+  /// Total buffer-table slots = numGraphInputs + numGraphOutputs +
+  /// (number of distinct intermediates produced by kernels).
+  int  numBuffers()            const noexcept;
+  /// Number of physical intermediate buffers actually allocated by the
+  /// runtime (after slot reuse).
+  int  peakIntermediateSlots() const noexcept;
+
+  /// Launch every kernel in `launchOrder` on the default stream.
   ///
-  /// `args` keys must equal `inputNames ++ outputNames` (order doesn't
-  /// matter, names are looked up).  Grid configuration:
+  /// `args` keys must equal `graphInputs ++ graphOutputs` (order
+  /// doesn't matter; the runtime hashes them into the buffer table).
+  /// Intermediate buffers are owned by the executable and reused across
+  /// launches with matching `(timeLength, numStocks)`.
   ///
+  /// Grid configuration (per kernel — identical because warps_per_cta
+  /// and vector_size are graph-wide):
   ///   block_x = warps_per_cta * 32
   ///   grid_x  = ceil_div(numStocks, block_x * vector_size)
   ///
-  /// Synchronous on the default stream.  Throws std::runtime_error on
-  /// validation or driver errors.
+  /// Synchronous: `cuCtxSynchronize` is called once after the last
+  /// kernel.  Throws std::runtime_error on validation or driver errors.
   void launch(int64_t timeLength, int64_t numStocks,
               const std::vector<std::pair<std::string, uintptr_t>> &args);
 
 private:
+  /// Allocate (or re-allocate, if shape changed) the intermediate slot
+  /// pool.  Each slot holds one `T × S` float32 array.
+  void ensureSlotPool(int64_t timeLength, int64_t numStocks);
+  /// Free all slot allocations.  Called from dtor and on shape change.
+  void freeSlotPool();
+
   ExecutableData data_;
+  std::unique_ptr<GraphPlan> plan_;          ///< pImpl — defined in Runtime.cpp
+
   CUmodule cuModule_ = nullptr;
-  CUfunction cuFunc_ = nullptr;
+  std::vector<CUfunction> cuFuncs_;          ///< parallel to data_.kernels
+
+  // Lazily allocated intermediate buffers, one CUdeviceptr per slot
+  // (stored as uintptr_t to keep the header CUDA-free).
+  std::vector<uintptr_t> slotBufs_;
+  int64_t cachedT_ = -1;
+  int64_t cachedS_ = -1;
 };
 
 } // namespace kun_cuda
diff --git a/mlir/include/KunGpu/PtxBackend.h b/mlir/include/KunGpu/PtxBackend.h
index 4b26bea..01e51a3 100644
--- a/mlir/include/KunGpu/PtxBackend.h
+++ b/mlir/include/KunGpu/PtxBackend.h
@@ -86,11 +86,20 @@ ::mlir::LogicalResult compilePtxToCubin(::llvm::StringRef ptx,
                                           std::vector<char> &cubinOut,
                                           std::string &errorMsg);
 
-/// All-in-one: run the kunir → LLVM dialect pipeline, translate to LLVM
-/// IR, optimize, emit PTX, assemble to CUBIN, and pull the kernel
-/// metadata (name + I/O argument names + target-spec fields) off the
-/// lowered function so callers can hand the result to
-/// `kun_cuda::Executable` without re-walking the IR.
+/// Compile-only: run the kunir → LLVM dialect pipeline, translate to
+/// LLVM IR, optimize, emit PTX, assemble to CUBIN, then walk the
+/// lowered module to populate the per-kernel name metadata (one
+/// `KernelMeta` per `llvm.func` carrying `kungpu.target_spec`).  The
+/// caller is expected to fill in `out.graphInputs` / `out.graphOutputs`
+/// before constructing a `kun_cuda::Executable` from the result —
+/// graph topology is a runtime concern, not a compile-time one.
+///
+/// On success `out` is populated with: cubin, warpsPerCta, vectorSize
+/// (validated to be uniform across kernels), and the unordered list of
+/// kernels (each with its name and the input/output names from
+/// `kungpu.input_names` / `kungpu.output_names`).  Topology validation,
+/// topo sort, buffer indexing and slot planning all happen later, in
+/// the `Executable` ctor.
 ///
 /// The module is mutated in-place by the pipeline (same as
 /// `compileKunIrToPtx`).
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index 961e058..65f0f6c 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -1,4 +1,21 @@
 //===- Runtime.cpp - kun_cuda::Executable implementation ---------------===//
+//
+// The ctor pipeline is split into focused helpers — each step is small
+// enough to read top-to-bottom on its own:
+//
+//   buildBufferIndices   — assign integer indices to every named buffer
+//   resolveKernelIO      — per-kernel names → indices (rejects undeclared
+//                           inputs), build producer-of-each-buffer table
+//   validateGraph        — check single producer, graph_outputs all
+//                           produced, graph_inputs never kernel-produced
+//   topoSort             — Kahn's algorithm; rejects self-deps and cycles
+//   planSlots            — refcount + LIFO free pool over the topo order
+//
+// All helpers live in this file's anonymous namespace.  Future
+// CUDA-graph support reuses the same plan: `kernelInputBufs` +
+// `producerKernel` are exactly the dep edges cuGraph needs.
+//
+//===----------------------------------------------------------------------===//
 
 #include "KunCuda/Runtime.h"
 
@@ -7,11 +24,53 @@
 #include <limits>
 #include <sstream>
 #include <stdexcept>
+#include <unordered_map>
 
 namespace kun_cuda {
 
+//===----------------------------------------------------------------------===//
+// GraphPlan — pImpl payload, hidden from the public header
+//===----------------------------------------------------------------------===//
+
+/// Runtime-resolved schedule + memory plan.  All buffer references here
+/// are integer indices into the flat buffer table.  Storing
+/// `producerKernel` makes it cheap to re-derive kernel-to-kernel
+/// dependency edges (needed for future cuGraph support: kernel K's deps
+/// = {producerKernel[b] for b in kernelInputBufs[K], filtered to ≥ 0}).
+struct GraphPlan {
+  int numBuffers       = 0;
+  int numGraphInputs   = 0;
+  int numGraphOutputs  = 0;
+
+  // Name → index for the user-facing args dict.  Other lookups happen
+  // by integer indexing.
+  std::unordered_map<std::string, int> graphInputIdx;
+  std::unordered_map<std::string, int> graphOutputIdx;
+
+  // Per-kernel I/O resolved to buffer indices.  Parallel to ExecutableData::kernels.
+  std::vector<std::vector<int>> kernelInputBufs;
+  std::vector<std::vector<int>> kernelOutputBufs;
+
+  // producerKernel[bufIdx] = kernel that writes the buffer, or -1 if
+  // the buffer is a graph input.
+  std::vector<int> producerKernel;
+
+  // Topo order — a single valid linearization for the v0 single-stream
+  // launcher.
+  std::vector<int> launchOrder;
+
+  // Slot assignment: one entry per buffer index.  -1 if the buffer is a
+  // graph input/output; otherwise a slot index in [0, peakIntermediateSlots).
+  std::vector<int> intermediateBufToSlot;
+  int peakIntermediateSlots = 0;
+};
+
 namespace {
 
+//===----------------------------------------------------------------------===//
+// CUDA driver helpers
+//===----------------------------------------------------------------------===//
+
 void checkCu(CUresult r, const char *what) {
   if (r == CUDA_SUCCESS)
     return;
@@ -24,99 +83,433 @@ void checkCu(CUresult r, const char *what) {
 std::string joinNames(const std::vector<std::string> &v) {
   std::string r;
   for (size_t i = 0; i < v.size(); ++i) {
-    if (i)
-      r += ", ";
+    if (i) r += ", ";
     r += v[i];
   }
   return r;
 }
 
+//===----------------------------------------------------------------------===//
+// Plan-building helpers — small POD intermediates so each helper is
+// independent and trivially testable.
+//===----------------------------------------------------------------------===//
+
+struct BufTable {
+  int numBuffers      = 0;
+  int numGraphInputs  = 0;
+  int numGraphOutputs = 0;
+  // Name → index for *every* buffer (graph IO + intermediates).  Used by
+  // resolveKernelIO; the per-role maps below are kept around for the
+  // launch-time user args dict lookup.
+  std::unordered_map<std::string, int> nameToIdx;
+  std::unordered_map<std::string, int> graphInputIdx;
+  std::unordered_map<std::string, int> graphOutputIdx;
+};
+
+struct KernelIO {
+  std::vector<std::vector<int>> kernelInputBufs;   // [kernel][argv pos]
+  std::vector<std::vector<int>> kernelOutputBufs;
+  // producerKernel[bufIdx] = kernel index that writes that buffer, or
+  // -1 if it's a graph input.
+  std::vector<int> producerKernel;
+};
+
+struct SlotPlan {
+  std::vector<int> intermediateBufToSlot;
+  int peakIntermediateSlots = 0;
+};
+
+/// Step 1 — assign buffer indices.  Layout:
+///   [0 .. numGraphInputs)                        graph inputs
+///   [numGraphInputs .. numGraphInputs+numGraphOutputs)  graph outputs
+///   [..numBuffers)                               intermediates
+/// Intermediates are everything a kernel produces that isn't a
+/// graph_output; they get consecutive indices in first-seen order.
+BufTable buildBufferIndices(const std::vector<std::string> &graphInputs,
+                              const std::vector<std::string> &graphOutputs,
+                              const std::vector<KernelMeta> &kernels) {
+  BufTable t;
+
+  for (const auto &n : graphInputs) {
+    if (t.nameToIdx.count(n))
+      throw std::runtime_error(
+          "kun_cuda::Executable: duplicate name in graph_inputs: '" + n + "'");
+    int idx = static_cast<int>(t.nameToIdx.size());
+    t.nameToIdx[n] = idx;
+    t.graphInputIdx[n] = idx;
+  }
+  t.numGraphInputs = static_cast<int>(t.nameToIdx.size());
+
+  for (const auto &n : graphOutputs) {
+    if (t.nameToIdx.count(n))
+      throw std::runtime_error(
+          "kun_cuda::Executable: name '" + n +
+          "' appears in both graph_inputs and graph_outputs (or twice in "
+          "one of them)");
+    int idx = static_cast<int>(t.nameToIdx.size());
+    t.nameToIdx[n] = idx;
+    t.graphOutputIdx[n] = idx;
+  }
+  t.numGraphOutputs =
+      static_cast<int>(t.nameToIdx.size()) - t.numGraphInputs;
+
+  // Walk every kernel output and assign new indices to anything we
+  // haven't seen yet (intermediates).  We don't validate single-producer
+  // here — that's `validateGraph`'s job — but we do need to avoid
+  // double-assigning if two kernels share an output name.
+  for (const auto &k : kernels)
+    for (const auto &outName : k.outputNames)
+      if (!t.nameToIdx.count(outName))
+        t.nameToIdx[outName] = static_cast<int>(t.nameToIdx.size());
+
+  t.numBuffers = static_cast<int>(t.nameToIdx.size());
+  return t;
+}
+
+/// Step 2 — resolve each kernel's I/O name list to int indices, plus
+/// build the producer-of-each-buffer table.  Throws on a kernel input
+/// that wasn't declared anywhere (neither graph input nor produced).
+KernelIO resolveKernelIO(const std::vector<KernelMeta> &kernels,
+                           const BufTable &tbl) {
+  KernelIO kio;
+  kio.kernelInputBufs.resize(kernels.size());
+  kio.kernelOutputBufs.resize(kernels.size());
+  kio.producerKernel.assign(tbl.numBuffers, -1);
+
+  for (int kIdx = 0; kIdx < static_cast<int>(kernels.size()); ++kIdx) {
+    const auto &k = kernels[kIdx];
+
+    kio.kernelInputBufs[kIdx].reserve(k.inputNames.size());
+    for (const auto &n : k.inputNames) {
+      auto it = tbl.nameToIdx.find(n);
+      if (it == tbl.nameToIdx.end())
+        throw std::runtime_error(
+            "kun_cuda::Executable: kernel '" + k.kernelName + "' consumes '" +
+            n + "' which is neither a graph_input nor produced by any kernel");
+      kio.kernelInputBufs[kIdx].push_back(it->second);
+    }
+
+    kio.kernelOutputBufs[kIdx].reserve(k.outputNames.size());
+    for (const auto &n : k.outputNames) {
+      // Index existence is guaranteed by buildBufferIndices.
+      int b = tbl.nameToIdx.at(n);
+      kio.kernelOutputBufs[kIdx].push_back(b);
+      kio.producerKernel[b] = kIdx;  // last writer wins; validateGraph
+                                      // catches multi-producer below.
+    }
+  }
+  return kio;
+}
+
+/// Step 3 — graph-level validation.  Catches the cases buildBufferIndices /
+/// resolveKernelIO can't, namely:
+///   * two kernels claim to produce the same buffer
+///   * a graph_output is declared but never produced
+///   * a graph_input is also produced by a kernel (overlap is silly)
+void validateGraph(const std::vector<KernelMeta> &kernels,
+                     const std::vector<std::string> &graphOutputs,
+                     const BufTable &tbl,
+                     const KernelIO &kio) {
+  // Multi-producer: count how many times each output name appears as a
+  // kernel output.
+  std::unordered_map<std::string, int> outCounts;
+  std::unordered_map<std::string, std::string> firstProducer;
+  for (const auto &k : kernels) {
+    for (const auto &n : k.outputNames) {
+      if (++outCounts[n] == 1)
+        firstProducer[n] = k.kernelName;
+      else if (outCounts[n] == 2)
+        throw std::runtime_error(
+            "kun_cuda::Executable: name '" + n +
+            "' is produced by both kernel '" + firstProducer[n] +
+            "' and kernel '" + k.kernelName + "'");
+    }
+  }
+
+  // graph_outputs must be produced.
+  for (const auto &n : graphOutputs) {
+    int b = tbl.nameToIdx.at(n);
+    if (kio.producerKernel[b] < 0)
+      throw std::runtime_error(
+          "kun_cuda::Executable: graph_output '" + n +
+          "' is not produced by any kernel");
+  }
+
+  // graph_inputs must NOT be produced by any kernel — an input is by
+  // definition supplied by the caller.
+  for (const auto &kv : tbl.graphInputIdx) {
+    if (kio.producerKernel[kv.second] >= 0)
+      throw std::runtime_error(
+          "kun_cuda::Executable: graph_input '" + kv.first +
+          "' is also produced by a kernel; use a different name for the "
+          "kernel output");
+  }
+}
+
+/// Step 4 — Kahn topological sort over kernel-to-kernel edges.  An edge
+/// `producer → consumer` exists whenever consumer reads any buffer that
+/// producer writes.  Multi-edges between the same pair count as one.
+/// Throws if a kernel reads its own output, or on a cycle.
+std::vector<int> topoSort(const KernelIO &kio, int numKernels) {
+  std::vector<int> indeg(numKernels, 0);
+  std::vector<std::vector<int>> succ(numKernels);
+
+  // Build edges, deduped per (producer, consumer) pair.
+  for (int kIdx = 0; kIdx < numKernels; ++kIdx) {
+    std::vector<int> deps;
+    for (int b : kio.kernelInputBufs[kIdx]) {
+      int p = kio.producerKernel[b];
+      if (p < 0) continue;                   // graph input
+      if (p == kIdx)
+        throw std::runtime_error(
+            "kun_cuda::Executable: kernel index " + std::to_string(kIdx) +
+            " depends on its own output");
+      bool seen = false;
+      for (int d : deps) if (d == p) { seen = true; break; }
+      if (!seen) deps.push_back(p);
+    }
+    indeg[kIdx] = static_cast<int>(deps.size());
+    for (int p : deps) succ[p].push_back(kIdx);
+  }
+
+  std::vector<int> order;
+  order.reserve(numKernels);
+  std::vector<int> ready;
+  for (int i = 0; i < numKernels; ++i)
+    if (indeg[i] == 0) ready.push_back(i);
+  while (!ready.empty()) {
+    int k = ready.back();
+    ready.pop_back();
+    order.push_back(k);
+    for (int n : succ[k])
+      if (--indeg[n] == 0)
+        ready.push_back(n);
+  }
+  if (static_cast<int>(order.size()) != numKernels)
+    throw std::runtime_error(
+        "kun_cuda::Executable: cycle detected in kernel dependency graph");
+  return order;
+}
+
+/// Step 5 — slot allocation for intermediates.  Refcount = number of
+/// kernel-input slots that reference the buffer, plus +1 for graph
+/// outputs (so we never try to recycle them).  Walking the topo order:
+///   * before launching kernel K, assign a slot to each intermediate
+///     output of K (drawn from the LIFO free pool when possible),
+///   * after, decrement refcounts on K's inputs; any intermediate that
+///     hits zero returns its slot to the free pool, as does any
+///     intermediate output of K that no kernel reads at all.
+SlotPlan planSlots(const std::vector<int> &launchOrder,
+                    const BufTable &tbl,
+                    const KernelIO &kio) {
+  SlotPlan plan;
+  plan.intermediateBufToSlot.assign(tbl.numBuffers, -1);
+  const int firstIntermediate = tbl.numGraphInputs + tbl.numGraphOutputs;
+
+  // Initial refcounts: one per kernel-input reference to the buffer.
+  std::vector<int> refcount(tbl.numBuffers, 0);
+  for (const auto &ins : kio.kernelInputBufs)
+    for (int b : ins)
+      refcount[b]++;
+  // graph_outputs are caller-owned and have no slot; the extra pin just
+  // guarantees their refcount never reaches zero.
+  for (int i = tbl.numGraphInputs; i < firstIntermediate; ++i)
+    refcount[i]++;
+
+  std::vector<int> freePool;
+  int nextNew = 0;
+  auto allocSlot = [&]() -> int {
+    if (!freePool.empty()) { int s = freePool.back(); freePool.pop_back(); return s; }
+    int s = nextNew++;
+    if (nextNew > plan.peakIntermediateSlots) plan.peakIntermediateSlots = nextNew;
+    return s;
+  };
+
+  for (int kIdx : launchOrder) {
+    // Assign slots to this kernel's intermediate outputs first, so K never
+    // writes into a slot that one of its own inputs still occupies.
+    for (int b : kio.kernelOutputBufs[kIdx]) {
+      if (b < firstIntermediate) continue;
+      plan.intermediateBufToSlot[b] = allocSlot();
+    }
+    // Release intermediate inputs whose last reader was this kernel.
+    for (int b : kio.kernelInputBufs[kIdx]) {
+      if (--refcount[b] == 0 && b >= firstIntermediate) {
+        int s = plan.intermediateBufToSlot[b];
+        if (s >= 0) freePool.push_back(s);
+      }
+    }
+    // Dead outputs (read by no kernel) only need their slot while K runs.
+    for (int b : kio.kernelOutputBufs[kIdx])
+      if (b >= firstIntermediate && refcount[b] == 0)
+        freePool.push_back(plan.intermediateBufToSlot[b]);
+  }
+  return plan;
+}
+
 } // namespace
 
+//===----------------------------------------------------------------------===//
+// Executable
+//===----------------------------------------------------------------------===//
+
 Executable::Executable(ExecutableData &&data) : data_(std::move(data)) {
   // Require a primary context to already exist on the calling thread —
   // the caller's job to set one up (e.g. by allocating any device memory
   // through cupy / cudaMalloc).
   CUcontext cur = nullptr;
   checkCu(cuCtxGetCurrent(&cur), "cuCtxGetCurrent");
-  if (!cur) {
+  if (!cur)
     throw std::runtime_error(
         "kun_cuda::Executable: no current CUDA context.  Initialise the "
         "driver first (e.g. allocate any device memory via cupy or "
         "cudaMalloc) before constructing an Executable.");
-  }
+  if (data_.kernels.empty())
+    throw std::runtime_error(
+        "kun_cuda::Executable: ExecutableData has no kernels");
+  if (data_.graphInputs.empty())
+    throw std::runtime_error(
+        "kun_cuda::Executable: graph_inputs must be non-empty");
+  if (data_.graphOutputs.empty())
+    throw std::runtime_error(
+        "kun_cuda::Executable: graph_outputs must be non-empty");
+  // ── Build the runtime plan ───────────────────────────────────────
+  BufTable tbl  = buildBufferIndices(data_.graphInputs, data_.graphOutputs,
+                                       data_.kernels);
+  KernelIO kio  = resolveKernelIO(data_.kernels, tbl);
+  validateGraph(data_.kernels, data_.graphOutputs, tbl, kio);
+  std::vector<int> order = topoSort(kio, static_cast<int>(data_.kernels.size()));
+  SlotPlan slots = planSlots(order, tbl, kio);
+  plan_ = std::make_unique<GraphPlan>();
+  plan_->numBuffers          = tbl.numBuffers;
+  plan_->numGraphInputs      = tbl.numGraphInputs;
+  plan_->numGraphOutputs     = tbl.numGraphOutputs;
+  plan_->graphInputIdx       = std::move(tbl.graphInputIdx);
+  plan_->graphOutputIdx      = std::move(tbl.graphOutputIdx);
+  plan_->kernelInputBufs     = std::move(kio.kernelInputBufs);
+  plan_->kernelOutputBufs    = std::move(kio.kernelOutputBufs);
+  plan_->producerKernel      = std::move(kio.producerKernel);
+  plan_->launchOrder         = std::move(order);
+  plan_->intermediateBufToSlot = std::move(slots.intermediateBufToSlot);
+  plan_->peakIntermediateSlots = slots.peakIntermediateSlots;
+
+  // ── Load the cubin; a throwing ctor skips the dtor, so unload on error ──
   checkCu(cuModuleLoadData(&cuModule_, data_.cubin.data()),
            "cuModuleLoadData");
-  checkCu(cuModuleGetFunction(&cuFunc_, cuModule_, data_.kernelName.c_str()),
-           "cuModuleGetFunction");
+  try {
+    cuFuncs_.resize(data_.kernels.size(), nullptr);
+    for (size_t i = 0; i < data_.kernels.size(); ++i)
+      checkCu(cuModuleGetFunction(&cuFuncs_[i], cuModule_,
+                                   data_.kernels[i].kernelName.c_str()),
+               "cuModuleGetFunction");
+  } catch (...) { cuModuleUnload(cuModule_); throw; }
 }
 
 Executable::~Executable() {
-  // Best-effort unload; we deliberately don't propagate driver errors out
-  // of a destructor.
+  // Best-effort cleanup; we deliberately don't propagate driver errors
+  // out of a destructor.
+  freeSlotPool();
   if (cuModule_)
     cuModuleUnload(cuModule_);
 }
 
-void Executable::launch(
-    int64_t timeLength, int64_t numStocks,
-    const std::vector<std::pair<std::string, uintptr_t>> &args) {
-  // 1.  Resolve full ordered argument list (inputs first, then outputs).
-  std::vector<std::string> ordered;
-  ordered.reserve(data_.inputNames.size() + data_.outputNames.size());
-  for (auto &n : data_.inputNames)
-    ordered.push_back(n);
-  for (auto &n : data_.outputNames)
-    ordered.push_back(n);
-  if (ordered.empty())
-    throw std::runtime_error("kun_cuda::launch: kernel has no I/O args");
-
-  // 2.  Resolve each name to its device pointer — list is small, linear
-  //     scan is fine.
-  auto findArg = [&](const std::string &n) -> const uintptr_t * {
-    for (auto &kv : args)
-      if (kv.first == n)
-        return &kv.second;
-    return nullptr;
-  };
+void Executable::freeSlotPool() {
+  for (uintptr_t p : slotBufs_)
+    if (p) cuMemFree(static_cast<CUdeviceptr>(p));
+  slotBufs_.clear();
+  cachedT_ = -1;
+  cachedS_ = -1;
+}
 
-  std::vector<uintptr_t> resolved;
-  resolved.reserve(ordered.size());
-  for (auto &n : ordered) {
-    auto *a = findArg(n);
-    if (!a) {
-      throw std::runtime_error("kun_cuda::launch: missing argument '" + n +
-                                "' (kernel expects: " + joinNames(ordered) +
-                                ")");
-    }
-    resolved.push_back(*a);
+void Executable::ensureSlotPool(int64_t timeLength, int64_t numStocks) {
+  if (timeLength == cachedT_ && numStocks == cachedS_ &&
+      static_cast<int>(slotBufs_.size()) == plan_->peakIntermediateSlots)
+    return;
+  freeSlotPool();
+  if (plan_->peakIntermediateSlots == 0) {
+    cachedT_ = timeLength;
+    cachedS_ = numStocks;
+    return;
   }
+  size_t bytesPerSlot = static_cast<size_t>(timeLength) *
+                          static_cast<size_t>(numStocks) * sizeof(float);
+  slotBufs_.resize(plan_->peakIntermediateSlots, 0);
+  for (int i = 0; i < plan_->peakIntermediateSlots; ++i) {
+    CUdeviceptr p = 0;
+    checkCu(cuMemAlloc(&p, bytesPerSlot), "cuMemAlloc(intermediate slot)");
+    slotBufs_[i] = static_cast<uintptr_t>(p);
+  }
+  cachedT_ = timeLength;
+  cachedS_ = numStocks;
+}
+
+//===----------------------------------------------------------------------===//
+// Out-of-line plan accessors (header forward-declares GraphPlan)
+//===----------------------------------------------------------------------===//
 
-  // 3.  Caller is responsible for shape consistency; we only check that
-  //     (T, S) fit in i32 since the kernel signature uses i32 i32.
+const std::vector<int> &Executable::launchOrder() const noexcept {
+  return plan_->launchOrder;
+}
+int Executable::numBuffers() const noexcept { return plan_->numBuffers; }
+int Executable::peakIntermediateSlots() const noexcept {
+  return plan_->peakIntermediateSlots;
+}
+
+void Executable::launch(
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args) {
+  // 1.  Shape sanity (kernel signature is i32 i32).
   if (timeLength > std::numeric_limits<int32_t>::max() ||
       numStocks  > std::numeric_limits<int32_t>::max() ||
-      timeLength < 0 || numStocks < 0) {
+      timeLength < 0 || numStocks < 0)
     throw std::runtime_error(
         "kun_cuda::launch: time_length / num_stocks out of i32 range "
         "(kernel signature uses i32, i32)");
+
+  // 2.  Allocate intermediate slot pool if needed.
+  ensureSlotPool(timeLength, numStocks);
+
+  // 3.  Resolve user args into the flat buffer table.  Two hash lookups
+  //     per user arg, that's it.
+  std::vector<uintptr_t> bufPtrs(plan_->numBuffers, 0);
+  std::vector<bool>      filled(plan_->numBuffers, false);
+
+  for (const auto &kv : args) {
+    auto itIn = plan_->graphInputIdx.find(kv.first);
+    auto itOut = plan_->graphOutputIdx.find(kv.first);
+    int idx = -1;
+    if (itIn != plan_->graphInputIdx.end())
+      idx = itIn->second;
+    else if (itOut != plan_->graphOutputIdx.end())
+      idx = itOut->second;
+    else
+      throw std::runtime_error(
+          "kun_cuda::launch: unexpected argument '" + kv.first +
+          "' (expected: " + joinNames(data_.graphInputs) + " | " +
+          joinNames(data_.graphOutputs) + ")");
+    bufPtrs[idx] = kv.second;
+    filled[idx] = true;
   }
 
-  // 4.  Build kernel argv: [i32 time_len, i32 num_stocks, ptr0, ptr1, ...]
-  int32_t timeLenI32   = static_cast<int32_t>(timeLength);
-  int32_t numStocksI32 = static_cast<int32_t>(numStocks);
-  std::vector<CUdeviceptr> ptrs(resolved.size());
-  for (size_t i = 0; i < resolved.size(); ++i)
-    ptrs[i] = static_cast<CUdeviceptr>(resolved[i]);
-
-  std::vector<void *> argPtrs;
-  argPtrs.reserve(2 + ptrs.size());
-  argPtrs.push_back(&timeLenI32);
-  argPtrs.push_back(&numStocksI32);
-  for (auto &p : ptrs)
-    argPtrs.push_back(&p);
-
-  // 5.  block / grid.
+  // 4.  Confirm every graph input + output was supplied.
+  for (int i = 0; i < plan_->numGraphInputs + plan_->numGraphOutputs; ++i) {
+    if (filled[i]) continue;
+    std::string missing;
+    for (auto &kv : plan_->graphInputIdx)  if (kv.second == i) missing = kv.first;
+    if (missing.empty())
+      for (auto &kv : plan_->graphOutputIdx) if (kv.second == i) missing = kv.first;
+    throw std::runtime_error(
+        "kun_cuda::launch: missing argument '" + missing + "'");
+  }
+
+  // 5.  Fill intermediate slots from the pre-allocated pool.
+  for (int i = plan_->numGraphInputs + plan_->numGraphOutputs;
+        i < plan_->numBuffers; ++i) {
+    int slot = plan_->intermediateBufToSlot[i];
+    bufPtrs[i] = slotBufs_[slot];
+  }
+
+  // 6.  Launch each kernel in topo order.
   unsigned blockX = static_cast<unsigned>(data_.warpsPerCta * 32);
   if (blockX == 0)
     throw std::runtime_error("kun_cuda::launch: warps_per_cta is 0");
@@ -125,14 +518,34 @@ void Executable::launch(
   unsigned gridX = static_cast<unsigned>(
       (static_cast<uint64_t>(numStocks) + stocksPerBlock - 1) / stocksPerBlock);
 
-  // sharedMemBytes = 0 — shared memory is static (declared as
-  // `llvm.mlir.global addr_space=3` and allocated by ptxas into the
-  // cubin's `.shared` section); the dynamic-smem launch parameter does
-  // not apply.
-  checkCu(cuLaunchKernel(cuFunc_, gridX, 1, 1, blockX, 1, 1,
-                           /*sharedMemBytes=*/0, /*stream=*/nullptr,
-                           argPtrs.data(), nullptr),
-           "cuLaunchKernel");
+  int32_t timeLenI32   = static_cast<int32_t>(timeLength);
+  int32_t numStocksI32 = static_cast<int32_t>(numStocks);
+
+  for (int kIdx : plan_->launchOrder) {
+    const auto &ins  = plan_->kernelInputBufs[kIdx];
+    const auto &outs = plan_->kernelOutputBufs[kIdx];
+
+    std::vector<CUdeviceptr> ptrs;
+    ptrs.reserve(ins.size() + outs.size());
+    for (int b : ins)  ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
+    for (int b : outs) ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
+
+    std::vector<void *> argPtrs;
+    argPtrs.reserve(2 + ptrs.size());
+    argPtrs.push_back(&timeLenI32);
+    argPtrs.push_back(&numStocksI32);
+    for (auto &p : ptrs) argPtrs.push_back(&p);
+
+    // sharedMemBytes = 0 — shared memory is static (declared as
+    // `llvm.mlir.global addr_space=3` and allocated by ptxas into the
+    // cubin's `.shared` section); the dynamic-smem launch parameter does
+    // not apply.
+    checkCu(cuLaunchKernel(cuFuncs_[kIdx], gridX, 1, 1, blockX, 1, 1,
+                             /*sharedMemBytes=*/0, /*stream=*/nullptr,
+                             argPtrs.data(), nullptr),
+             "cuLaunchKernel");
+  }
+
   checkCu(cuCtxSynchronize(), "cuCtxSynchronize");
 }
 
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
index 0cb3dcb..e078db8 100644
--- a/mlir/lib/KunGpu/PtxBackend.cpp
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -287,7 +287,11 @@ LogicalResult compileKunIrToPtx(ModuleOp module,
 }
 
 //===----------------------------------------------------------------------===//
-// All-in-one: kunir → cubin + metadata
+// All-in-one: kunir → cubin + per-kernel name metadata
+//
+// Compile-time concerns only.  Topology / topo sort / buffer indices /
+// slot planning all happen later, in `kun_cuda::Executable`'s ctor —
+// see KunCuda/Runtime.h.
 //===----------------------------------------------------------------------===//
 
 LogicalResult compileKunIrToExecutable(ModuleOp module,
@@ -296,45 +300,73 @@ LogicalResult compileKunIrToExecutable(ModuleOp module,
                                         ::kun_cuda::ExecutableData &out) {
   // 1.  Run the kunir → LLVM dialect pipeline + emit PTX.  This mutates
   //     `module` in place so the discardable kunir metadata ends up on
-  //     the lowered llvm.func.
+  //     each lowered llvm.func.
   std::string ptx;
   if (failed(compileKunIrToPtx(module, ptxOpts, ptx)))
     return failure();
 
-  // 2.  Find the lowered kernel function (the one carrying our
-  //     kungpu.* discardable attributes) and pull metadata off it.
-  LLVM::LLVMFuncOp kernel;
+  // 2.  Walk every kernel function (carries kungpu.target_spec) and
+  //     emit a KernelMeta with names and target spec.
+  std::vector<::kun_cuda::KernelMeta> kernels;
+  std::vector<std::pair<int64_t, int64_t>> targetSpecs;  // (warps, vector)
+  std::vector<std::string> targetSpecOwners;             // for diagnostics
+
   module.walk([&](LLVM::LLVMFuncOp f) {
-    if (f->hasAttr(kFuncTargetSpecAttr)) {
-      kernel = f;
-      return WalkResult::interrupt();
+    if (!f->hasAttr(kFuncTargetSpecAttr))
+      return WalkResult::advance();
+
+    ::kun_cuda::KernelMeta km;
+    km.kernelName = f.getSymName().str();
+    if (auto inNames = getFuncInputNames(f))
+      for (auto a : inNames)
+        km.inputNames.push_back(llvm::cast<StringAttr>(a).str());
+    if (auto outNames = getFuncOutputNames(f))
+      for (auto a : outNames)
+        km.outputNames.push_back(llvm::cast<StringAttr>(a).str());
+
+    int64_t w = 1, v = 1;
+    if (auto ts = getFuncTargetSpec(f)) {
+      w = ts.getWarpsPerCta();
+      v = ts.getVectorSize();
     }
+    targetSpecs.emplace_back(w, v);
+    targetSpecOwners.push_back(km.kernelName);
+    kernels.push_back(std::move(km));
     return WalkResult::advance();
   });
-  if (!kernel)
+  if (kernels.empty())
     return module.emitError(
-        "compileKunIrToExecutable: cannot find a llvm.func with kungpu "
-        "metadata in the lowered module");
-
-  out.kernelName = kernel.getSymName().str();
-  if (auto inNames = getFuncInputNames(kernel)) {
-    for (auto a : inNames)
-      out.inputNames.push_back(llvm::cast<StringAttr>(a).str());
-  }
-  if (auto outNames = getFuncOutputNames(kernel)) {
-    for (auto a : outNames)
-      out.outputNames.push_back(llvm::cast<StringAttr>(a).str());
-  }
-  if (auto ts = getFuncTargetSpec(kernel)) {
-    out.warpsPerCta = ts.getWarpsPerCta();
-    out.vectorSize  = ts.getVectorSize();
+        "compileKunIrToExecutable: no llvm.func with kungpu metadata "
+        "found in the lowered module");
+
+  // 3.  Target spec must be uniform across kernels (block / grid config
+  //     is graph-wide in v0).
+  auto [warpsPerCta, vectorSize] = targetSpecs.front();
+  for (size_t i = 1; i < targetSpecs.size(); ++i) {
+    auto [w, v] = targetSpecs[i];
+    if (w != warpsPerCta || v != vectorSize)
+      return module.emitError(
+          "compileKunIrToExecutable: kernels disagree on warps_per_cta / "
+          "vector_size — graph-wide target spec required (")
+          << "kernel '" << targetSpecOwners[i] << "': warps_per_cta="
+          << w << " vector_size=" << v
+          << "; expected warps_per_cta=" << warpsPerCta
+          << " vector_size=" << vectorSize << ")";
   }
 
-  // 3.  Assemble PTX → CUBIN.
+  // 4.  Assemble PTX → CUBIN.
+  std::vector<char> cubin;
   std::string err;
-  if (failed(compilePtxToCubin(ptx, cubinOpts, out.cubin, err)))
+  if (failed(compilePtxToCubin(ptx, cubinOpts, cubin, err)))
     return module.emitError("compileKunIrToExecutable: ") << err;
 
+  // 5.  Populate `out`.  graphInputs / graphOutputs are caller-supplied
+  //     after this returns — leave them empty.
+  out = ::kun_cuda::ExecutableData{};
+  out.cubin       = std::move(cubin);
+  out.warpsPerCta = warpsPerCta;
+  out.vectorSize  = vectorSize;
+  out.kernels     = std::move(kernels);
   return success();
 }
 
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index c3b7d6a..f72a7a0 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -195,9 +195,9 @@ struct CollectedArgs {
 static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
                                    py::dict pyArgs) {
   std::vector<std::string> ordered;
-  ordered.reserve(exe.inputNames().size() + exe.outputNames().size());
-  for (auto &n : exe.inputNames())  ordered.push_back(n);
-  for (auto &n : exe.outputNames()) ordered.push_back(n);
+  ordered.reserve(exe.graphInputs().size() + exe.graphOutputs().size());
+  for (auto &n : exe.graphInputs())  ordered.push_back(n);
+  for (auto &n : exe.graphOutputs()) ordered.push_back(n);
   if (ordered.empty())
     throw std::runtime_error("launch: kernel has no I/O arguments");
 
@@ -236,10 +236,20 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
 }
 
 static std::unique_ptr<kun_cuda::Executable>
-pyCompile(PyModule &pm, const std::string &targetCpu,
+pyCompile(PyModule &pm,
+            const std::vector<std::string> &graphInputs,
+            const std::vector<std::string> &graphOutputs,
+            const std::string &targetCpu,
             const std::string &targetTriple,
             const std::string &targetFeatures, unsigned optLevel,
             unsigned sizeLevel, const std::string &ptxasPath) {
+  if (graphInputs.empty())
+    throw std::runtime_error(
+        "kun_mlir.compile: graph_inputs cannot be empty");
+  if (graphOutputs.empty())
+    throw std::runtime_error(
+        "kun_mlir.compile: graph_outputs cannot be empty");
+
   kungpu::PtxCompileOptions popts;
   if (!targetCpu.empty())      popts.targetCpu      = targetCpu;
   if (!targetTriple.empty())   popts.targetTriple   = targetTriple;
@@ -255,6 +265,10 @@ pyCompile(PyModule &pm, const std::string &targetCpu,
   if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), popts, copts,
                                                 data)))
     throw std::runtime_error("kun_mlir.compile failed");
+  // Graph topology is a runtime concern — fill it in here, just before
+  // handing off to Executable's ctor (which validates + plans).
+  data.graphInputs  = graphInputs;
+  data.graphOutputs = graphOutputs;
   return std::make_unique<kun_cuda::Executable>(std::move(data));
 }
 
@@ -292,11 +306,34 @@ PYBIND11_MODULE(kun_mlir, m) {
          "Assemble PTX → CUBIN via ptxas.  Returns bytes.");
 
   py::class_<kun_cuda::Executable>(m, "Executable")
-      .def_property_readonly("kernel_name",   &kun_cuda::Executable::kernelName)
-      .def_property_readonly("input_names",   &kun_cuda::Executable::inputNames)
-      .def_property_readonly("output_names",  &kun_cuda::Executable::outputNames)
+      .def_property_readonly("input_names",   &kun_cuda::Executable::graphInputs,
+            "Graph-level input names — match this against the keys of the "
+            "args dict you pass to launch().")
+      .def_property_readonly("output_names",  &kun_cuda::Executable::graphOutputs,
+            "Graph-level output names — match this against the keys of the "
+            "args dict you pass to launch().")
       .def_property_readonly("warps_per_cta", &kun_cuda::Executable::warpsPerCta)
       .def_property_readonly("vector_size",   &kun_cuda::Executable::vectorSize)
+      .def_property_readonly("num_kernels",
+            [](const kun_cuda::Executable &e) {
+              return e.numKernels();
+            })
+      .def_property_readonly("kernel_names",
+            [](const kun_cuda::Executable &e) {
+              std::vector<std::string> r;
+              r.reserve(e.data().kernels.size());
+              for (auto &km : e.data().kernels)
+                r.push_back(km.kernelName);
+              return r;
+            })
+      .def_property_readonly("launch_order",  &kun_cuda::Executable::launchOrder,
+            "Topo-sorted indices into kernel_names; the order kernels run "
+            "on the single CUDA stream.")
+      .def_property_readonly("peak_intermediate_slots",
+            &kun_cuda::Executable::peakIntermediateSlots,
+            "Number of intermediate buffers allocated by the runtime — "
+            "shape `(time_length, num_stocks)` each.")
+      .def_property_readonly("num_buffers",   &kun_cuda::Executable::numBuffers)
       .def_property_readonly("cubin",
             [](const kun_cuda::Executable &e) {
               const auto &b = e.data().cubin;
@@ -315,6 +352,8 @@ PYBIND11_MODULE(kun_mlir, m) {
 
   m.def("compile", &pyCompile,
          py::arg("module"),
+         py::arg("graph_inputs"),
+         py::arg("graph_outputs"),
          py::arg("target_cpu")     = "sm_80",
          py::arg("target_triple")  = "nvptx64-nvidia-cuda",
          py::arg("target_features") = "",
@@ -322,5 +361,9 @@ PYBIND11_MODULE(kun_mlir, m) {
          py::arg("size_level")     = 0u,
          py::arg("ptxas_path")     = "",
          "Compile a kunir module all the way to a loaded Executable "
-         "(kunir → LLVM dialect → LLVM IR → PTX → CUBIN → cuModuleLoad).");
+         "(kunir → LLVM dialect → LLVM IR → PTX → CUBIN → cuModuleLoad). "
+         "graph_inputs / graph_outputs name the buffers that flow in/out "
+         "of the whole kernel graph; everything else produced by the "
+         "kernels is treated as an intermediate and gets a runtime-managed "
+         "slot.");
 }
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index fc0cf2d..886cb3c 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -68,14 +68,25 @@ def main() -> int:
     # `mod` was already mutated by lower_to_ptx above; re-parse so compile()
     # gets a fresh kunir.func module.
     mod2 = kun_mlir.parse(SAMPLE_KUNIR)
-    exe = kun_mlir.compile(mod2, target_cpu=args.target, opt_level=3)
-    print(f"  kernel_name   = {exe.kernel_name}")
-    print(f"  input_names   = {exe.input_names}")
-    print(f"  output_names  = {exe.output_names}")
-    print(f"  warps_per_cta = {exe.warps_per_cta}")
-    print(f"  vector_size   = {exe.vector_size}")
-    print(f"  cubin bytes   = {len(exe.cubin)}")
-    assert exe.kernel_name == "test_addsum"
+    exe = kun_mlir.compile(mod2,
+                            graph_inputs=["a", "b"],
+                            graph_outputs=["sum"],
+                            target_cpu=args.target, opt_level=3)
+    print(f"  kernel_names           = {exe.kernel_names}")
+    print(f"  num_kernels            = {exe.num_kernels}")
+    print(f"  launch_order           = {exe.launch_order}")
+    print(f"  num_buffers            = {exe.num_buffers}")
+    print(f"  peak_intermediate_slots= {exe.peak_intermediate_slots}")
+    print(f"  input_names            = {exe.input_names}")
+    print(f"  output_names           = {exe.output_names}")
+    print(f"  warps_per_cta          = {exe.warps_per_cta}")
+    print(f"  vector_size            = {exe.vector_size}")
+    print(f"  cubin bytes            = {len(exe.cubin)}")
+    assert exe.kernel_names == ["test_addsum"]
+    assert exe.num_kernels == 1
+    assert exe.launch_order == [0]
+    assert exe.num_buffers == 3      # a, b, sum
+    assert exe.peak_intermediate_slots == 0  # no intermediates
     assert exe.input_names  == ["a", "b"]
     assert exe.output_names == ["sum"]
     assert exe.warps_per_cta == 4
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
new file mode 100644
index 0000000..6a7302a
--- /dev/null
+++ b/mlir/test/python/test_multi_kernel.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""End-to-end test for the v0 multi-kernel pipeline.
+
+Builds a graph with two kernels chained through one intermediate buffer:
+
+    add_kernel:    tmp = a + b
+    scale_kernel:  out = tmp * c
+
+graph_inputs  = ["a", "b", "c"]
+graph_outputs = ["out"]
+intermediate  = "tmp"  → 1 slot expected
+
+Verifies the compile-time topology / slot plan, then runs the kernels and
+checks the result against numpy.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+import textwrap
+
+import numpy as np
+
+
+SAMPLE_KUNIR = textwrap.dedent("""
+gpu.module @kungpu_kernels {
+  kunir.func @add_kernel(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"tmp"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+
+  kunir.func @scale_kernel(%t: !kunir.ts<f32, inf>, %c: !kunir.ts<f32, inf>)
+      inputs {%t = "tmp", %c = "c"}
+      outputs {"out"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.mul %t, %c : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+}
+""").strip()
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("-T", "--time-length", type=int, default=64)
+    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    args = ap.parse_args()
+
+    import kun_mlir
+    import cupy as cp
+
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    print("=== compile two-kernel graph ===")
+    mod = kun_mlir.parse(SAMPLE_KUNIR)
+    exe = kun_mlir.compile(mod,
+                            graph_inputs=["a", "b", "c"],
+                            graph_outputs=["out"],
+                            target_cpu=args.target, opt_level=3)
+
+    print(f"  kernel_names           = {exe.kernel_names}")
+    print(f"  num_kernels            = {exe.num_kernels}")
+    print(f"  launch_order           = {exe.launch_order}")
+    print(f"  num_buffers            = {exe.num_buffers}")
+    print(f"  peak_intermediate_slots= {exe.peak_intermediate_slots}")
+    print(f"  input_names            = {exe.input_names}")
+    print(f"  output_names           = {exe.output_names}")
+
+    # Topology checks.
+    assert exe.num_kernels == 2, exe.num_kernels
+    assert set(exe.kernel_names) == {"add_kernel", "scale_kernel"}, exe.kernel_names
+    # Producer (add) must come before consumer (scale).
+    add_pos = exe.launch_order.index(exe.kernel_names.index("add_kernel"))
+    scl_pos = exe.launch_order.index(exe.kernel_names.index("scale_kernel"))
+    assert add_pos < scl_pos, (exe.kernel_names, exe.launch_order)
+    # 3 graph inputs + 1 graph output + 1 intermediate.
+    assert exe.num_buffers == 5, exe.num_buffers
+    # One intermediate ("tmp") → exactly one slot.
+    assert exe.peak_intermediate_slots == 1, exe.peak_intermediate_slots
+    assert exe.input_names  == ["a", "b", "c"]
+    assert exe.output_names == ["out"]
+
+    # === launch ===
+    T, S = args.time_length, args.num_stocks
+    rng = np.random.default_rng(0)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    c_h = rng.standard_normal((T, S), dtype=np.float32)
+    a = cp.asarray(a_h)
+    b = cp.asarray(b_h)
+    c = cp.asarray(c_h)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    print()
+    print(f"=== launch ({T} × {S}) ===")
+    exe.launch({"a": a, "b": b, "c": c, "out": out})
+    cp.cuda.runtime.deviceSynchronize()
+    out_h = cp.asnumpy(out)
+
+    expected = (a_h + b_h) * c_h
+    if not np.allclose(out_h, expected, atol=1e-5):
+        diff = np.abs(out_h - expected)
+        print(f"  FAIL — max abs diff {diff.max()}, "
+                f"argmax @ {np.unravel_index(diff.argmax(), diff.shape)}",
+                file=sys.stderr)
+        return 1
+
+    print(f"  ok — output matches (a+b)*c on every (t, s) cell ({T*S} cells)")
+
+    # === second launch with different shape — exercises slot pool re-alloc ===
+    T2, S2 = T // 2, S + 64
+    a2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
+    b2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
+    c2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
+    out2 = cp.zeros((T2, S2), dtype=cp.float32)
+
+    print()
+    print(f"=== launch ({T2} × {S2}) — different shape, slot pool re-alloc ===")
+    exe.launch({"a": a2, "b": b2, "c": c2, "out": out2})
+    cp.cuda.runtime.deviceSynchronize()
+    out2_h = cp.asnumpy(out2)
+    expected2 = (cp.asnumpy(a2) + cp.asnumpy(b2)) * cp.asnumpy(c2)
+    if not np.allclose(out2_h, expected2, atol=1e-5):
+        diff = np.abs(out2_h - expected2)
+        print(f"  FAIL — max abs diff {diff.max()}", file=sys.stderr)
+        return 1
+    print(f"  ok — re-launched with new shape, output matches")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index 2904d50..bad7b65 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -89,8 +89,11 @@ def run_one(N: int, expected_placement: str, target: str,
 
     ir = build_ir(N, warps_per_cta=warps_per_cta, smem_size=smem_size)
     mod = kun_mlir.parse(ir)
-    exe = kun_mlir.compile(mod, target_cpu=target, opt_level=3)
-    print(f"  kernel={exe.kernel_name}  warps_per_cta={exe.warps_per_cta}  "
+    exe = kun_mlir.compile(mod,
+                            graph_inputs=["a", "b"],
+                            graph_outputs=["out"],
+                            target_cpu=target, opt_level=3)
+    print(f"  kernels={exe.kernel_names}  warps_per_cta={exe.warps_per_cta}  "
            f"vector_size={exe.vector_size}  cubin={len(exe.cubin)} bytes")
 
     # Random input.  T must be > N so we have at least one valid window.

From 89b6645dd431ab76c3edfd02adc2965b971dc588 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 7 May 2026 21:29:36 -0700
Subject: [PATCH 11/59] pybind

---
 KunQuant/jit/cuda.py                   | 143 +++++++++++
 KunQuant/passes/CodegenMLIR.py         | 225 +++++++++++++++++
 mlir/lib/Python/CMakeLists.txt         |   5 +-
 mlir/lib/Python/IRBuilder.cpp          | 333 +++++++++++++++++++++++++
 mlir/lib/Python/IRBuilder.h            |  28 +++
 mlir/lib/Python/MlirBinding.cpp        |  92 ++-----
 mlir/lib/Python/PyModule.h             |  86 +++++++
 mlir/test/python/test_kun_mlir.py      |   4 +-
 mlir/test/python/test_kun_to_cuda.py   | 149 +++++++++++
 mlir/test/python/test_multi_kernel.py  |   2 +-
 mlir/test/python/test_windowed_temp.py |   2 +-
 11 files changed, 988 insertions(+), 81 deletions(-)
 create mode 100644 KunQuant/jit/cuda.py
 create mode 100644 KunQuant/passes/CodegenMLIR.py
 create mode 100644 mlir/lib/Python/IRBuilder.cpp
 create mode 100644 mlir/lib/Python/IRBuilder.h
 create mode 100644 mlir/lib/Python/PyModule.h
 create mode 100644 mlir/test/python/test_kun_to_cuda.py

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
new file mode 100644
index 0000000..6d7402f
--- /dev/null
+++ b/KunQuant/jit/cuda.py
@@ -0,0 +1,143 @@
+"""GPU JIT entry point for KunQuant.
+
+Mirror of `KunQuant.jit.cfake.compileit` but targets a CUDA backend
+through the kun_mlir / kunir pipeline.  Reuses the existing Driver pass
+list (`Driver.optimize`) so any IR rewrites the CPU path benefits from
+also apply here — only the codegen layer is replaced.
+
+User entry point::
+
+    from KunQuant.jit.cuda import compileit, CudaCompilerConfig
+
+    exe = compileit(f, CudaCompilerConfig(gpu_arch="sm_80"))
+    exe.launch({"a": cp_a, "b": cp_b, "out": cp_out})
+
+Scope (v0):
+  * Single Function in, single kunir.func out.  Multi-Function /
+    auto-partition support is future work.
+  * dtype = "float" only (kunir lowers f32 today).
+  * Layout is implicit: kunir uses the TS-major layout exposed by the
+    runtime (see KunCuda/Runtime.h).
+"""
+
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional
+
+import kun_mlir
+
+from KunQuant.Driver import optimize
+from KunQuant.Stage import Function
+from KunQuant.passes.CodegenMLIR import TargetSpec, translate_function
+
+
+@dataclass
+class CudaCompilerConfig:
+    """Mirrors the parts of KunCompilerConfig that matter for GPU.
+
+    Exposes `gpu_arch`, `dtype`, the kunir target_spec fields, the
+    `opt_level` / `ptxas_path` knobs, and pass-list `options`.  The
+    CPU-only fields (blocking_len, input_layout, etc.) deliberately do
+    not appear here — they're not meaningful for the GPU path.
+    """
+    gpu_arch:    str = "sm_80"
+    dtype:         str = "float"   # only "float" supported in v0
+
+    # kunir.target_spec — graph-wide for v0.
+    occupancy:     int = 1
+    warps_per_cta: int = 4
+    smem_size:     int = 49152
+    vector_size:   int = 1
+
+    # ptx → cubin
+    opt_level:     int  = 3
+    ptxas_path:    str  = ""
+
+    # Pass-list options forwarded to optimize().  We seed reasonable GPU
+    # defaults; user-supplied keys override.
+    options:       Optional[dict] = None
+
+
+def _gpu_pass_options(cfg: CudaCompilerConfig) -> dict:
+    """Defaults for `Driver.optimize`'s `options` dict on the GPU path.
+
+    The CPU compileit() does the same kind of seeding — we replicate the
+    bits that affect graph rewriting.  `blocking_len` is needed by some
+    decompose paths (skip-list cutoff in WindowedMin/Max); we feed it
+    `warps_per_cta * 32 * vector_size`, which matches the GPU's
+    stocks-per-block.
+    """
+    opts: dict = {
+        "blocking_len":   cfg.warps_per_cta * 32 * cfg.vector_size,
+        # Fast-stat tricks rely on running stats / FMA orderings that
+        # don't map cleanly onto the GPU primitives we lower today.
+        # Keep it off until the corresponding kunir lowerings exist.
+        "no_fast_stat":   True,
+        # opt_reduce rewrites WindowedSum etc. into the stateful
+        # FastWindowedSum op, which kunir doesn't have a counterpart
+        # for yet — keep the canonical ForeachBackWindow + Reduce shape.
+        "opt_reduce":     False,
+    }
+    if cfg.options:
+        opts.update(cfg.options)
+    return opts
+
+
+def _to_dtype_token(dtype: str) -> str:
+    """Map a config dtype name to its kunir token; ValueError if unknown."""
+    if dtype == "float":  return "f32"
+    if dtype == "double": return "f64"
+    raise ValueError(f"KunQuant.jit.cuda: unsupported dtype '{dtype}' (supported: "
+                     f"float, double — kunir today only lowers float on GPU)")
+
+
+def compileit(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.Executable:
+    """Compile a single KunQuant Function to a GPU `kun_mlir.Executable`.
+
+    The Function is mutated in place by Driver.optimize() (same as the
+    CPU path).  Inputs/Outputs declared via `Input(name)` / `Output(...,
+    name)` become the resulting Executable's graph_inputs / graph_outputs.
+    """
+    if cfg.dtype not in ("float", "double"):
+        raise ValueError(
+            f"CudaCompilerConfig.dtype must be 'float' or 'double', got "
+            f"{cfg.dtype!r}")
+
+    # 1.  Same optimizer pipeline the CPU path runs.  This is where
+    #     WindowedSum etc. decompose into ForeachBackWindow + Reduce.
+    options = _gpu_pass_options(cfg)
+    optimize(f, options)
+
+    # 2.  Translate the post-optimize IR to a kun_mlir module.
+    target = TargetSpec(occupancy=cfg.occupancy,
+                          warps_per_cta=cfg.warps_per_cta,
+                          smem_size=cfg.smem_size,
+                          vector_size=cfg.vector_size)
+    ir = kun_mlir.IRBuilder()
+    in_names, out_names = translate_function(
+        f, target, ir, dtype=_to_dtype_token(cfg.dtype))
+    mod = ir.finish()
+
+    # 3.  Hand off to the kun_mlir compile pipeline.
+    return kun_mlir.compile(
+        mod,
+        graph_inputs=in_names,
+        graph_outputs=out_names,
+        gpu_arch=cfg.gpu_arch,
+        opt_level=cfg.opt_level,
+        ptxas_path=cfg.ptxas_path,
+    )
+
+
+def to_mlir(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.ModuleOp:
+    """Run the same passes + translator as `compileit`, but return the
+    kun_mlir module before PTX/CUBIN.  Useful for debugging the IR."""
+    options = _gpu_pass_options(cfg)
+    optimize(f, options)
+    target = TargetSpec(occupancy=cfg.occupancy,
+                          warps_per_cta=cfg.warps_per_cta,
+                          smem_size=cfg.smem_size,
+                          vector_size=cfg.vector_size)
+    ir = kun_mlir.IRBuilder()
+    translate_function(f, target, ir, dtype=_to_dtype_token(cfg.dtype))
+    return ir.finish()
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
new file mode 100644
index 0000000..7e24066
--- /dev/null
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -0,0 +1,225 @@
+"""Translate a (post-optimize) KunQuant Function into a kun_mlir module
+holding a single kunir.func inside a gpu.module.
+
+This is the GPU-side counterpart to passes.CodegenCpp.codegen_cpp; it
+runs after the same Driver.optimize() pipeline the CPU path uses, then
+walks the lowered IR and emits kunir ops via the kun_mlir.IRBuilder
+pybind class.
+
+Scope (v0): only the ops kunir currently supports.
+  - Elemwise binary: Add, Sub, Mul, Div, Max, Min
+  - Elemwise unary:  Abs, Log, Sign
+  - Cross-sectional: Rank
+  - Windowed:        WindowedTempOutput, ForeachBackWindow + IterValue,
+                      ReduceAdd / ReduceMul / ReduceMax / ReduceMin
+  - Boundaries:      Input, Output
+
+Anything else raises NotImplementedError with the offending op printed.
+"""
+
+from __future__ import annotations
+from typing import Dict, List, Tuple
+
+from KunQuant.Op import (
+    OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
+    ReductionOp, Rank,
+)
+from KunQuant.ops.ElewiseOp import (
+    Add, Sub, Mul, Div, Max, Min, Abs, Log, Sign,
+)
+from KunQuant.ops.ReduceOp import (
+    ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
+)
+from KunQuant.Stage import Function
+
+
+# ── Op-class → IRBuilder method dispatch ────────────────────────────
+
+_BINARY = {  # two-input ops → IRBuilder method name (see _emit_simple)
+    Add: "add", Sub: "sub", Mul: "mul", Div: "div",
+    Max: "max", Min: "min",
+}
+_UNARY = {  # one-input ops; Rank is dispatched here as "cs_rank"
+    Abs: "abs", Log: "log", Sign: "sign", Rank: "cs_rank",
+}
+_REDUCE = {  # reductions → IRBuilder method name (see _emit_reduction)
+    ReduceAdd: "reduce_add", ReduceMul: "reduce_mul",
+    ReduceMax: "reduce_max", ReduceMin: "reduce_min",
+}
+
+
+# ── Target spec carrier ─────────────────────────────────────────────
+
+class TargetSpec:
+    """Carrier for the GPU launch parameters mirrored from
+    kunir.target_spec.  All four fields are keyword-only constructor
+    arguments and are stored unchanged as same-named attributes."""
+    def __init__(self, *, occupancy: int = 1, warps_per_cta: int = 4,
+                 smem_size: int = 49152, vector_size: int = 1):
+        self.occupancy, self.warps_per_cta = occupancy, warps_per_cta
+        self.smem_size, self.vector_size = smem_size, vector_size
+
+
+# ── Helpers ─────────────────────────────────────────────────────────
+
+def _index_loop_members(f: Function) -> Tuple[
+        Dict[ForeachBackWindow, List[OpBase]],
+        Dict[ForeachBackWindow, List[ReductionOp]]]:
+    """Group `f.ops` by owning loop, preserving `f.ops` order.  Returns
+    (body, reduce): ReductionOps are keyed by `get_loop()`; any other op
+    with a non-None `get_parent()` is keyed by that parent."""
+    body: Dict[ForeachBackWindow, List[OpBase]] = {}
+    reduce: Dict[ForeachBackWindow, List[ReductionOp]] = {}
+    for member in f.ops:
+        if isinstance(member, ReductionOp):
+            reduce.setdefault(member.get_loop(), []).append(member)
+            continue
+        if member.get_parent() is not None:
+            body.setdefault(member.get_parent(), []).append(member)
+    return body, reduce
+
+
+def _emit_simple(op: OpBase, ir, val_map: Dict[OpBase, object]):
+    """Emit one non-control-flow op via `ir`, reading operands from
+    `val_map`; raises NotImplementedError for unsupported op classes."""
+    cls = type(op)  # the dict dispatch below matches the exact class only
+    if cls in _BINARY:
+        return getattr(ir, _BINARY[cls])(val_map[op.inputs[0]],
+                                           val_map[op.inputs[1]])
+    if cls in _UNARY:
+        return getattr(ir, _UNARY[cls])(val_map[op.inputs[0]])
+    if isinstance(op, WindowedTempOutput):
+        return ir.windowed_output(val_map[op.inputs[0]],
+                                    int(op.attrs["window"]))
+    raise NotImplementedError(
+        f"CodegenMLIR: op type {cls.__name__} is not supported by the "
+        f"GPU backend yet (op = {op})")
+
+
+def _emit_reduction(op: ReductionOp, ir, val_map: Dict[OpBase, object]):
+    builder_method = _REDUCE.get(type(op))  # None → reduction kind unsupported
+    if builder_method is None:
+        raise NotImplementedError(
+            f"CodegenMLIR: reduction {type(op).__name__} not supported yet "
+            f"(op = {op})")
+    if len(op.inputs) != 1:  # anything but exactly one input is rejected
+        raise NotImplementedError(
+            f"CodegenMLIR: reductions with init_val are not supported "
+            f"yet (op = {op})")
+    return getattr(ir, builder_method)(val_map[op.inputs[0]])
+
+
+# ── Main entry point ────────────────────────────────────────────────
+
+def translate_function(f: Function, target: TargetSpec, ir,
+                        dtype: str = "f32") -> Tuple[List[str], List[str]]:
+    """Emit `f` as a single kunir.func into the open `ir` (kun_mlir.IRBuilder).
+
+    Returns the pair (in_names, out_names) — the "name" attrs of f's Input
+    and Output ops in f.ops order — so the caller can pass them straight
+    to kun_mlir.compile() as graph_inputs / graph_outputs.
+    """
+    # 1.  Boundary ops in topo order — the kunir.func's I/O.
+    inputs:  List[Input]  = [op for op in f.ops if isinstance(op, Input)]
+    outputs: List[Output] = [op for op in f.ops if isinstance(op, Output)]
+    if not inputs:
+        raise ValueError("CodegenMLIR: function has no Input ops")
+    if not outputs:
+        raise ValueError("CodegenMLIR: function has no Output ops")
+
+    in_names  = [op.attrs["name"] for op in inputs]
+    out_names = [op.attrs["name"] for op in outputs]
+
+    # 2.  Pre-index loop members so we can emit each loop's body +
+    #     reductions contiguously (regardless of topo interleaving with
+    #     other loops).
+    body_ops_by_loop, reductions_by_loop = _index_loop_members(f)
+
+    # 3.  Open the kunir.func.  All inputs are ts<dtype, inf>; all
+    #     graph results are ts<dtype, 1>.
+    ts_inf = ir.ts_type(dtype, 0)
+    ts_1   = ir.ts_type(dtype, 1)
+
+    func_args = ir.begin_func(
+        name=f.name or "kernel",
+        input_types=[ts_inf] * len(inputs),
+        input_names=in_names,
+        output_names=out_names,
+        occupancy=target.occupancy, warps_per_cta=target.warps_per_cta,
+        smem_size=target.smem_size, vector_size=target.vector_size,
+        result_types=[ts_1] * len(outputs),
+    )
+
+    val_map: Dict[OpBase, object] = {}
+    emitted = set()
+    for inp, val in zip(inputs, func_args):
+        val_map[inp] = val
+        emitted.add(inp)
+
+    # 4.  Walk f.ops in topo order, emitting one op (or one whole loop)
+    #     at a time.
+    for op in f.ops:
+        if op in emitted:
+            continue
+        if isinstance(op, Input):
+            continue                      # already mapped from func_args
+        if isinstance(op, Output):
+            continue                      # handled at the end via Return
+        if isinstance(op, ForeachBackWindow):
+            _emit_loop(op, ir, val_map, ts_1,
+                        body_ops_by_loop.get(op, []),
+                        reductions_by_loop.get(op, []),
+                        emitted)
+            continue
+        if isinstance(op, ReductionOp) or op.get_parent() is not None:
+            # Should have been emitted as part of its enclosing loop;
+            # if we hit it here, the loop never appeared first — that's
+            # a bug in topo sort or in this translator's iteration.
+            raise RuntimeError(
+                f"CodegenMLIR: reduction/body op visited before its "
+                f"enclosing loop ({op})")
+        val_map[op] = _emit_simple(op, ir, val_map)
+        emitted.add(op)
+
+    # 5.  Close the function with Outputs in declared order.
+    return_values = [val_map[o.inputs[0]] for o in outputs]
+    ir.end_func(return_values)
+    return in_names, out_names
+
+
+def _emit_loop(loop: ForeachBackWindow, ir, val_map, ts_1,
+                body_ops: List[OpBase], reductions: List[ReductionOp],
+                emitted: set):
+    """Emit `loop` as one for_each_back_window region: body ops first
+    (IterValue → block arg, the rest via _emit_simple), then reductions
+    as the yielded values.  Updates `val_map` and `emitted` in place."""
+    loop_input_vals = [val_map[i] for i in loop.inputs]
+    n_results = len(reductions)
+    if n_results == 0:
+        raise NotImplementedError(
+            f"CodegenMLIR: ForeachBackWindow with no reductions "
+            f"(loop = {loop})")
+
+    block_args = ir.begin_for_each_back_window(
+        inputs=loop_input_vals,
+        window=int(loop.attrs["window"]),
+        result_types=[ts_1] * n_results,
+    )
+    # Block args mirror loop.inputs positionally; key them by source op.
+    block_arg_by_src = {src: block_args[i]
+                          for i, src in enumerate(loop.inputs)}
+
+    for body_op in body_ops:
+        if isinstance(body_op, IterValue):
+            val_map[body_op] = block_arg_by_src[body_op.inputs[1]]
+        else:
+            val_map[body_op] = _emit_simple(body_op, ir, val_map)
+        emitted.add(body_op)
+
+    yield_vals = [_emit_reduction(r, ir, val_map) for r in reductions]
+    loop_results = ir.end_for_each_back_window(yield_vals)
+    for r, lr in zip(reductions, loop_results):
+        val_map[r] = lr  # outside the loop, a reduction is the loop result
+        emitted.add(r)
+
+    emitted.add(loop)
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index 8901448..91a9bd7 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -10,7 +10,10 @@
 string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
 string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}")
 
-pybind11_add_module(kun_mlir SHARED MlirBinding.cpp)
+pybind11_add_module(kun_mlir SHARED
+  MlirBinding.cpp
+  IRBuilder.cpp
+)
 
 # Co-locate the binding with libKunCudaRuntime.so so $ORIGIN
 # (CMAKE_INSTALL_RPATH set at top level) resolves dependencies as
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
new file mode 100644
index 0000000..8df163c
--- /dev/null
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -0,0 +1,333 @@
+//===- IRBuilder.cpp - Programmatic kunir module construction from Python ===//
+
+#include "IRBuilder.h"
+#include "PyModule.h"
+
+#include <pybind11/stl.h>
+
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace py = pybind11;
+using namespace mlir;
+
+namespace kun_mlir_py {
+
+namespace {
+
+/// Stateful kunir builder.  Holds an MLIRContext + ModuleOp (via PyModule),
+/// a current OpBuilder insertion point, and a stack used by
+/// for_each_back_window's region nesting.
+class IRBuilder {
+public:
+  IRBuilder()
+      : pm_(std::make_unique<PyModule>()), b_(pm_->ctx.get()) {
+    Location loc = b_.getUnknownLoc();
+    pm_->module = OwningOpRef<ModuleOp>(ModuleOp::create(loc));
+    b_.setInsertionPointToEnd(pm_->module.get().getBody());
+    // One gpu.module per IRBuilder — kun_mlir's pipeline expects exactly
+    // one container for all kunir.func ops.
+    gpuMod_ = b_.create<gpu::GPUModuleOp>(loc, "kungpu_kernels");
+    b_.setInsertionPointToStart(&gpuMod_.getBodyRegion().front());
+  }
+
+  // ── Type construction ─────────────────────────────────────────────
+  Type tsType(const std::string &elemDtype, int64_t lookback) {
+    // lookback 0 → 'inf' sentinel; negatives would wrap in the cast below.
+    bool isF32 = elemDtype == "f32" || elemDtype == "float";
+    if (!isF32 && elemDtype != "f64" && elemDtype != "double")
+      throw std::runtime_error("IRBuilder.ts_type: unsupported elem dtype '" +
+                                 elemDtype + "' (expected f32/f64)");
+    if (lookback < 0)
+      throw std::runtime_error("IRBuilder.ts_type: lookback must be >= 0");
+    Type elem = isF32 ? Type(b_.getF32Type()) : Type(b_.getF64Type());
+    uint64_t lb = lookback == 0 ? std::numeric_limits<uint64_t>::max()
+                                  : static_cast<uint64_t>(lookback);
+    return kunir::TsType::get(pm_->ctx.get(), elem, lb);
+  }
+
+  // ── Function ──────────────────────────────────────────────────────
+  std::vector<Value>
+  beginFunc(const std::string &name,
+              std::vector<Type> inputTypes,
+              std::vector<std::string> inputNames,
+              std::vector<std::string> outputNames,
+              int64_t occupancy, int64_t warpsPerCta,
+              int64_t smemSize, int64_t vectorSize,
+              std::vector<Type> resultTypes) {
+    if (curFunc_)
+      throw std::runtime_error(
+          "IRBuilder.begin_func: a function is already open — call "
+          "end_func() first");
+    if (inputTypes.size() != inputNames.size())
+      throw std::runtime_error(
+          "IRBuilder.begin_func: input_types and input_names must have "
+          "the same length");
+    if (resultTypes.size() != outputNames.size())
+      throw std::runtime_error(
+          "IRBuilder.begin_func: result_types and output_names must have "
+          "the same length (non-void form: outputs become result types)");
+
+    // Restore insertion point to the gpu.module body before starting a
+    // new function (in case end_func left us at module scope already).
+    b_.setInsertionPointToEnd(&gpuMod_.getBodyRegion().front());
+
+    MLIRContext *ctx = pm_->ctx.get();
+    Location loc = b_.getUnknownLoc();
+
+    auto funcType = b_.getFunctionType(inputTypes, resultTypes);
+    auto inNamesAttr = b_.getArrayAttr(llvm::map_to_vector(
+        inputNames,
+        [&](const std::string &s) -> Attribute { return b_.getStringAttr(s); }));
+    auto outNamesAttr = b_.getArrayAttr(llvm::map_to_vector(
+        outputNames,
+        [&](const std::string &s) -> Attribute { return b_.getStringAttr(s); }));
+    auto target = kunir::TargetSpecAttr::get(ctx, occupancy, warpsPerCta,
+                                                smemSize, vectorSize);
+
+    curFunc_ = b_.create<kunir::FuncOp>(loc, name, funcType, inNamesAttr,
+                                          outNamesAttr, target);
+    Block &entry = curFunc_.getBodyBlock();
+    b_.setInsertionPointToStart(&entry);
+
+    std::vector<Value> args(entry.args_begin(), entry.args_end());
+    return args;
+  }
+
+  void endFunc(std::vector<Value> returnValues) {
+    if (!curFunc_)
+      throw std::runtime_error(
+          "IRBuilder.end_func: no open function — call begin_func() first");
+    if (!loopStack_.empty())
+      throw std::runtime_error(
+          "IRBuilder.end_func: " + std::to_string(loopStack_.size()) +
+          " for_each_back_window region(s) still open — close them first");
+
+    Location loc = b_.getUnknownLoc();
+    b_.create<kunir::ReturnOp>(loc, ValueRange(returnValues));
+
+    // Restore insertion point to gpu.module so the next begin_func
+    // appends a sibling.
+    b_.setInsertionPointToEnd(&gpuMod_.getBodyRegion().front());
+    curFunc_ = nullptr;
+  }
+
+  // ── Elemwise ops (InferTypeOpInterface — no result type needed) ──
+  Value addOp(Value a, Value b) { return makeBin<kunir::AddOp>(a, b); }
+  Value subOp(Value a, Value b) { return makeBin<kunir::SubOp>(a, b); }
+  Value mulOp(Value a, Value b) { return makeBin<kunir::MulOp>(a, b); }
+  Value divOp(Value a, Value b) { return makeBin<kunir::DivOp>(a, b); }
+  Value maxOp(Value a, Value b) { return makeBin<kunir::MaxOp>(a, b); }
+  Value minOp(Value a, Value b) { return makeBin<kunir::MinOp>(a, b); }
+
+  Value absOp(Value x)  { return makeUn<kunir::AbsOp>(x); }
+  Value logOp(Value x)  { return makeUn<kunir::LogOp>(x); }
+  Value signOp(Value x) { return makeUn<kunir::SignOp>(x); }
+  Value csRankOp(Value x) { return makeUn<kunir::CsRankOp>(x); }
+
+  // ── Windowed buffer materialization ───────────────────────────────
+  Value windowedOutputOp(Value x, int64_t length) {
+    auto inTs = llvm::cast<kunir::TsType>(x.getType());
+    auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(),
+                                          static_cast<uint64_t>(length));
+    return b_.create<kunir::WindowedOutputOp>(b_.getUnknownLoc(), resultTy, x,
+                                                length);
+  }
+
+  // ── For-each-back-window region ───────────────────────────────────
+  std::vector<Value>
+  beginForEachBackWindow(std::vector<Value> inputs, int64_t window,
+                            std::vector<Type> resultTypes) {
+    Location loc = b_.getUnknownLoc();
+    auto loopOp = b_.create<kunir::ForEachBackWindowOp>(loc, resultTypes,
+                                                          inputs, window);
+    // Populate body block: one block arg per input, each ts<elemType, 1>.
+    Block *body = new Block;
+    for (Value in : inputs) {
+      auto ts = llvm::cast<kunir::TsType>(in.getType());
+      body->addArgument(
+          kunir::TsType::get(pm_->ctx.get(), ts.getElementType(), 1), loc);
+    }
+    loopOp.getBody().push_back(body);
+
+    // Descend into the body; remember where to resume.
+    ipStack_.push_back(b_.saveInsertionPoint());
+    loopStack_.push_back(loopOp);
+    b_.setInsertionPointToStart(body);
+
+    return std::vector<Value>(body->args_begin(), body->args_end());
+  }
+
+  std::vector<Value>
+  endForEachBackWindow(std::vector<Value> yieldValues) {
+    if (loopStack_.empty())
+      throw std::runtime_error(
+          "IRBuilder.end_for_each_back_window: no open loop");
+    Location loc = b_.getUnknownLoc();
+    b_.create<kunir::YieldOp>(loc, ValueRange(yieldValues));
+
+    auto loopOp = loopStack_.back();
+    loopStack_.pop_back();
+    b_.restoreInsertionPoint(ipStack_.back());
+    ipStack_.pop_back();
+
+    return std::vector<Value>(loopOp.getResults().begin(),
+                                loopOp.getResults().end());
+  }
+
+  // ── Reductions (must be inside a loop body) ───────────────────────
+  Value reduceAddOp(Value x) { return makeReduce<kunir::ReduceAddOp>(x); }
+  Value reduceMulOp(Value x) { return makeReduce<kunir::ReduceMulOp>(x); }
+  Value reduceMaxOp(Value x) { return makeReduce<kunir::ReduceMaxOp>(x); }
+  Value reduceMinOp(Value x) { return makeReduce<kunir::ReduceMinOp>(x); }
+
+  // ── Finalize ──────────────────────────────────────────────────────
+  std::unique_ptr<PyModule> finish() {
+    if (curFunc_)
+      throw std::runtime_error(
+          "IRBuilder.finish: a function is still open — call end_func() "
+          "first");
+    if (!loopStack_.empty())
+      throw std::runtime_error(
+          "IRBuilder.finish: for_each_back_window region(s) still open");
+    return std::move(pm_);
+  }
+
+  std::string toString() const {
+    if (!pm_)
+      throw std::runtime_error(
+          "IRBuilder.to_string: builder has been consumed by finish()");
+    return pm_->toString();
+  }
+
+private:
+  template <typename OpTy> Value makeBin(Value a, Value b) {
+    return b_.create<OpTy>(b_.getUnknownLoc(), a, b);
+  }
+  template <typename OpTy> Value makeUn(Value x) {
+    return b_.create<OpTy>(b_.getUnknownLoc(), x);
+  }
+  template <typename OpTy> Value makeReduce(Value x) {
+    // SameOperandsAndResultType — pass x's type as the result type.
+    return b_.create<OpTy>(b_.getUnknownLoc(), x.getType(), x);
+  }
+
+  std::unique_ptr<PyModule> pm_;
+  OpBuilder b_;
+  gpu::GPUModuleOp gpuMod_;
+  kunir::FuncOp curFunc_;
+  std::vector<OpBuilder::InsertPoint> ipStack_;
+  std::vector<kunir::ForEachBackWindowOp> loopStack_;
+};
+
+std::string valueRepr(Value v) {
+  if (!v) return "<null Value>";  // nothing to print for a null handle
+  std::string text;
+  llvm::raw_string_ostream stream(text);
+  v.print(stream);
+  return text;
+}
+
+std::string typeRepr(Type t) {
+  if (!t) return "<null Type>";  // nothing to print for a null handle
+  std::string text;
+  llvm::raw_string_ostream stream(text);
+  t.print(stream);
+  return text;
+}
+
+} // namespace
+
+void registerIRBuilder(py::module &m) {
+  // Opaque MLIR Value / Type handles: only __repr__ / __str__ are bound and
+  // no constructor is exposed, so Python obtains them solely from IRBuilder
+  // calls.  They live as long as the IRBuilder + its resulting PyModule.
+  py::class_<Value>(m, "Value")
+      .def("__repr__", [](Value v) { return "<kun_mlir.Value " + valueRepr(v) + ">"; })
+      .def("__str__",  [](Value v) { return valueRepr(v); });
+
+  py::class_<Type>(m, "Type")
+      .def("__repr__", [](Type t) { return "<kun_mlir.Type " + typeRepr(t) + ">"; })
+      .def("__str__",  [](Type t) { return typeRepr(t); });
+
+  py::class_<IRBuilder>(m, "IRBuilder",
+        "Stateful builder that constructs a kunir module programmatically.\n"
+        "Wrap your translator around this — it's the canonical alternative "
+        "to round-tripping through MLIR text via parse().")
+      .def(py::init<>())
+
+      // Type
+      .def("ts_type", &IRBuilder::tsType,
+            py::arg("elem_dtype"), py::arg("lookback"),
+            "Build a !kunir.ts<elem_dtype, lookback>.  lookback==0 → 'inf'.")
+
+      // Function
+      .def("begin_func", &IRBuilder::beginFunc,
+            py::arg("name"),
+            py::arg("input_types"), py::arg("input_names"),
+            py::arg("output_names"),
+            py::arg("occupancy"), py::arg("warps_per_cta"),
+            py::arg("smem_size"), py::arg("vector_size"),
+            py::arg("result_types"),
+            "Open a new kunir.func.  Returns its argument Values.")
+      .def("end_func", &IRBuilder::endFunc, py::arg("return_values"),
+            "Close the current kunir.func with a kunir.return.")
+
+      // Elemwise
+      .def("add",    &IRBuilder::addOp,    py::arg("lhs"), py::arg("rhs"))
+      .def("sub",    &IRBuilder::subOp,    py::arg("lhs"), py::arg("rhs"))
+      .def("mul",    &IRBuilder::mulOp,    py::arg("lhs"), py::arg("rhs"))
+      .def("div",    &IRBuilder::divOp,    py::arg("lhs"), py::arg("rhs"))
+      .def("max",    &IRBuilder::maxOp,    py::arg("lhs"), py::arg("rhs"))
+      .def("min",    &IRBuilder::minOp,    py::arg("lhs"), py::arg("rhs"))
+      .def("abs",    &IRBuilder::absOp,    py::arg("x"))
+      .def("log",    &IRBuilder::logOp,    py::arg("x"))
+      .def("sign",   &IRBuilder::signOp,   py::arg("x"))
+      .def("cs_rank", &IRBuilder::csRankOp, py::arg("x"))
+
+      // Windowed materialization
+      .def("windowed_output", &IRBuilder::windowedOutputOp,
+            py::arg("x"), py::arg("length"))
+
+      // Loop
+      .def("begin_for_each_back_window", &IRBuilder::beginForEachBackWindow,
+            py::arg("inputs"), py::arg("window"), py::arg("result_types"),
+            "Open a for_each_back_window region.  Returns block args (one "
+            "per loop input, type ts<elem,1>).")
+      .def("end_for_each_back_window", &IRBuilder::endForEachBackWindow,
+            py::arg("yield_values"),
+            "Close the current for_each_back_window with a kunir.yield, "
+            "returning the loop op's results.")
+
+      // Reductions
+      .def("reduce_add", &IRBuilder::reduceAddOp, py::arg("x"))
+      .def("reduce_mul", &IRBuilder::reduceMulOp, py::arg("x"))
+      .def("reduce_max", &IRBuilder::reduceMaxOp, py::arg("x"))
+      .def("reduce_min", &IRBuilder::reduceMinOp, py::arg("x"))
+
+      // Finalize / debug
+      .def("to_string", &IRBuilder::toString,
+            "Print the module under construction (for debugging — does "
+            "not consume the builder).")
+      .def("finish", &IRBuilder::finish,
+            "Hand off the module to a kun_mlir.ModuleOp.  Builder is "
+            "consumed.");
+}
+
+} // namespace kun_mlir_py
diff --git a/mlir/lib/Python/IRBuilder.h b/mlir/lib/Python/IRBuilder.h
new file mode 100644
index 0000000..b8c17ae
--- /dev/null
+++ b/mlir/lib/Python/IRBuilder.h
@@ -0,0 +1,28 @@
+//===- IRBuilder.h - Programmatic kunir module construction from Python ---===//
+//
+// Exposes a stateful builder to Python so a translator (e.g. KunQuant's
+// codegen pass) can emit kunir ops without going through textual MLIR.
+//
+// Lifecycle:
+//   ir = kun_mlir.IRBuilder()
+//   args = ir.begin_func(name, in_types, in_names, out_names, occupancy,
+//                        warps_per_cta, smem_size, vector_size, result_types)
+//   v = ir.add(args[0], args[1])
+//   ir.end_func([v])
+//   ...
+//   mod = ir.finish()                # → kun_mlir.ModuleOp
+//
+// `Value` and `Type` are opaque wrappers around mlir::Value / mlir::Type.
+// They are valid only while the IRBuilder (and the resulting PyModule)
+// are alive.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace kun_mlir_py {
+/// Register the IRBuilder + Value + Type pybind classes on `m`.
+void registerIRBuilder(::pybind11::module &m);
+} // namespace kun_mlir_py
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index f72a7a0..0908a76 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -13,32 +13,13 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
-#include "mlir/IR/AsmState.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/DialectRegistry.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/OwningOpRef.h"
-#include "mlir/Parser/Parser.h"
-#include "mlir/Support/LLVM.h"
-
-// Dialect registrations
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Index/IR/IndexDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
-#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "PyModule.h"     // shared MLIRContext + ModuleOp wrapper
+#include "IRBuilder.h"    // pybind class for programmatic kunir construction
 
 #include "KunCuda/Runtime.h"
-#include "KunGpu/KunGpuDialect.h"
 #include "KunGpu/PtxBackend.h"
-#include "KunIr/KunIrDialect.h"
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/raw_ostream.h"
 
 #include <memory>
 #include <sstream>
@@ -48,67 +29,23 @@
 
 namespace py = pybind11;
 
-namespace {
-
-//===----------------------------------------------------------------------===//
-// MLIR module wrapper
-//===----------------------------------------------------------------------===//
-
-class PyModule {
-public:
-  PyModule()
-      : ctx(std::make_unique<mlir::MLIRContext>(makeRegistry(),
-                                                  mlir::MLIRContext::Threading::DISABLED)) {
-    ctx->loadAllAvailableDialects();
-  }
-
-  static mlir::DialectRegistry makeRegistry() {
-    mlir::DialectRegistry registry;
-    registry.insert<mlir::arith::ArithDialect>();
-    registry.insert<mlir::cf::ControlFlowDialect>();
-    registry.insert<mlir::func::FuncDialect>();
-    registry.insert<mlir::gpu::GPUDialect>();
-    registry.insert<mlir::index::IndexDialect>();
-    registry.insert<mlir::LLVM::LLVMDialect>();
-    registry.insert<mlir::math::MathDialect>();
-    registry.insert<mlir::NVVM::NVVMDialect>();
-    registry.insert<mlir::scf::SCFDialect>();
-    registry.insert<kunir::KunIrDialect>();
-    registry.insert<kungpu::KunGpuDialect>();
-    return registry;
-  }
+using kun_mlir_py::PyModule;
 
-  static std::unique_ptr<PyModule> parse(const std::string &text) {
-    auto pm = std::make_unique<PyModule>();
-    pm->module = mlir::parseSourceString<mlir::ModuleOp>(text, pm->ctx.get());
-    if (!pm->module)
-      throw std::runtime_error("kun_mlir.parse: failed to parse MLIR text");
-    return pm;
-  }
-
-  std::string toString() const {
-    std::string out;
-    llvm::raw_string_ostream os(out);
-    module.get().print(os);
-    os.flush();
-    return out;
-  }
-
-  std::unique_ptr<mlir::MLIRContext> ctx;
-  mlir::OwningOpRef<mlir::ModuleOp> module;
-};
+namespace {
 
 //===----------------------------------------------------------------------===//
 // One-shot helpers
 //===----------------------------------------------------------------------===//
 
-static std::string pyLowerToPtx(PyModule &pm, const std::string &targetCpu,
+static std::string pyLowerToPtx(PyModule &pm, const std::string &gpuArch,
                                   const std::string &targetTriple,
                                   const std::string &targetFeatures,
                                   unsigned optLevel,
                                   unsigned sizeLevel) {
   kungpu::PtxCompileOptions opts;
-  if (!targetCpu.empty())      opts.targetCpu      = targetCpu;
+  // `targetCpu` is what LLVM's TargetMachine API calls the SM arch — for
+  // NVPTX the "CPU" string IS the GPU compute capability ("sm_80" etc.).
+  if (!gpuArch.empty())        opts.targetCpu      = gpuArch;
   if (!targetTriple.empty())   opts.targetTriple   = targetTriple;
   if (!targetFeatures.empty()) opts.targetFeatures = targetFeatures;
   opts.optLevel  = optLevel;
@@ -239,7 +176,7 @@ static std::unique_ptr<kun_cuda::Executable>
 pyCompile(PyModule &pm,
             const std::vector<std::string> &graphInputs,
             const std::vector<std::string> &graphOutputs,
-            const std::string &targetCpu,
+            const std::string &gpuArch,
             const std::string &targetTriple,
             const std::string &targetFeatures, unsigned optLevel,
             unsigned sizeLevel, const std::string &ptxasPath) {
@@ -251,14 +188,14 @@ pyCompile(PyModule &pm,
         "kun_mlir.compile: graph_outputs cannot be empty");
 
   kungpu::PtxCompileOptions popts;
-  if (!targetCpu.empty())      popts.targetCpu      = targetCpu;
+  if (!gpuArch.empty())        popts.targetCpu      = gpuArch;
   if (!targetTriple.empty())   popts.targetTriple   = targetTriple;
   if (!targetFeatures.empty()) popts.targetFeatures = targetFeatures;
   popts.optLevel  = optLevel;
   popts.sizeLevel = sizeLevel;
 
   kungpu::PtxToCubinOptions copts;
-  copts.gpuArch   = targetCpu.empty() ? "sm_80" : targetCpu;
+  copts.gpuArch   = gpuArch.empty() ? "sm_80" : gpuArch;
   copts.ptxasPath = ptxasPath;
 
   kun_cuda::ExecutableData data;
@@ -278,6 +215,9 @@ PYBIND11_MODULE(kun_mlir, m) {
   m.doc() = "Bindings for the KunQuant MLIR compiler (kunir → PTX → CUBIN "
              "→ launch).";
 
+  // Programmatic kunir construction (Value/Type opaque wrappers, IRBuilder).
+  kun_mlir_py::registerIRBuilder(m);
+
   py::class_<PyModule>(m, "ModuleOp")
       .def("to_string", &PyModule::toString,
             "Return the textual MLIR form of the module.")
@@ -291,7 +231,7 @@ PYBIND11_MODULE(kun_mlir, m) {
 
   m.def("lower_to_ptx", &pyLowerToPtx,
          py::arg("module"),
-         py::arg("target_cpu")     = "sm_80",
+         py::arg("gpu_arch")       = "sm_80",
          py::arg("target_triple")  = "nvptx64-nvidia-cuda",
          py::arg("target_features") = "",
          py::arg("opt_level")      = 3u,
@@ -354,7 +294,7 @@ PYBIND11_MODULE(kun_mlir, m) {
          py::arg("module"),
          py::arg("graph_inputs"),
          py::arg("graph_outputs"),
-         py::arg("target_cpu")     = "sm_80",
+         py::arg("gpu_arch")       = "sm_80",
          py::arg("target_triple")  = "nvptx64-nvidia-cuda",
          py::arg("target_features") = "",
          py::arg("opt_level")      = 3u,
diff --git a/mlir/lib/Python/PyModule.h b/mlir/lib/Python/PyModule.h
new file mode 100644
index 0000000..58f2c06
--- /dev/null
+++ b/mlir/lib/Python/PyModule.h
@@ -0,0 +1,86 @@
+//===- PyModule.h - PyModule (MLIR ctx + ModuleOp) shared by bindings --===//
+//
+// Used by both MlirBinding.cpp (parse / compile entry points) and
+// IRBuilder.cpp (programmatic construction of a kunir module from Python).
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "mlir/Parser/Parser.h"
+#include "mlir/Support/LLVM.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunIr/KunIrDialect.h"
+
+#include "llvm/Support/raw_ostream.h"
+
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+namespace py = pybind11;
+
+namespace kun_mlir_py {
+
+class PyModule {
+public:
+  PyModule()
+      : ctx(std::make_unique<mlir::MLIRContext>(
+            makeRegistry(), mlir::MLIRContext::Threading::DISABLED)) {
+    ctx->loadAllAvailableDialects();
+  }
+
+  static mlir::DialectRegistry makeRegistry() {
+    mlir::DialectRegistry registry;
+    registry.insert<mlir::arith::ArithDialect>();
+    registry.insert<mlir::cf::ControlFlowDialect>();
+    registry.insert<mlir::func::FuncDialect>();
+    registry.insert<mlir::gpu::GPUDialect>();
+    registry.insert<mlir::index::IndexDialect>();
+    registry.insert<mlir::LLVM::LLVMDialect>();
+    registry.insert<mlir::math::MathDialect>();
+    registry.insert<mlir::NVVM::NVVMDialect>();
+    registry.insert<mlir::scf::SCFDialect>();
+    registry.insert<kunir::KunIrDialect>();
+    registry.insert<kungpu::KunGpuDialect>();
+    return registry;
+  }
+
+  static std::unique_ptr<PyModule> parse(const std::string &text) {
+    auto pm = std::make_unique<PyModule>();
+    pm->module = mlir::parseSourceString<mlir::ModuleOp>(text, pm->ctx.get());
+    if (!pm->module)
+      throw std::runtime_error("kun_mlir.parse: failed to parse MLIR text");
+    return pm;
+  }
+
+  std::string toString() const {
+    std::string out;
+    llvm::raw_string_ostream os(out);
+    module.get().print(os);
+    os.flush();
+    return out;
+  }
+
+  std::unique_ptr<mlir::MLIRContext> ctx;
+  mlir::OwningOpRef<mlir::ModuleOp> module;
+};
+
+} // namespace kun_mlir_py
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index 886cb3c..61821f4 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -53,7 +53,7 @@ def main() -> int:
 
     print()
     print(f"=== lower_to_ptx (target={args.target}, O3) ===")
-    ptx = kun_mlir.lower_to_ptx(mod, target_cpu=args.target, opt_level=3)
+    ptx = kun_mlir.lower_to_ptx(mod, gpu_arch=args.target, opt_level=3)
     assert "test_addsum" in ptx
     print(f"ok — produced {len(ptx)} bytes of PTX text")
 
@@ -71,7 +71,7 @@ def main() -> int:
     exe = kun_mlir.compile(mod2,
                             graph_inputs=["a", "b"],
                             graph_outputs=["sum"],
-                            target_cpu=args.target, opt_level=3)
+                            gpu_arch=args.target, opt_level=3)
     print(f"  kernel_names           = {exe.kernel_names}")
     print(f"  num_kernels            = {exe.num_kernels}")
     print(f"  launch_order           = {exe.launch_order}")
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
new file mode 100644
index 0000000..416f492
--- /dev/null
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""End-to-end test for the KunQuant Python-IR → MLIR → CUDA path.
+
+Builds a KunQuant Function with the high-level Op API, runs the same
+Driver.optimize() pipeline the CPU compileit uses, then compiles to a
+CUDA Executable via kun_mlir and validates against numpy.
+
+Two factors are exercised:
+  * elemwise: out = (a + b) * a - b * b
+  * windowed: ws  = WindowedSum(a + b, N)   (decomposes into
+                ForeachBackWindow + ReduceAdd inside the optimizer pass)
+
+Note: ops that lower to math.absf / math.log / math.copysign aren't
+exercised here yet — the kunir-to-LLVM pipeline doesn't link libdevice,
+so those would end up as unresolved __nv_* externals.  Once the math →
+LLVM intrinsic / libdevice lowering is wired up, swap in Abs/Log/Sign.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+
+import numpy as np
+
+from KunQuant.Op import Builder, Input, Output
+from KunQuant.ops import Add, Sub, Mul, WindowedSum
+from KunQuant.Stage import Function
+from KunQuant.jit.cuda import compileit, CudaCompilerConfig, to_mlir
+
+
+def build_func_elemwise() -> Function:
+    """out = (a + b) * a - b * b"""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        v = Sub(Mul(Add(a, bin_), a), Mul(bin_, bin_))
+        Output(v, "out")
+    return Function(builder.ops, name="elemwise_kernel")
+
+
+def build_func_windowed(N: int) -> Function:
+    """ws = WindowedSum(a + b, N)"""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        s = WindowedSum(Add(a, bin_), N)
+        Output(s, "ws")
+    return Function(builder.ops, name="windowed_kernel")
+
+
+def run_elemwise(target: str, T: int, S: int) -> int:
+    print("=== elemwise: out = (a+b)*a - b*b ===")
+    f = build_func_elemwise()
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    # Show the IR for sanity — same passes + translator, no compile.
+    mod = to_mlir(build_func_elemwise(), cfg)
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(0)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "out": out})
+    cp.cuda.runtime.deviceSynchronize()
+    out_h = cp.asnumpy(out)
+
+    expected = (a_h + b_h) * a_h - b_h * b_h
+    if not np.allclose(out_h, expected, atol=1e-5):
+        diff = np.abs(out_h - expected)
+        print(f"  FAIL — max abs diff {diff.max()}", file=sys.stderr)
+        return 1
+    print(f"  ok — output matches (a+b)*a - b*b on every cell ({T*S} cells)")
+    return 0
+
+
+def run_windowed(target: str, T: int, S: int, N: int) -> int:
+    """Check WindowedSum(a + b, N) on the GPU against numpy; 0 = pass."""
+    print(f"=== windowed: ws = WindowedSum(a + b, N={N}) ===")
+    f = build_func_windowed(N)
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    mod = to_mlir(build_func_windowed(N), cfg)
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(1)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "ws": out})
+    cp.cuda.runtime.deviceSynchronize()
+    out_h = cp.asnumpy(out)
+
+    cumsum = np.cumsum(a_h + b_h, axis=0, dtype=np.float64)
+    expected = np.empty((T, S), dtype=np.float32)
+    expected[:N - 1] = np.nan
+    expected[N - 1] = cumsum[N - 1]
+    if T > N:
+        expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+
+    diff = np.abs(out_h[N - 1:] - expected[N - 1:])
+    max_abs = float(diff.max())
+    atol = max(1e-3, 5e-7 * N)
+    if not max_abs <= atol:  # `not <=`, not `>`: a NaN diff must fail too
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e} at {idx}",
+                file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e})")
+    return 0
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("-T", "--time-length", type=int, default=64)
+    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    ap.add_argument("-N", "--window", type=int, default=5)
+    args = ap.parse_args()
+
+    import cupy as cp
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    rc = 0
+    rc |= run_elemwise(args.target, args.time_length, args.num_stocks)
+    print()
+    rc |= run_windowed(args.target, args.time_length, args.num_stocks, args.window)
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
index 6a7302a..bb2e6c9 100644
--- a/mlir/test/python/test_multi_kernel.py
+++ b/mlir/test/python/test_multi_kernel.py
@@ -63,7 +63,7 @@ def main() -> int:
     exe = kun_mlir.compile(mod,
                             graph_inputs=["a", "b", "c"],
                             graph_outputs=["out"],
-                            target_cpu=args.target, opt_level=3)
+                            gpu_arch=args.target, opt_level=3)
 
     print(f"  kernel_names           = {exe.kernel_names}")
     print(f"  num_kernels            = {exe.num_kernels}")
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index bad7b65..0545023 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -92,7 +92,7 @@ def run_one(N: int, expected_placement: str, target: str,
     exe = kun_mlir.compile(mod,
                             graph_inputs=["a", "b"],
                             graph_outputs=["out"],
-                            target_cpu=target, opt_level=3)
+                            gpu_arch=target, opt_level=3)
     print(f"  kernels={exe.kernel_names}  warps_per_cta={exe.warps_per_cta}  "
            f"vector_size={exe.vector_size}  cubin={len(exe.cubin)} bytes")
 

From 84b3376e9c29f5868fa9d6e5a0216b8046f79d9b Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Fri, 8 May 2026 00:13:30 -0700
Subject: [PATCH 12/59] use upstream gpu-to-binary. Support math functions

---
 KunQuant/jit/cuda.py                   |  63 +++-
 mlir/include/KunGpu/PtxBackend.h       | 129 ++++----
 mlir/lib/KunGpu/CMakeLists.txt         |  11 +-
 mlir/lib/KunGpu/PtxBackend.cpp         | 390 ++++++++-----------------
 mlir/lib/Python/CMakeLists.txt         |   6 +
 mlir/lib/Python/MlirBinding.cpp        |  94 +++---
 mlir/lib/Python/PyModule.cpp           | 101 +++++++
 mlir/lib/Python/PyModule.h             |  80 ++---
 mlir/test/python/test_kun_mlir.py      |  25 +-
 mlir/test/python/test_kun_to_cuda.py   |  74 +++--
 mlir/test/python/test_multi_kernel.py  |   4 +-
 mlir/test/python/test_windowed_temp.py |   4 +-
 12 files changed, 472 insertions(+), 509 deletions(-)
 create mode 100644 mlir/lib/Python/PyModule.cpp

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 6d7402f..b1bae78 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -21,6 +21,7 @@
 """
 
 from __future__ import annotations
+import os
 from dataclasses import dataclass
 from typing import Optional
 
@@ -31,6 +32,55 @@
 from KunQuant.passes.CodegenMLIR import TargetSpec, translate_function
 
 
+# Standard locations searched when CudaCompilerConfig.toolkit_path is left
+# empty.  A toolkit dir must contain `nvvm/libdevice/libdevice.10.bc` (the
+# upstream `gpu-module-to-binary` pass links libdevice into the LLVM
+# module) and `bin/ptxas` (PTX → cubin).
+_TOOLKIT_ENV_VARS  = ("CUDA_HOME", "CUDA_PATH", "CUDA_TOOLKIT_PATH",
+                       "CUDA_ROOT")
+_TOOLKIT_FALLBACKS = ("/usr/local/cuda", "/opt/cuda", "/opt/nvidia/cuda")
+
+
+def _is_toolkit_dir(path: str) -> bool:
+    # True iff `path` is non-empty and holds libdevice.10.bc + bin/ptxas.
+    required = (("nvvm", "libdevice", "libdevice.10.bc"), ("bin", "ptxas"))
+    return bool(path) and all(
+        os.path.isfile(os.path.join(path, *parts)) for parts in required)
+
+
+def find_cuda_toolkit(override: str = "") -> str:
+    """Locate a CUDA toolkit root suitable for `gpu-module-to-binary`.
+
+    Search order:
+      1. `override` (if non-empty and looks like a toolkit dir)
+      2. $CUDA_HOME / $CUDA_PATH / $CUDA_TOOLKIT_PATH / $CUDA_ROOT
+      3. Standard install paths (/usr/local/cuda, /opt/cuda, …)
+
+    Raises FileNotFoundError if nothing usable is found — the message
+    lists every location consulted so the caller can fix the env.
+    """
+    tried = []
+    if override:
+        tried.append(f"override={override!r}")
+        if _is_toolkit_dir(override):
+            return override
+    for env in _TOOLKIT_ENV_VARS:
+        val = os.environ.get(env, "")
+        if val:
+            tried.append(f"${env}={val!r}")
+            if _is_toolkit_dir(val):
+                return val
+    for fallback in _TOOLKIT_FALLBACKS:
+        tried.append(f"fallback={fallback!r}")
+        if _is_toolkit_dir(fallback):
+            return fallback
+    raise FileNotFoundError(
+        "Could not locate a CUDA toolkit (need "
+        "<root>/nvvm/libdevice/libdevice.10.bc and <root>/bin/ptxas). "
+        "Searched: " + ", ".join(tried) +
+        ". Set CUDA_PATH or pass toolkit_path explicitly.")
+
+
 @dataclass
 class CudaCompilerConfig:
     """Mirrors the parts of KunCompilerConfig that matter for GPU.
@@ -49,9 +99,11 @@ class CudaCompilerConfig:
     smem_size:     int = 49152
     vector_size:   int = 1
 
-    # ptx → cubin
+    # LLVM optimization level (forwarded to #nvvm.target<O = ...>).
     opt_level:     int  = 3
-    ptxas_path:    str  = ""
+    # Path to the CUDA toolkit (where libdevice.10.bc + ptxas live).
+    # Empty → compileit() auto-detects it via find_cuda_toolkit() in this module.
+    toolkit_path:  str  = ""
 
     # Pass-list options forwarded to optimize().  We seed reasonable GPU
     # defaults; user-supplied keys override.
@@ -103,6 +155,11 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.Executable:
             f"CudaCompilerConfig.dtype must be 'float' or 'double', got "
             f"{cfg.dtype!r}")
 
+    # Resolve the CUDA toolkit before invoking C++.  Auto-search if the
+    # user didn't pass an explicit path.  Failure here gives a useful
+    # message; failure later (in ptxas / libdevice link) is opaque.
+    toolkit_path = find_cuda_toolkit(cfg.toolkit_path)
+
     # 1.  Same optimizer pipeline the CPU path runs.  This is where
     #     WindowedSum etc. decompose into ForeachBackWindow + Reduce.
     options = _gpu_pass_options(cfg)
@@ -125,7 +182,7 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.Executable:
         graph_outputs=out_names,
         gpu_arch=cfg.gpu_arch,
         opt_level=cfg.opt_level,
-        ptxas_path=cfg.ptxas_path,
+        toolkit_path=toolkit_path,
     )
 
 
diff --git a/mlir/include/KunGpu/PtxBackend.h b/mlir/include/KunGpu/PtxBackend.h
index 01e51a3..16ed5b9 100644
--- a/mlir/include/KunGpu/PtxBackend.h
+++ b/mlir/include/KunGpu/PtxBackend.h
@@ -1,15 +1,24 @@
-//===- PtxBackend.h - Compile a kunir module all the way to PTX ---------===//
+//===- PtxBackend.h - Compile a kunir module to a CUDA cubin -----------===//
 //
-// Companion to `Pipelines.h` — runs the kunir-to-llvm dialect pipeline,
-// translates the resulting MLIR `gpu.module` to an `llvm::Module`, applies
-// the standard LLVM optimization pipeline (PassBuilder default
-// per-module pipeline, the same one mlir::makeOptimizingTransformer uses,
-// which is what upstream `gpu-module-to-binary` invokes via
-// `ModuleToObject::optimizeModule`), and finally emits PTX through
-// `NVPTXTargetMachine::addPassesToEmitFile(AssemblyFile)`.
+// Pipeline (single source of truth):
 //
-// This is the single C++ entry point downstream `kunir_to_ptx` callers
-// (host runtime, JIT) should plumb to.
+//   kunir → llvm dialect (our buildKunIrToLLVMPipeline)
+//        → upstream `gpu-module-to-binary{format=bin}`
+//        → cubin bytes pulled off the resulting `gpu.binary` op
+//
+// `gpu-module-to-binary` (via NVVMTargetAttrImpl) takes care of:
+//   * MLIR → LLVM IR translation,
+//   * libdevice.10.bc location + linking + AlwaysInline + DCE,
+//   * the LLVM optimization pipeline,
+//   * PTX emission via NVPTXTargetMachine,
+//   * ptxas invocation.
+//
+// We just attach an `#nvvm.target<chip = ..., O = ...>` to the gpu.module
+// and run the pass.  No more manual ptxas plumbing on the main path.
+//
+// `compileKunIrToPtx` is kept for **debug / inspection**: same pipeline,
+// but `format=isa` so we can read the PTX text instead of the cubin.
+// The main `compileKunIrToExecutable` does NOT route through it.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,92 +30,52 @@
 #include "llvm/ADT/StringRef.h"
 
 #include <string>
-#include <vector>
 
 namespace kungpu {
 
+/// Knobs forwarded to the upstream `#nvvm.target` attribute (which the
+/// `gpu-module-to-binary` pass reads via NVVMTargetAttrImpl).
+///
+/// `targetCpu` is the LLVM/NVPTX term — it carries the SM string
+/// ("sm_80", "sm_120", …) that ptxas / NVPTXTargetMachine consume.  The
+/// user-facing Python kwarg is `gpu_arch`.
 struct PtxCompileOptions {
-  /// LLVM optimization level (0..3, mapped to OptimizationLevel::O0..O3).
-  unsigned optLevel = 3;
-
-  /// LLVM size level (0..2). 0 disables size opts; rarely needed for GPU.
-  unsigned sizeLevel = 0;
+  unsigned    optLevel       = 3;       ///< maps to #nvvm.target<O = N>
+  std::string targetTriple   = "nvptx64-nvidia-cuda";
+  std::string targetCpu      = "sm_80"; ///< chip, e.g. "sm_120"
+  std::string targetFeatures;           ///< empty → derived from chip
 
-  /// SM target, e.g. "sm_80".  Defaults to a widely-supported value; the
-  /// caller should set it to whatever GPU it actually targets.
-  std::string targetTriple = "nvptx64-nvidia-cuda";
-  std::string targetCpu    = "sm_80";
-  /// Empty by default — let LLVM pick a PTX version compatible with the
-  /// chosen `targetCpu` (sm_80 → ptx70 etc., sm_120 → ptx87 etc.).
-  std::string targetFeatures;
+  /// Forwarded to gpu-module-to-binary's `toolkit` option.  Empty → the
+  /// pass searches CUDA_HOME / CUDA_PATH / standard paths.  Useful when
+  /// the right CUDA toolkit (the one with libdevice.10.bc + a matching
+  /// ptxas) isn't on PATH.
+  std::string toolkitPath;
 };
 
-/// End-to-end compile a `builtin.module` containing `gpu.module` kernels.
+/// Lower kunir → llvm dialect → emit PTX text.  **Debug / inspection
+/// only** — the main compile path goes straight to cubin.
 ///
-/// 1. Runs the kunir → LLVM dialect pipeline (see Pipelines.h).
-/// 2. Translates the LLVM-dialect module to llvm::Module via upstream
-///    `mlir::translateModuleToLLVMIR`.
-/// 3. Runs LLVM optimizations: `PassBuilder::buildPerModuleDefaultPipeline`
-///    at the chosen OptimizationLevel — this includes DCE, InstCombine,
-///    CSE, LICM, vectorization, etc.  The TargetMachine is the
-///    NVPTXTargetMachine for the requested SM, so target-specific
-///    pipeline tweaks fire too.
-/// 4. Emits PTX assembly via `TargetMachine::addPassesToEmitFile` with
-///    `CodeGenFileType::AssemblyFile`.
-///
-/// On success, `ptxOut` contains the PTX text.  On failure, returns
-/// `failure()` after reporting diagnostics through MLIR's standard
-/// channels.
+/// On success `ptxOut` holds the PTX assembly produced by the upstream
+/// `gpu-module-to-binary{format=isa}` pass.  Module is mutated in place
+/// (the gpu.module gets replaced with a gpu.binary op).
 ::mlir::LogicalResult compileKunIrToPtx(::mlir::ModuleOp module,
                                           const PtxCompileOptions &options,
                                           std::string &ptxOut);
 
-struct PtxToCubinOptions {
-  /// SM architecture to assemble for, e.g. "sm_80".
-  std::string gpuArch = "sm_80";
-  /// PTX ISA version for ptxas (passed via --gpu-name and -V).  Empty =
-  /// let ptxas choose its default.
-  std::string ptxasVersion;
-  /// Extra arguments forwarded verbatim to ptxas (e.g. {"-O3"}).
-  std::vector<std::string> extraArgs;
-  /// Optional override for the ptxas binary path.  When empty we search
-  /// PATH and the CUDA_HOME / CUDA_PATH env vars (same logic upstream
-  /// NVPTXSerializer uses).
-  std::string ptxasPath;
-};
-
-/// Assemble PTX text into a CUBIN binary.  This is the same operation
-/// upstream `NVPTXSerializer::compileToBinary` performs internally —
-/// shell out to `ptxas` — exposed as a standalone helper because the
-/// upstream class isn't part of the public C++ API.
-///
-/// On success, `cubinOut` contains the raw CUBIN bytes.
-::mlir::LogicalResult compilePtxToCubin(::llvm::StringRef ptx,
-                                          const PtxToCubinOptions &options,
-                                          std::vector<char> &cubinOut,
-                                          std::string &errorMsg);
-
-/// Compile-only: run the kunir → LLVM dialect pipeline, translate to
-/// LLVM IR, optimize, emit PTX, assemble to CUBIN, then walk the
-/// lowered module to populate the per-kernel name metadata (one
-/// `KernelMeta` per `llvm.func` carrying `kungpu.target_spec`).  The
-/// caller is expected to fill in `out.graphInputs` / `out.graphOutputs`
-/// before constructing a `kun_cuda::Executable` from the result —
-/// graph topology is a runtime concern, not a compile-time one.
+/// Main entry point: lower kunir, run gpu-module-to-binary{format=bin},
+/// and pull the cubin + per-kernel name metadata into an
+/// `ExecutableData` ready for `kun_cuda::Executable`.
 ///
-/// On success `out` is populated with: cubin, warpsPerCta, vectorSize
-/// (validated to be uniform across kernels), and the unordered list of
-/// kernels (each with its name and the input/output names from
-/// `kungpu.input_names` / `kungpu.output_names`).  Topology validation,
-/// topo sort, buffer indexing and slot planning all happen later, in
-/// the `Executable` ctor.
+/// Walks the lowered module for kernel metadata (name, target spec,
+/// I/O names) BEFORE the pass runs, since `gpu-module-to-binary`
+/// replaces the gpu.module with a gpu.binary op.  graphInputs /
+/// graphOutputs are NOT set here — the caller fills them on `out`
+/// after this returns (see KunCuda/Runtime.h).
 ///
-/// The module is mutated in-place by the pipeline (same as
-/// `compileKunIrToPtx`).
+/// The module is mutated in-place by the pipeline.
 ::mlir::LogicalResult
 compileKunIrToExecutable(::mlir::ModuleOp module,
-                          const PtxCompileOptions &ptxOpts,
-                          const PtxToCubinOptions &cubinOpts,
+                          const PtxCompileOptions &options,
                           ::kun_cuda::ExecutableData &out);
 
 } // namespace kungpu
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index e8a35e3..b51d2f7 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -34,11 +34,18 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   MLIRGPUToNVVMTransforms
   MLIRNVVMDialect
 
-  # LLVM IR translation + PTX emission
+  # Main path: kunir → llvm dialect → upstream gpu-module-to-binary.
+  # NVVMTarget supplies the serializeToObject impl that handles libdevice
+  # linking + LLVM optimization + PTX emission + ptxas invocation.
+  MLIRGPUTransforms
+  MLIRNVVMTarget
+  MLIRTargetLLVM
+
+  # LLVM IR translation registrations consumed transitively by
+  # NVVMTargetAttrImpl::serializeToObject.
   MLIRTargetLLVMIRExport
   MLIRBuiltinToLLVMIRTranslation
   MLIRLLVMToLLVMIRTranslation
   MLIRNVVMToLLVMIRTranslation
   MLIRGPUToLLVMIRTranslation
-  MLIRExecutionEngineUtils
 )
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
index e078db8..52ef710 100644
--- a/mlir/lib/KunGpu/PtxBackend.cpp
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -1,154 +1,23 @@
-//===- PtxBackend.cpp - Compile a kunir module all the way to PTX ------===//
+//===- PtxBackend.cpp - kunir → cubin (single upstream-pass pipeline) -===//
 
 #include "KunGpu/PtxBackend.h"
 #include "KunGpu/KunGpuUtils.h"
 #include "KunGpu/Pipelines.h"
 #include "KunIr/KunIrAttrs.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <cstdlib>
-
-namespace kungpu {
-
-namespace {
-
-/// Search for `ptxas` in the user-provided override, then PATH, then
-/// CUDA_HOME / CUDA_PATH / standard CUDA install locations.  Mirrors the
-/// search the upstream NVPTXSerializer does.
-static llvm::ErrorOr<std::string> findPtxas(::llvm::StringRef override) {
-  using namespace llvm::sys;
-  if (!override.empty() && fs::exists(override))
-    return std::string(override);
-  if (auto p = findProgramByName("ptxas"))
-    return p;
-  for (const char *envName : {"CUDA_HOME", "CUDA_PATH", "CUDA_TOOLKIT_PATH"}) {
-    if (const char *envVal = std::getenv(envName)) {
-      llvm::SmallString<256> p(envVal);
-      path::append(p, "bin", "ptxas");
-      if (fs::exists(p))
-        return std::string(p);
-    }
-  }
-  if (fs::exists("/usr/local/cuda/bin/ptxas"))
-    return std::string("/usr/local/cuda/bin/ptxas");
-  return std::make_error_code(std::errc::no_such_file_or_directory);
-}
-
-} // namespace
-
-::mlir::LogicalResult compilePtxToCubin(::llvm::StringRef ptx,
-                                          const PtxToCubinOptions &opts,
-                                          std::vector<char> &cubinOut,
-                                          std::string &errorMsg) {
-  using namespace llvm;
-
-  auto ptxasOrErr = findPtxas(opts.ptxasPath);
-  if (!ptxasOrErr) {
-    errorMsg = "compilePtxToCubin: ptxas not found "
-                "(looked in CUDA_HOME / CUDA_PATH / PATH / "
-                "/usr/local/cuda/bin); set ptxas_path or CUDA_HOME.";
-    return ::mlir::failure();
-  }
-
-  // Write PTX to a temp file.
-  SmallString<128> ptxPath, cubinPath, logPath;
-  if (auto ec = sys::fs::createTemporaryFile("kun-ptx", "ptx", ptxPath)) {
-    errorMsg = "compilePtxToCubin: createTemporaryFile(ptx): " + ec.message();
-    return ::mlir::failure();
-  }
-  if (auto ec = sys::fs::createTemporaryFile("kun-cubin", "cubin", cubinPath)) {
-    sys::fs::remove(ptxPath);
-    errorMsg = "compilePtxToCubin: createTemporaryFile(cubin): " + ec.message();
-    return ::mlir::failure();
-  }
-  if (auto ec = sys::fs::createTemporaryFile("kun-ptxlog", "log", logPath)) {
-    sys::fs::remove(ptxPath); sys::fs::remove(cubinPath);
-    errorMsg = "compilePtxToCubin: createTemporaryFile(log): " + ec.message();
-    return ::mlir::failure();
-  }
-
-  // Auto-cleanup.
-  struct CleanupOnExit {
-    SmallVectorImpl<char> &p; ~CleanupOnExit() { sys::fs::remove(p); }
-  };
-  CleanupOnExit c1{ptxPath}, c2{cubinPath}, c3{logPath};
-
-  {
-    std::error_code ec;
-    raw_fd_ostream os(ptxPath, ec, sys::fs::OF_None);
-    if (ec) {
-      errorMsg = "compilePtxToCubin: writing PTX: " + ec.message();
-      return ::mlir::failure();
-    }
-    os << ptx;
-  }
-
-  // Build argv:
-  //   ptxas --gpu-name=<sm_xx> -o <cubin> <ptx> [extra...]
-  std::string gpuArg = "--gpu-name=" + opts.gpuArch;
-  std::string outArg = "-o";
-  SmallVector<StringRef> argv = {*ptxasOrErr, gpuArg, outArg, cubinPath, ptxPath};
-  for (const auto &a : opts.extraArgs) argv.push_back(a);
-
-  std::string errBuf;
-  std::optional<StringRef> redirects[] = {std::nullopt,        // stdin
-                                            StringRef(logPath),  // stdout
-                                            StringRef(logPath)}; // stderr
-  int rc = sys::ExecuteAndWait(*ptxasOrErr, argv, /*Env=*/std::nullopt,
-                                 redirects, /*SecondsToWait=*/0,
-                                 /*MemoryLimit=*/0, &errBuf);
-  if (rc != 0) {
-    auto logBuf = MemoryBuffer::getFile(logPath);
-    errorMsg = "compilePtxToCubin: ptxas failed (exit " + std::to_string(rc) + ")";
-    if (!errBuf.empty()) errorMsg += ": " + errBuf;
-    if (logBuf && (*logBuf)->getBufferSize() > 0) {
-      errorMsg += "\n--- ptxas log ---\n";
-      errorMsg += (*logBuf)->getBuffer().str();
-    }
-    return ::mlir::failure();
-  }
-
-  auto cubinBuf = MemoryBuffer::getFile(cubinPath);
-  if (!cubinBuf) {
-    errorMsg = "compilePtxToCubin: cannot read cubin: " +
-                  cubinBuf.getError().message();
-    return ::mlir::failure();
-  }
-  StringRef bytes = (*cubinBuf)->getBuffer();
-  cubinOut.assign(bytes.begin(), bytes.end());
-  return ::mlir::success();
-}
-
-} // namespace kungpu
 
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/ExecutionEngine/OptUtils.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Pass/PassManager.h"
-#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Export.h"
 
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/TargetSelect.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/TargetParser/Host.h"
 
 using namespace mlir;
 
@@ -156,157 +25,132 @@ namespace kungpu {
 
 namespace {
 
-/// Look up the LLVM target for the given triple, lazily initializing the
-/// NVPTX target & asmprinter once per process.
-static llvm::Expected<const llvm::Target *>
-lookupNvptxTarget(llvm::StringRef triple) {
-  static const bool kInit = [] {
-    LLVMInitializeNVPTXTarget();
-    LLVMInitializeNVPTXTargetInfo();
-    LLVMInitializeNVPTXTargetMC();
-    LLVMInitializeNVPTXAsmPrinter();
-    return true;
-  }();
-  (void)kInit;
-
-  std::string err;
-  const llvm::Target *t = llvm::TargetRegistry::lookupTarget(triple.str(), err);
-  if (!t)
-    return llvm::createStringError(llvm::inconvertibleErrorCode(), err);
-  return t;
-}
-
-} // namespace
-
-LogicalResult compileKunIrToPtx(ModuleOp module,
-                                 const PtxCompileOptions &options,
-                                 std::string &ptxOut) {
-  MLIRContext *ctx = module.getContext();
+//===----------------------------------------------------------------------===//
+// Step 1: kunir → llvm dialect.  Shared lowering that both compileKunIrToPtx
+// and compileKunIrToExecutable run before handing off to the upstream
+// gpu-module-to-binary pass.
+//===----------------------------------------------------------------------===//
 
-  // ─── Step 1.  Run the kunir → LLVM dialect pipeline ────────────────
-  PassManager pm(ctx);
+LogicalResult lowerKunIrToLLVMDialect(ModuleOp module) {
+  PassManager pm(module.getContext());
   buildKunIrToLLVMPipeline(pm);
   if (failed(pm.run(module)))
     return module.emitError(
-        "compileKunIrToPtx: kunir-to-llvm pipeline failed");
+        "compileKunIr*: kunir-to-llvm pipeline failed");
+  return success();
+}
 
-  // ─── Step 2.  Translate MLIR LLVM dialect → llvm::Module ──────────
-  // Register only the translations we actually need (builtin + LLVM +
-  // NVVM + GPU); the upstream `registerAllToLLVMIRTranslations` would
-  // pull in ArmSVE / SPIR-V / etc. and force us to link them all.
-  DialectRegistry registry;
-  registerBuiltinDialectTranslation(registry);
-  registerLLVMDialectTranslation(registry);
-  registerNVVMDialectTranslation(registry);
-  registerGPUDialectTranslation(registry);
-  ctx->appendDialectRegistry(registry);
+//===----------------------------------------------------------------------===//
+// Step 2: attach #nvvm.target to the (single) gpu.module so the upstream
+// pass knows which chip / opt level / etc. to compile for.  We do this by
+// hand instead of running `nvvm-attach-target` to keep the chip / opt-level
+// knobs typed and avoid re-parsing the pass options string.
+//===----------------------------------------------------------------------===//
 
-  // Mirror upstream `gpu-module-to-binary` / NVPTXSerializer: translate
-  // the gpu.module (the kernel container) rather than the outer
-  // builtin.module — only the gpu.module's body is meant to become LLVM
-  // IR.  We just take the first gpu.module; multi-module support can
-  // come later.
+LogicalResult attachNvvmTarget(ModuleOp module,
+                                 const PtxCompileOptions &opts) {
   gpu::GPUModuleOp gpuMod;
-  module.walk([&](gpu::GPUModuleOp m) { gpuMod = m; return WalkResult::interrupt(); });
+  module.walk([&](gpu::GPUModuleOp m) {
+    gpuMod = m;
+    return WalkResult::interrupt();
+  });
   if (!gpuMod)
     return module.emitError(
-        "compileKunIrToPtx: no gpu.module found after lowering");
+        "compileKunIr*: no gpu.module found after the kunir-to-llvm "
+        "pipeline");
 
-  llvm::LLVMContext llvmCtx;
-  std::unique_ptr<llvm::Module> llvmModule =
-      translateModuleToLLVMIR(gpuMod, llvmCtx);
-  if (!llvmModule)
+  MLIRContext *ctx = module.getContext();
+  auto targetAttr = NVVM::NVVMTargetAttr::get(
+      ctx, /*optLevel=*/static_cast<int>(opts.optLevel),
+      /*triple=*/opts.targetTriple,
+      /*chip=*/opts.targetCpu,
+      /*features=*/opts.targetFeatures);
+  gpuMod.setTargetsAttr(ArrayAttr::get(ctx, {targetAttr}));
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Step 3: run gpu-module-to-binary, then dig out the resulting object's
+// payload (PTX text or cubin bytes).
+//===----------------------------------------------------------------------===//
+
+LogicalResult runGpuModuleToBinary(ModuleOp module,
+                                     const std::string &compilationTarget,
+                                     const std::string &toolkitPath,
+                                     std::string &outBytes) {
+  // The Python wrapper (`KunQuant.jit.cuda.find_cuda_toolkit`) is
+  // responsible for resolving an empty toolkit path.  If it's still
+  // empty here, the caller is using the C++ API directly without
+  // hand-resolving — pass it on and let the upstream pass try its own
+  // (limited) defaults.
+  GpuModuleToBinaryPassOptions passOpts;
+  passOpts.compilationTarget = compilationTarget;   // "isa" (PTX) | "bin" (cubin)
+  passOpts.toolkitPath       = toolkitPath;
+
+  PassManager pm(module.getContext());
+  pm.addPass(createGpuModuleToBinaryPass(passOpts));
+  if (failed(pm.run(module)))
+    return module.emitError(
+        "compileKunIr*: gpu-module-to-binary{format=")
+        << compilationTarget << "} failed";
+
+  // The pass replaces every gpu.module with a gpu.binary holding one
+  // gpu.object per target attribute.  We attached exactly one target,
+  // so we expect one binary with one object — pull its bytes out.
+  gpu::BinaryOp binary;
+  module.walk([&](gpu::BinaryOp op) {
+    binary = op;
+    return WalkResult::interrupt();
+  });
+  if (!binary)
     return module.emitError(
-        "compileKunIrToPtx: translation to LLVM IR failed");
+        "compileKunIr*: gpu-module-to-binary produced no gpu.binary "
+        "(target attr missing on gpu.module?)");
 
-  // ─── Step 3.  Build NVPTXTargetMachine ────────────────────────────
-  auto targetOrErr = lookupNvptxTarget(options.targetTriple);
-  if (!targetOrErr) {
-    llvm::handleAllErrors(targetOrErr.takeError(),
-                          [&](const llvm::ErrorInfoBase &eib) {
-                            module.emitError(
-                                "compileKunIrToPtx: NVPTX target lookup: ")
-                                << eib.message();
-                          });
-    return failure();
-  }
-  llvm::TargetOptions opts;
-  std::unique_ptr<llvm::TargetMachine> targetMachine{
-      (*targetOrErr)
-          ->createTargetMachine(llvm::Triple(options.targetTriple),
-                                options.targetCpu, options.targetFeatures,
-                                opts, /*RelocModel=*/std::nullopt,
-                                /*CodeModel=*/std::nullopt,
-                                llvm::CodeGenOptLevel::Aggressive)};
-  if (!targetMachine)
+  ArrayAttr objects = binary.getObjectsAttr();
+  if (!objects || objects.empty())
     return module.emitError(
-        "compileKunIrToPtx: failed to create NVPTXTargetMachine");
+        "compileKunIr*: gpu.binary has no objects");
+  auto obj = llvm::dyn_cast<gpu::ObjectAttr>(objects[0]);
+  if (!obj)
+    return module.emitError(
+        "compileKunIr*: gpu.binary's first object is not a #gpu.object");
 
-  llvmModule->setTargetTriple(llvm::Triple(options.targetTriple));
-  llvmModule->setDataLayout(targetMachine->createDataLayout());
+  StringAttr payload = obj.getObject();
+  outBytes.assign(payload.getValue().begin(), payload.getValue().end());
+  return success();
+}
 
-  // ─── Step 4.  Run LLVM PassBuilder default pipeline ───────────────
-  // This is the same entry point upstream `gpu-module-to-binary` uses
-  // (see ModuleToObject::optimizeModule → makeOptimizingTransformer).
-  // It builds the full new-PM per-module pipeline at the requested O level,
-  // which includes mem2reg, SROA, GVN, LICM, instcombine, DCE, vectorise,
-  // unroll, etc., plus NVPTX-specific tweaks (the TargetMachine is passed
-  // to PassBuilder so its pipeline-tuning hooks fire).
-  if (auto err = makeOptimizingTransformer(options.optLevel,
-                                            options.sizeLevel,
-                                            targetMachine.get())(
-          llvmModule.get())) {
-    llvm::handleAllErrors(std::move(err),
-                          [&](const llvm::ErrorInfoBase &eib) {
-                            module.emitError(
-                                "compileKunIrToPtx: LLVM opt pipeline: ")
-                                << eib.message();
-                          });
-    return failure();
-  }
+} // namespace
 
-  // ─── Step 5.  Emit PTX (AssemblyFile) via legacy codegen pipeline ─
-  // This is the standard path used by `mlir::ModuleToObject`: the legacy
-  // PassManager is required because `addPassesToEmitFile` is a legacy
-  // codegen API.  The new PM ran in step 4 — codegen still uses legacy.
-  llvm::SmallString<0> ptxBuf;
-  {
-    llvm::raw_svector_ostream stream(ptxBuf);
-    llvm::buffer_ostream bufStream(stream);
-    llvm::legacy::PassManager codegenPM;
-    if (targetMachine->addPassesToEmitFile(
-            codegenPM, bufStream, /*DwoOut=*/nullptr,
-            llvm::CodeGenFileType::AssemblyFile)) {
-      return module.emitError(
-          "compileKunIrToPtx: NVPTXTargetMachine cannot emit assembly");
-    }
-    codegenPM.run(*llvmModule);
-  }
-  ptxOut.assign(ptxBuf.begin(), ptxBuf.end());
-  return success();
+//===----------------------------------------------------------------------===//
+// Public: PTX (debug / inspection)
+//===----------------------------------------------------------------------===//
+
+LogicalResult compileKunIrToPtx(ModuleOp module,
+                                  const PtxCompileOptions &options,
+                                  std::string &ptxOut) {
+  if (failed(lowerKunIrToLLVMDialect(module))) return failure();
+  if (failed(attachNvvmTarget(module, options))) return failure();
+  return runGpuModuleToBinary(module, /*compilationTarget=*/"isa",
+                                options.toolkitPath, ptxOut);
 }
 
 //===----------------------------------------------------------------------===//
-// All-in-one: kunir → cubin + per-kernel name metadata
-//
-// Compile-time concerns only.  Topology / topo sort / buffer indices /
-// slot planning all happen later, in `kun_cuda::Executable`'s ctor —
-// see KunCuda/Runtime.h.
+// Public: kunir → cubin + per-kernel name metadata
 //===----------------------------------------------------------------------===//
 
 LogicalResult compileKunIrToExecutable(ModuleOp module,
-                                        const PtxCompileOptions &ptxOpts,
-                                        const PtxToCubinOptions &cubinOpts,
+                                        const PtxCompileOptions &options,
                                         ::kun_cuda::ExecutableData &out) {
-  // 1.  Run the kunir → LLVM dialect pipeline + emit PTX.  This mutates
-  //     `module` in place so the discardable kunir metadata ends up on
-  //     each lowered llvm.func.
-  std::string ptx;
-  if (failed(compileKunIrToPtx(module, ptxOpts, ptx)))
-    return failure();
+  // 1.  kunir → llvm dialect.  After this the gpu.module body is fully
+  //     lowered and our discardable kungpu.* attrs sit on llvm.func ops.
+  if (failed(lowerKunIrToLLVMDialect(module))) return failure();
 
   // 2.  Walk every kernel function (carries kungpu.target_spec) and
-  //     emit a KernelMeta with names and target spec.
+  //     gather its name + I/O lists.  Must happen BEFORE the next pass
+  //     since gpu-module-to-binary replaces the gpu.module with a
+  //     gpu.binary that has no llvm.func to walk.
   std::vector<::kun_cuda::KernelMeta> kernels;
   std::vector<std::pair<int64_t, int64_t>> targetSpecs;  // (warps, vector)
   std::vector<std::string> targetSpecOwners;             // for diagnostics
@@ -339,8 +183,7 @@ LogicalResult compileKunIrToExecutable(ModuleOp module,
         "compileKunIrToExecutable: no llvm.func with kungpu metadata "
         "found in the lowered module");
 
-  // 3.  Target spec must be uniform across kernels (block / grid config
-  //     is graph-wide in v0).
+  // 3.  Validate that the target spec is uniform across all kernels.
   auto [warpsPerCta, vectorSize] = targetSpecs.front();
   for (size_t i = 1; i < targetSpecs.size(); ++i) {
     auto [w, v] = targetSpecs[i];
@@ -354,16 +197,17 @@ LogicalResult compileKunIrToExecutable(ModuleOp module,
           << " vector_size=" << vectorSize << ")";
   }
 
-  // 4.  Assemble PTX → CUBIN.
-  std::vector<char> cubin;
-  std::string err;
-  if (failed(compilePtxToCubin(ptx, cubinOpts, cubin, err)))
-    return module.emitError("compileKunIrToExecutable: ") << err;
+  // 4.  Attach #nvvm.target + run gpu-module-to-binary{format=bin}.
+  if (failed(attachNvvmTarget(module, options))) return failure();
+  std::string cubin;
+  if (failed(runGpuModuleToBinary(module, /*compilationTarget=*/"bin",
+                                    options.toolkitPath, cubin)))
+    return failure();
 
   // 5.  Populate `out`.  graphInputs / graphOutputs are caller-supplied
   //     after this returns — leave them empty.
   out = ::kun_cuda::ExecutableData{};
-  out.cubin       = std::move(cubin);
+  out.cubin.assign(cubin.begin(), cubin.end());
   out.warpsPerCta = warpsPerCta;
   out.vectorSize  = vectorSize;
   out.kernels     = std::move(kernels);
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index 91a9bd7..3372a56 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -13,6 +13,7 @@ string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER
 pybind11_add_module(kun_mlir SHARED
   MlirBinding.cpp
   IRBuilder.cpp
+  PyModule.cpp
 )
 
 # Co-locate the binding with libKunCudaRuntime.so so $ORIGIN
@@ -53,6 +54,11 @@ target_link_libraries(kun_mlir PRIVATE
   MLIRGPUToNVVMTransforms
   MLIRReconcileUnrealizedCasts
 
+  # Required for `gpu-module-to-binary` (the upstream pass we drive in
+  # PtxBackend.cpp) + the NVVM target serializer it dispatches to.
+  MLIRGPUTransforms
+  MLIRNVVMTarget
+
   # Runtime side — owns cuda.h + libcuda; we just hand it ExecutableData
   # and call launch().
   KunCudaRuntime
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 0908a76..c2b54a1 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -1,13 +1,17 @@
-//===- MlirBinding.cpp - Python bindings for the kunir → PTX flow ------===//
+//===- MlirBinding.cpp - Python bindings for the kunir → cubin flow ----===//
 //
 // Exposes:
 //   kun_mlir.parse(text)            → ModuleOp     (loads MLIR text)
 //   ModuleOp.to_string() / __str__  → str          (dumps the module)
-//   kun_mlir.lower_to_ptx(mod, …)   → str          (kunir → PTX)
-//   kun_mlir.ptx_to_cubin(ptx, …)   → bytes        (PTX → CUBIN via ptxas)
+//   kun_mlir.lower_to_ptx(mod, …)   → str          (kunir → PTX, debug only)
 //   kun_mlir.compile(mod, …)        → Executable   (kunir → loadable kernel)
 //   Executable.launch({name: cupy}) → None         (cuLaunchKernel + sync)
 //
+// `compile` is the main path; `lower_to_ptx` is for inspecting the
+// intermediate PTX text that the upstream `gpu-module-to-binary` pass
+// produces (with `format=isa`).  Both go through the same lowering
+// pipeline — see PtxBackend.h.
+//
 //===----------------------------------------------------------------------===//
 
 #include <pybind11/pybind11.h>
@@ -34,22 +38,20 @@ using kun_mlir_py::PyModule;
 namespace {
 
 //===----------------------------------------------------------------------===//
-// One-shot helpers
+// PTX inspection (debug)
 //===----------------------------------------------------------------------===//
 
 static std::string pyLowerToPtx(PyModule &pm, const std::string &gpuArch,
                                   const std::string &targetTriple,
                                   const std::string &targetFeatures,
                                   unsigned optLevel,
-                                  unsigned sizeLevel) {
+                                  const std::string &toolkitPath) {
   kungpu::PtxCompileOptions opts;
-  // `targetCpu` is what LLVM's TargetMachine API calls the SM arch — for
-  // NVPTX the "CPU" string IS the GPU compute capability ("sm_80" etc.).
   if (!gpuArch.empty())        opts.targetCpu      = gpuArch;
   if (!targetTriple.empty())   opts.targetTriple   = targetTriple;
   if (!targetFeatures.empty()) opts.targetFeatures = targetFeatures;
-  opts.optLevel  = optLevel;
-  opts.sizeLevel = sizeLevel;
+  opts.optLevel    = optLevel;
+  opts.toolkitPath = toolkitPath;
 
   std::string ptx;
   if (failed(kungpu::compileKunIrToPtx(pm.module.get(), opts, ptx)))
@@ -57,23 +59,6 @@ static std::string pyLowerToPtx(PyModule &pm, const std::string &gpuArch,
   return ptx;
 }
 
-static py::bytes pyPtxToCubin(const std::string &ptx,
-                                const std::string &gpuArch,
-                                const std::vector<std::string> &extraArgs,
-                                const std::string &ptxasPath) {
-  kungpu::PtxToCubinOptions opts;
-  if (!gpuArch.empty())   opts.gpuArch   = gpuArch;
-  if (!ptxasPath.empty()) opts.ptxasPath = ptxasPath;
-  opts.extraArgs = extraArgs;
-
-  std::vector<char> cubin;
-  std::string errMsg;
-  if (failed(kungpu::compilePtxToCubin(ptx, opts, cubin, errMsg)))
-    throw std::runtime_error(errMsg.empty() ? "kun_mlir.ptx_to_cubin failed"
-                                              : errMsg);
-  return py::bytes(cubin.data(), cubin.size());
-}
-
 //===----------------------------------------------------------------------===//
 // pybind glue: read CAI dict → kun_cuda::DeviceArray, build name list
 //===----------------------------------------------------------------------===//
@@ -179,7 +164,7 @@ pyCompile(PyModule &pm,
             const std::string &gpuArch,
             const std::string &targetTriple,
             const std::string &targetFeatures, unsigned optLevel,
-            unsigned sizeLevel, const std::string &ptxasPath) {
+            const std::string &toolkitPath) {
   if (graphInputs.empty())
     throw std::runtime_error(
         "kun_mlir.compile: graph_inputs cannot be empty");
@@ -187,20 +172,15 @@ pyCompile(PyModule &pm,
     throw std::runtime_error(
         "kun_mlir.compile: graph_outputs cannot be empty");
 
-  kungpu::PtxCompileOptions popts;
-  if (!gpuArch.empty())        popts.targetCpu      = gpuArch;
-  if (!targetTriple.empty())   popts.targetTriple   = targetTriple;
-  if (!targetFeatures.empty()) popts.targetFeatures = targetFeatures;
-  popts.optLevel  = optLevel;
-  popts.sizeLevel = sizeLevel;
-
-  kungpu::PtxToCubinOptions copts;
-  copts.gpuArch   = gpuArch.empty() ? "sm_80" : gpuArch;
-  copts.ptxasPath = ptxasPath;
+  kungpu::PtxCompileOptions opts;
+  if (!gpuArch.empty())        opts.targetCpu      = gpuArch;
+  if (!targetTriple.empty())   opts.targetTriple   = targetTriple;
+  if (!targetFeatures.empty()) opts.targetFeatures = targetFeatures;
+  opts.optLevel    = optLevel;
+  opts.toolkitPath = toolkitPath;
 
   kun_cuda::ExecutableData data;
-  if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), popts, copts,
-                                                data)))
+  if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), opts, data)))
     throw std::runtime_error("kun_mlir.compile failed");
   // Graph topology is a runtime concern — fill it in here, just before
   // handing off to Executable's ctor (which validates + plans).
@@ -235,15 +215,10 @@ PYBIND11_MODULE(kun_mlir, m) {
          py::arg("target_triple")  = "nvptx64-nvidia-cuda",
          py::arg("target_features") = "",
          py::arg("opt_level")      = 3u,
-         py::arg("size_level")     = 0u,
-         "Lower kunir → PTX text.  Returns a Python str.");
-
-  m.def("ptx_to_cubin", &pyPtxToCubin,
-         py::arg("ptx"),
-         py::arg("gpu_arch")   = "sm_80",
-         py::arg("extra_args") = std::vector<std::string>{},
-         py::arg("ptxas_path") = "",
-         "Assemble PTX → CUBIN via ptxas.  Returns bytes.");
+         py::arg("toolkit_path")   = "",
+         "Lower kunir → PTX text via the upstream `gpu-module-to-binary` "
+         "pass with `format=isa`.  Debug / inspection only — the main "
+         "compile path goes straight to cubin.");
 
   py::class_<kun_cuda::Executable>(m, "Executable")
       .def_property_readonly("input_names",   &kun_cuda::Executable::graphInputs,
@@ -298,12 +273,19 @@ PYBIND11_MODULE(kun_mlir, m) {
          py::arg("target_triple")  = "nvptx64-nvidia-cuda",
          py::arg("target_features") = "",
          py::arg("opt_level")      = 3u,
-         py::arg("size_level")     = 0u,
-         py::arg("ptxas_path")     = "",
-         "Compile a kunir module all the way to a loaded Executable "
-         "(kunir → LLVM dialect → LLVM IR → PTX → CUBIN → cuModuleLoad). "
-         "graph_inputs / graph_outputs name the buffers that flow in/out "
-         "of the whole kernel graph; everything else produced by the "
-         "kernels is treated as an intermediate and gets a runtime-managed "
-         "slot.");
+         py::arg("toolkit_path")   = "",
+         "Compile a kunir module all the way to a loaded Executable.\n"
+         "\n"
+         "Pipeline: kunir → LLVM dialect → upstream `gpu-module-to-binary`\n"
+         "(format=bin) which handles libdevice linking + LLVM optimization\n"
+         "+ PTX emission + ptxas, → cuModuleLoad on the resulting cubin.\n"
+         "\n"
+         "graph_inputs / graph_outputs name the buffers that flow in/out\n"
+         "of the whole kernel graph; everything else produced by the\n"
+         "kernels is treated as an intermediate and gets a runtime-managed\n"
+         "slot.\n"
+         "\n"
+         "toolkit_path: optional path to the CUDA toolkit (where\n"
+         "libdevice.10.bc and ptxas live).  Empty → search CUDA_HOME /\n"
+         "CUDA_PATH / standard install locations.");
 }
diff --git a/mlir/lib/Python/PyModule.cpp b/mlir/lib/Python/PyModule.cpp
new file mode 100644
index 0000000..7089ff9
--- /dev/null
+++ b/mlir/lib/Python/PyModule.cpp
@@ -0,0 +1,101 @@
+//===- PyModule.cpp - dialect / translation / target registration -----===//
+//
+// Everything that touches a specific dialect or translation lives here,
+// not in PyModule.h, so consumers of `class PyModule` only pay for the
+// MLIRContext + ModuleOp typedefs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PyModule.h"
+
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Parser/Parser.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+
+// Required for `gpu-module-to-binary` to dispatch to the NVVM target
+// implementation (libdevice link + LLVM opt + ptxas).
+#include "mlir/Target/LLVM/NVVM/Target.h"
+
+// MLIR → LLVM IR translation registrations consumed by the NVVM target
+// serializer.  Keep the list minimal — `registerAllToLLVMIRTranslations`
+// would force linking ArmSVE / SPIR-V / etc.
+#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunIr/KunIrDialect.h"
+
+#include "llvm/Support/raw_ostream.h"
+
+#include <stdexcept>
+
+namespace kun_mlir_py {
+
+namespace {
+
+mlir::DialectRegistry makeRegistry() {
+  mlir::DialectRegistry registry;
+
+  registry.insert<mlir::arith::ArithDialect>();
+  registry.insert<mlir::cf::ControlFlowDialect>();
+  registry.insert<mlir::func::FuncDialect>();
+  registry.insert<mlir::gpu::GPUDialect>();
+  registry.insert<mlir::index::IndexDialect>();
+  registry.insert<mlir::LLVM::LLVMDialect>();
+  registry.insert<mlir::math::MathDialect>();
+  registry.insert<mlir::NVVM::NVVMDialect>();
+  registry.insert<mlir::scf::SCFDialect>();
+  registry.insert<kunir::KunIrDialect>();
+  registry.insert<kungpu::KunGpuDialect>();
+
+  // Wire up `#nvvm.target`'s serializeToObject impl so
+  // `gpu-module-to-binary` can lower gpu.module → cubin / PTX.
+  mlir::NVVM::registerNVVMTargetInterfaceExternalModels(registry);
+  // ...and the dialect → LLVM IR translation hooks the NVVM target
+  // calls once it has its hands on the gpu.module body.
+  mlir::registerBuiltinDialectTranslation(registry);
+  mlir::registerLLVMDialectTranslation(registry);
+  mlir::registerNVVMDialectTranslation(registry);
+  mlir::registerGPUDialectTranslation(registry);
+  return registry;
+}
+
+} // namespace
+
+PyModule::PyModule()
+    : ctx(std::make_unique<mlir::MLIRContext>(
+          makeRegistry(), mlir::MLIRContext::Threading::DISABLED)) {
+  ctx->loadAllAvailableDialects();
+}
+
+PyModule::~PyModule() = default;
+
+std::unique_ptr<PyModule> PyModule::parse(const std::string &text) {
+  auto pm = std::make_unique<PyModule>();
+  pm->module = mlir::parseSourceString<mlir::ModuleOp>(text, pm->ctx.get());
+  if (!pm->module)
+    throw std::runtime_error("kun_mlir.parse: failed to parse MLIR text");
+  return pm;
+}
+
+std::string PyModule::toString() const {
+  std::string out;
+  llvm::raw_string_ostream os(out);
+  module.get().print(os);
+  os.flush();
+  return out;
+}
+
+} // namespace kun_mlir_py
diff --git a/mlir/lib/Python/PyModule.h b/mlir/lib/Python/PyModule.h
index 58f2c06..5490c01 100644
--- a/mlir/lib/Python/PyModule.h
+++ b/mlir/lib/Python/PyModule.h
@@ -1,86 +1,42 @@
 //===- PyModule.h - PyModule (MLIR ctx + ModuleOp) shared by bindings --===//
 //
 // Used by both MlirBinding.cpp (parse / compile entry points) and
-// IRBuilder.cpp (programmatic construction of a kunir module from Python).
+// IRBuilder.cpp (programmatic construction of a kunir module from
+// Python).  The header is deliberately thin: dialect / translation /
+// target registrations all live in PyModule.cpp so nobody pays for them
+// transitively.
 //
 //===----------------------------------------------------------------------===//
 
 #pragma once
 
-#include <pybind11/pybind11.h>
-
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/DialectRegistry.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/OwningOpRef.h"
-#include "mlir/Parser/Parser.h"
-#include "mlir/Support/LLVM.h"
-
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Index/IR/IndexDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
-#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
-
-#include "KunGpu/KunGpuDialect.h"
-#include "KunIr/KunIrDialect.h"
-
-#include "llvm/Support/raw_ostream.h"
+#include "mlir/IR/BuiltinOps.h"   // mlir::ModuleOp
+#include "mlir/IR/OwningOpRef.h"  // mlir::OwningOpRef
 
 #include <memory>
-#include <stdexcept>
 #include <string>
 
-namespace py = pybind11;
+namespace mlir { class MLIRContext; }
 
 namespace kun_mlir_py {
 
 class PyModule {
 public:
-  PyModule()
-      : ctx(std::make_unique<mlir::MLIRContext>(
-            makeRegistry(), mlir::MLIRContext::Threading::DISABLED)) {
-    ctx->loadAllAvailableDialects();
-  }
-
-  static mlir::DialectRegistry makeRegistry() {
-    mlir::DialectRegistry registry;
-    registry.insert<mlir::arith::ArithDialect>();
-    registry.insert<mlir::cf::ControlFlowDialect>();
-    registry.insert<mlir::func::FuncDialect>();
-    registry.insert<mlir::gpu::GPUDialect>();
-    registry.insert<mlir::index::IndexDialect>();
-    registry.insert<mlir::LLVM::LLVMDialect>();
-    registry.insert<mlir::math::MathDialect>();
-    registry.insert<mlir::NVVM::NVVMDialect>();
-    registry.insert<mlir::scf::SCFDialect>();
-    registry.insert<kunir::KunIrDialect>();
-    registry.insert<kungpu::KunGpuDialect>();
-    return registry;
-  }
+  PyModule();                        // sets up ctx + dialects + registrations
+  ~PyModule();                       // out-of-line so MLIRContext can stay
+                                      // forward-declared in this header
+  PyModule(const PyModule &)            = delete;
+  PyModule &operator=(const PyModule &) = delete;
 
-  static std::unique_ptr<PyModule> parse(const std::string &text) {
-    auto pm = std::make_unique<PyModule>();
-    pm->module = mlir::parseSourceString<mlir::ModuleOp>(text, pm->ctx.get());
-    if (!pm->module)
-      throw std::runtime_error("kun_mlir.parse: failed to parse MLIR text");
-    return pm;
-  }
+  /// Parse an MLIR text fragment into a fresh PyModule.  Throws on
+  /// parse failure.
+  static std::unique_ptr<PyModule> parse(const std::string &text);
 
-  std::string toString() const {
-    std::string out;
-    llvm::raw_string_ostream os(out);
-    module.get().print(os);
-    os.flush();
-    return out;
-  }
+  /// Pretty-print the held module.
+  std::string toString() const;
 
   std::unique_ptr<mlir::MLIRContext> ctx;
-  mlir::OwningOpRef<mlir::ModuleOp> module;
+  mlir::OwningOpRef<mlir::ModuleOp>  module;
 };
 
 } // namespace kun_mlir_py
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index 61821f4..9cd8e08 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """End-to-end test for the `kun_mlir` Python bindings.
 
-  parse → to_string → lower_to_ptx → ptx_to_cubin → compile → launch
+  parse → to_string → lower_to_ptx (debug only) → compile → launch
 
 Usage:
     PATH=$CUDA_BIN:$PATH PYTHONPATH=<build>/mlir/lib/Python \
@@ -39,6 +39,7 @@ def main() -> int:
     import kun_mlir
     import cupy as cp
     import numpy as np
+    from KunQuant.jit.cuda import find_cuda_toolkit
 
     # Force-initialise the CUDA driver + create the primary context now,
     # so subsequent kun_mlir.compile() / Executable.launch() find one.
@@ -51,27 +52,27 @@ def main() -> int:
     assert "kunir.func @test_addsum" in text, "module text missing kunir.func"
     print("ok — module round-trips through parse/to_string")
 
+    toolkit = find_cuda_toolkit()
+
     print()
-    print(f"=== lower_to_ptx (target={args.target}, O3) ===")
-    ptx = kun_mlir.lower_to_ptx(mod, gpu_arch=args.target, opt_level=3)
+    print(f"=== lower_to_ptx (target={args.target}, O3, debug only) ===")
+    # Debug entry point — same lowering pipeline as compile() but stops
+    # at PTX text via gpu-module-to-binary{format=isa}.  Mutates `mod`
+    # (replaces the gpu.module with a gpu.binary), so we re-parse for
+    # the main compile step below.
+    ptx = kun_mlir.lower_to_ptx(mod, gpu_arch=args.target, opt_level=3,
+                                  toolkit_path=toolkit)
     assert "test_addsum" in ptx
     print(f"ok — produced {len(ptx)} bytes of PTX text")
 
-    print()
-    print(f"=== ptx_to_cubin ({args.target}) ===")
-    cubin = kun_mlir.ptx_to_cubin(ptx, gpu_arch=args.target)
-    assert isinstance(cubin, bytes) and cubin[:4] == b"\x7fELF"
-    print(f"ok — produced {len(cubin)} bytes of CUBIN (ELF magic verified)")
-
     print()
     print(f"=== compile (all-in-one) ===")
-    # `mod` was already mutated by lower_to_ptx above; re-parse so compile()
-    # gets a fresh kunir.func module.
     mod2 = kun_mlir.parse(SAMPLE_KUNIR)
     exe = kun_mlir.compile(mod2,
                             graph_inputs=["a", "b"],
                             graph_outputs=["sum"],
-                            gpu_arch=args.target, opt_level=3)
+                            gpu_arch=args.target, opt_level=3,
+                            toolkit_path=toolkit)
     print(f"  kernel_names           = {exe.kernel_names}")
     print(f"  num_kernels            = {exe.num_kernels}")
     print(f"  launch_order           = {exe.launch_order}")
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index 416f492..bd51656 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -5,15 +5,19 @@
 Driver.optimize() pipeline the CPU compileit uses, then compiles to a
 CUDA Executable via kun_mlir and validates against numpy.
 
-Two factors are exercised:
-  * elemwise: out = (a + b) * a - b * b
-  * windowed: ws  = WindowedSum(a + b, N)   (decomposes into
-                ForeachBackWindow + ReduceAdd inside the optimizer pass)
-
-Note: ops that lower to math.absf / math.log / math.copysign aren't
-exercised here yet — the kunir-to-LLVM pipeline doesn't link libdevice,
-so those would end up as unresolved __nv_* externals.  Once the math →
-LLVM intrinsic / libdevice lowering is wired up, swap in Abs/Log/Sign.
+Three factors are exercised:
+  * elemwise:   out = (a + b) * a - b * b           (binary elemwise only;
+                  doesn't touch libdevice)
+  * libdevice:  out = log(abs(a)) * sign(b - a)     (Abs / Log / Sign all
+                  lower to math.* ops that emit __nv_* libdevice externs;
+                  the upstream `gpu-module-to-binary` pass links libdevice
+                  for us, so this works end-to-end)
+  * windowed:   ws  = WindowedSum(a + b, N)         (decomposes into
+                  ForeachBackWindow + ReduceAdd inside the optimizer pass)
+
+The runtime auto-discovers the CUDA toolkit (CUDA_HOME / CUDA_PATH /
+CUDA_TOOLKIT_PATH / CUDA_ROOT or standard install paths).  Override
+with `CudaCompilerConfig(toolkit_path=...)` if needed.
 """
 
 from __future__ import annotations
@@ -23,7 +27,7 @@
 import numpy as np
 
 from KunQuant.Op import Builder, Input, Output
-from KunQuant.ops import Add, Sub, Mul, WindowedSum
+from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum
 from KunQuant.Stage import Function
 from KunQuant.jit.cuda import compileit, CudaCompilerConfig, to_mlir
 
@@ -39,6 +43,18 @@ def build_func_elemwise() -> Function:
     return Function(builder.ops, name="elemwise_kernel")
 
 
+def build_func_libdevice() -> Function:
+    """out = log(abs(a)) * sign(b - a) — exercises Abs/Log/Sign, all of
+    which lower to math.* ops that need libdevice."""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        v = Mul(Log(Abs(a)), Sign(Sub(bin_, a)))
+        Output(v, "out")
+    return Function(builder.ops, name="libdevice_kernel")
+
+
 def build_func_windowed(N: int) -> Function:
     """ws = WindowedSum(a + b, N)"""
     builder = Builder()
@@ -50,13 +66,14 @@ def build_func_windowed(N: int) -> Function:
     return Function(builder.ops, name="windowed_kernel")
 
 
-def run_elemwise(target: str, T: int, S: int) -> int:
-    print("=== elemwise: out = (a+b)*a - b*b ===")
-    f = build_func_elemwise()
+def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
+              atol: float = 1e-5) -> int:
+    """Compile a Function, launch it, validate against numpy."""
+    print(f"=== {label} ===")
+    f = build_fn()
     cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    # Show the IR for sanity — same passes + translator, no compile.
-    mod = to_mlir(build_func_elemwise(), cfg)
+    mod = to_mlir(build_fn(), cfg)
     print("--- mlir ---")
     print(mod.to_string())
 
@@ -74,15 +91,32 @@ def run_elemwise(target: str, T: int, S: int) -> int:
     cp.cuda.runtime.deviceSynchronize()
     out_h = cp.asnumpy(out)
 
-    expected = (a_h + b_h) * a_h - b_h * b_h
-    if not np.allclose(out_h, expected, atol=1e-5):
+    expected = expected_fn(a_h, b_h)
+    if not np.allclose(out_h, expected, atol=atol, equal_nan=True):
         diff = np.abs(out_h - expected)
-        print(f"  FAIL — max abs diff {diff.max()}", file=sys.stderr)
+        print(f"  FAIL — max abs diff {np.nanmax(diff)}", file=sys.stderr)
         return 1
-    print(f"  ok — output matches (a+b)*a - b*b on every cell ({T*S} cells)")
+    print(f"  ok — output matches reference on every cell ({T*S} cells)")
     return 0
 
 
+def run_elemwise(target: str, T: int, S: int) -> int:
+    return _run_one("elemwise: out = (a+b)*a - b*b",
+                     build_func_elemwise,
+                     lambda a, b: (a + b) * a - b * b,
+                     target, T, S)
+
+
+def run_libdevice(target: str, T: int, S: int) -> int:
+    # `sign` differs between MLIR (math.copysign — keeps sign bit incl 0)
+    # and numpy.sign (returns 0 at 0).  With Gaussian inputs the
+    # sign-of-zero case has measure zero, so equality holds.
+    return _run_one("libdevice: out = log(abs(a)) * sign(b - a)",
+                     build_func_libdevice,
+                     lambda a, b: np.log(np.abs(a)) * np.sign(b - a),
+                     target, T, S, atol=1e-4)
+
+
 def run_windowed(target: str, T: int, S: int, N: int) -> int:
     print(f"=== windowed: ws = WindowedSum(a + b, N={N}) ===")
     f = build_func_windowed(N)
@@ -141,6 +175,8 @@ def main() -> int:
     rc = 0
     rc |= run_elemwise(args.target, args.time_length, args.num_stocks)
     print()
+    rc |= run_libdevice(args.target, args.time_length, args.num_stocks)
+    print()
     rc |= run_windowed(args.target, args.time_length, args.num_stocks, args.window)
     return rc
 
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
index bb2e6c9..d4d5af2 100644
--- a/mlir/test/python/test_multi_kernel.py
+++ b/mlir/test/python/test_multi_kernel.py
@@ -54,6 +54,7 @@ def main() -> int:
 
     import kun_mlir
     import cupy as cp
+    from KunQuant.jit.cuda import find_cuda_toolkit
 
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
@@ -63,7 +64,8 @@ def main() -> int:
     exe = kun_mlir.compile(mod,
                             graph_inputs=["a", "b", "c"],
                             graph_outputs=["out"],
-                            gpu_arch=args.target, opt_level=3)
+                            gpu_arch=args.target, opt_level=3,
+                            toolkit_path=find_cuda_toolkit())
 
     print(f"  kernel_names           = {exe.kernel_names}")
     print(f"  num_kernels            = {exe.num_kernels}")
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index 0545023..020fc0a 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -83,6 +83,7 @@ def run_one(N: int, expected_placement: str, target: str,
               T: int = 64, S: int = 2048) -> int:
     import kun_mlir
     import cupy as cp
+    from KunQuant.jit.cuda import find_cuda_toolkit
 
     print(f"=== N = {N}  ({expected_placement} temp buffer) ===")
     assert_planning(N, warps_per_cta, smem_size, expected_placement)
@@ -92,7 +93,8 @@ def run_one(N: int, expected_placement: str, target: str,
     exe = kun_mlir.compile(mod,
                             graph_inputs=["a", "b"],
                             graph_outputs=["out"],
-                            gpu_arch=target, opt_level=3)
+                            gpu_arch=target, opt_level=3,
+                            toolkit_path=find_cuda_toolkit())
     print(f"  kernels={exe.kernel_names}  warps_per_cta={exe.warps_per_cta}  "
            f"vector_size={exe.vector_size}  cubin={len(exe.cubin)} bytes")
 

From eab54af57721ffa02652a3b5e57721fdd0f7249b Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Fri, 8 May 2026 01:08:37 -0700
Subject: [PATCH 13/59] backref, fast windowed sum

---
 KunQuant/jit/cuda.py                 |   4 -
 KunQuant/passes/CodegenMLIR.py       |   6 +
 mlir/include/KunIr/KunIrOps.td       |  55 ++++++++
 mlir/lib/KunGpu/KunGpuToLLVM.cpp     | 187 +++++++++++++++++++++++++++
 mlir/lib/KunIr/KunIrOps.cpp          |  48 +++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp     |  32 +++++
 mlir/lib/Python/IRBuilder.cpp        |  19 +++
 mlir/test/python/test_kun_to_cuda.py | 107 +++++++++++++++
 8 files changed, 454 insertions(+), 4 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index b1bae78..7a81bf1 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -125,10 +125,6 @@ def _gpu_pass_options(cfg: CudaCompilerConfig) -> dict:
         # don't map cleanly onto the GPU primitives we lower today.
         # Keep it off until the corresponding kunir lowerings exist.
         "no_fast_stat":   True,
-        # opt_reduce rewrites WindowedSum etc. into the stateful
-        # FastWindowedSum op, which kunir doesn't have a counterpart
-        # for yet — keep the canonical ForeachBackWindow + Reduce shape.
-        "opt_reduce":     False,
     }
     if cfg.options:
         opts.update(cfg.options)
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 7e24066..938f8d5 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -30,6 +30,7 @@
 from KunQuant.ops.ReduceOp import (
     ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
 )
+from KunQuant.ops.MiscOp import BackRef, FastWindowedSum
 from KunQuant.Stage import Function
 
 
@@ -91,6 +92,11 @@ def _emit_simple(op: OpBase, ir, val_map: Dict[OpBase, object]):
     if isinstance(op, WindowedTempOutput):
         return ir.windowed_output(val_map[op.inputs[0]],
                                     int(op.attrs["window"]))
+    if isinstance(op, BackRef):
+        return ir.back_ref(val_map[op.inputs[0]], int(op.attrs["window"]))
+    if isinstance(op, FastWindowedSum):
+        return ir.fast_windowed_sum(val_map[op.inputs[0]],
+                                      int(op.attrs["window"]))
     raise NotImplementedError(
         f"CodegenMLIR: op type {cls.__name__} is not supported by the "
         f"GPU backend yet (op = {op})")
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index e07572f..4dcd4d2 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -221,6 +221,61 @@ def KunIr_ReduceMinOp : KunIr_ReduceOp<"reduce_min"> {
   let summary = "Minimum reduction over the back window";
 }
 
+//===----------------------------------------------------------------------===//
+// BackRef — read input value at t - window
+//
+// At time step t, returns input[t - window]; semantically NaN when t < window
+// (the upstream pipeline writes the high time-domain region first, so cells
+// at t < window should be ignored by the user).
+//
+// The op carries through `kunir-to-kungpu` in two forms:
+//   * Source kunir:  result = !kunir.ts<T, 1>
+//   * Lowered form:  result = T (scalar) — a single ts.get on the ts handle
+//
+// Constraints:
+//   * input.maxLookback >= window + 1   (or inf — function args)
+//   * window > 0
+//   * if result is ts: must be ts<input.elemType, 1>
+//   * if result is scalar: must equal input.elemType
+//===----------------------------------------------------------------------===//
+
+def KunIr_BackRefOp : KunIr_Op<"back_ref", [Pure]> {
+  let summary = "Reference the input value `window` time steps in the past";
+  let arguments = (ins KunIr_AnyTs:$input, I64Attr:$window);
+  let results = (outs AnyType:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$input `[` `window` `=` $window `]` `:` type($input) `->` type($result) attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// FastWindowedSum — running sum over the trailing `window` time steps
+//
+// Equivalent to KunQuant's CPU `FastWindowedSum<T, stride, window>` (see
+// cpp/Kun/Ops.hpp:325).  Per time step:
+//
+//     v_t = v_{t-1}                              (init 0, Kahan-corrected)
+//         - input[t-window]   if input[t-window] not NaN
+//         + input[t]          if input[t]        not NaN
+//     out_t = (num_nans == 0) ? v_t : NaN        (num_nans starts at window)
+//
+// The op is preserved past `kunir-to-kungpu` (with ts→scalar result rewrite)
+// because its 4-element per-thread state (v, compAdd, compSub, num_nans) is
+// most naturally materialised as LLVM allocas in the kungpu-to-llvm pass.
+// See FastWindowedSumPattern there.
+//
+// Constraints (same as BackRef): input.maxLookback >= window + 1, window > 0.
+//===----------------------------------------------------------------------===//
+
+def KunIr_FastWindowedSumOp : KunIr_Op<"fast_windowed_sum", [Pure]> {
+  let summary = "Stateful rolling-window sum with Kahan compensation + NaN tracking";
+  let arguments = (ins KunIr_AnyTs:$input, I64Attr:$window);
+  let results = (outs AnyType:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$input `[` `window` `=` $window `]` `:` type($input) `->` type($result) attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // FuncOp — function with named I/O and hardware target spec
 //
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 175a4aa..88cd177 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -499,6 +499,191 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
   }
 };
 
+//===----------------------------------------------------------------------===//
+// FastWindowedSum — running sum with Kahan compensation + NaN tracking.
+//
+// Per-thread state (4 cells, alloca'd at function entry, promoted to
+// registers by mem2reg):
+//   v             — running sum                                    (T)
+//   compAdd       — Kahan compensation for the +cur step           (T)
+//   compSub       — Kahan compensation for the -old step           (T)
+//   numNans       — NaNs inside the trailing-N window; init N, so out is NaN until t = N-1 (i32)
+//
+// Algorithm — direct port of cpp/Kun/Ops.hpp::FastWindowedSum::step:
+//
+//   cur = input[t]                                                 ts.get  off=0
+//   old = (t >= window) ? input[t - window] : NaN                  ts.get  off=window  (guarded)
+//   old_is_nan = isnan(old)
+//   new_is_nan = isnan(cur)
+//   v = old_is_nan ? v : kahanAdd(v, -old, &compSub)               // subtract old
+//   v = new_is_nan ? v : kahanAdd(v, +cur, &compAdd)               // add cur
+//   numNans += (new_is_nan ? 1 : 0) - (old_is_nan ? 1 : 0)
+//   out = (numNans == 0) ? v : NaN
+//
+// The `t >= window` guard on `old` matches CPU's
+// `windowedRef`/`getWindow` which return NaN for index < window.
+// Without it, a function-arg gmem load at offset > t can fall before the
+// allocation start and segfault on some drivers.
+//===----------------------------------------------------------------------===//
+
+struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(FastWindowedSumOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx    = op.getContext();
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
+
+    auto resultTy = op.getResult().getType();
+    auto floatTy  = llvm::dyn_cast<FloatType>(resultTy);
+    if (!floatTy)
+      return rewriter.notifyMatchFailure(
+          op, "fast_windowed_sum result must be a scalar float "
+              "(post kunir-to-kungpu lowering)");
+
+    int64_t window = op.getWindow();
+    Value origInput = op.getInput();
+    // Checked before any IR is created so a failed match leaves no new ops.
+    Value timeIdx = getCurrentTimeIdx(op);
+    if (!timeIdx)
+      return rewriter.notifyMatchFailure(
+          op, "fast_windowed_sum must be inside a scf.for time loop");
+
+    // ── 1. Allocate state at function entry + initialise. ──────────
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    if (!fn)
+      return rewriter.notifyMatchFailure(
+          op, "fast_windowed_sum must be inside a gpu.func");
+
+    Value vPtr, addPtr, subPtr, nansPtr;
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      Block &entry = fn.getBody().front();
+      rewriter.setInsertionPointToStart(&entry);
+      Value c1_i32 = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value zeroF = rewriter.create<LLVM::ConstantOp>(
+          loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
+      Value windowI32 = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(window));
+
+      vPtr    = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, floatTy, c1_i32);
+      addPtr  = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, floatTy, c1_i32);
+      subPtr  = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, floatTy, c1_i32);
+      nansPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, i32Ty,   c1_i32);
+
+      rewriter.create<LLVM::StoreOp>(loc, zeroF,     vPtr);
+      rewriter.create<LLVM::StoreOp>(loc, zeroF,     addPtr);
+      rewriter.create<LLVM::StoreOp>(loc, zeroF,     subPtr);
+      rewriter.create<LLVM::StoreOp>(loc, windowI32, nansPtr);
+    }
+
+    // ── 2. Read cur (off=0) and old (off=window, guarded). ─────────
+    Value zeroOff   = rewriter.create<arith::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value windowOff = rewriter.create<arith::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(window));
+    Value cur = rewriter.create<TsGetOp>(loc, floatTy, origInput, zeroOff);
+
+    Value windowIdx  = rewriter.create<arith::ConstantIndexOp>(loc, window);
+    Value tGeWindow  = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::sge, timeIdx, windowIdx);
+
+    auto ifOp = rewriter.create<scf::IfOp>(
+        loc, TypeRange{floatTy}, tGeWindow, /*withElseRegion=*/true);
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      Value loaded =
+          rewriter.create<TsGetOp>(loc, floatTy, origInput, windowOff);
+      rewriter.create<scf::YieldOp>(loc, loaded);
+    }
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front());
+      Value nanV = rewriter.create<LLVM::ConstantOp>(
+          loc, floatTy,
+          rewriter.getFloatAttr(
+              floatTy, std::numeric_limits<double>::quiet_NaN()));
+      rewriter.create<scf::YieldOp>(loc, nanV);
+    }
+    Value old = ifOp.getResult(0);
+
+    // ── 3. Algorithm step.  All arith is via LLVM ops at this phase. ──
+    auto fcmp_isnan = [&](Value x) {
+      // isnan(x) ⇔ x != x  (UNE catches NaN, == NaN is false)
+      return rewriter.create<LLVM::FCmpOp>(loc, LLVM::FCmpPredicate::une, x, x);
+    };
+    Value oldIsNan = fcmp_isnan(old);
+    Value newIsNan = fcmp_isnan(cur);
+
+    // Loaded state.
+    Value v       = rewriter.create<LLVM::LoadOp>(loc, floatTy, vPtr);
+    Value compAdd = rewriter.create<LLVM::LoadOp>(loc, floatTy, addPtr);
+    Value compSub = rewriter.create<LLVM::LoadOp>(loc, floatTy, subPtr);
+    Value numNans = rewriter.create<LLVM::LoadOp>(loc, i32Ty,   nansPtr);
+
+    Value zeroF = rewriter.create<LLVM::ConstantOp>(
+        loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
+
+    // kahanAdd(isnan_small, sum, small, &comp):
+    //   y = small - comp;  t = sum + y;
+    //   newComp = (t - sum) - y;
+    //   comp = isnan_small ? comp : newComp;
+    //   return t
+    auto kahanAdd = [&](Value isnan_small, Value sum, Value small, Value &comp) {
+      Value y     = rewriter.create<LLVM::FSubOp>(loc, small, comp);
+      Value t     = rewriter.create<LLVM::FAddOp>(loc, sum, y);
+      Value tMs   = rewriter.create<LLVM::FSubOp>(loc, t, sum);
+      Value newC  = rewriter.create<LLVM::FSubOp>(loc, tMs, y);
+      comp = rewriter.create<LLVM::SelectOp>(loc, isnan_small, comp, newC);
+      return t;
+    };
+
+    // v -= old  (skip when old is NaN)
+    Value negOld = rewriter.create<LLVM::FSubOp>(loc, zeroF, old);
+    Value tSub   = kahanAdd(oldIsNan, v, negOld, compSub);
+    v = rewriter.create<LLVM::SelectOp>(loc, oldIsNan, v, tSub);
+
+    // v += cur  (skip when cur is NaN)
+    Value tAdd   = kahanAdd(newIsNan, v, cur, compAdd);
+    v = rewriter.create<LLVM::SelectOp>(loc, newIsNan, v, tAdd);
+
+    // numNans += (new_is_nan ? 1 : 0) - (old_is_nan ? 1 : 0)
+    Value oneI32  = rewriter.create<LLVM::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value zeroI32 = rewriter.create<LLVM::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value oldDelta = rewriter.create<LLVM::SelectOp>(
+        loc, oldIsNan, oneI32, zeroI32);
+    Value newDelta = rewriter.create<LLVM::SelectOp>(
+        loc, newIsNan, oneI32, zeroI32);
+    numNans = rewriter.create<LLVM::SubOp>(loc, numNans, oldDelta);
+    numNans = rewriter.create<LLVM::AddOp>(loc, numNans, newDelta);
+
+    // result = (numNans == 0) ? v : NaN
+    Value isFull = rewriter.create<LLVM::ICmpOp>(
+        loc, LLVM::ICmpPredicate::eq, numNans, zeroI32);
+    Value nanV = rewriter.create<LLVM::ConstantOp>(
+        loc, floatTy,
+        rewriter.getFloatAttr(floatTy,
+                                std::numeric_limits<double>::quiet_NaN()));
+    Value out = rewriter.create<LLVM::SelectOp>(loc, isFull, v, nanV);
+
+    // ── 4. Store back state. ────────────────────────────────────────
+    rewriter.create<LLVM::StoreOp>(loc, v,       vPtr);
+    rewriter.create<LLVM::StoreOp>(loc, compAdd, addPtr);
+    rewriter.create<LLVM::StoreOp>(loc, compSub, subPtr);
+    rewriter.create<LLVM::StoreOp>(loc, numNans, nansPtr);
+
+    rewriter.replaceOp(op, out);
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Pass
 //===----------------------------------------------------------------------===//
@@ -538,6 +723,7 @@ struct ConvertKunGpuToLLVMPass
     target.addLegalOp<ModuleOp, UnrealizedConversionCastOp>();
     target.addIllegalOp<WindowedTempOp, TsGetOp, TsPutOp,
                         TimeLengthOp, StockIdOp, BlockStockCountOp>();
+    target.addIllegalOp<kunir::FastWindowedSumOp>();
     // gpu.func is legal only after its signature has been converted from
     // (...kunir.ts) to (...!llvm.ptr) by the FunctionOpInterface pattern
     // we register below.
@@ -557,6 +743,7 @@ struct ConvertKunGpuToLLVMPass
         typeConv, ctx);
     patterns.add<WindowedTempPattern>(typeConv, ctx, descMap, smemCounter);
     patterns.add<TsGetPattern, TsPutPattern>(typeConv, ctx, descMap);
+    patterns.add<FastWindowedSumPattern>(typeConv, ctx);
 
     if (failed(applyPartialConversion(module, target, std::move(patterns))))
       signalPassFailure();
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 638ae4f..2dd3402 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -117,6 +117,54 @@ LogicalResult ReduceMulOp::verify() { return verifyInsideForEachBackWindow(*this
 LogicalResult ReduceMaxOp::verify() { return verifyInsideForEachBackWindow(*this); }
 LogicalResult ReduceMinOp::verify() { return verifyInsideForEachBackWindow(*this); }
 
+//===----------------------------------------------------------------------===//
+// BackRef + FastWindowedSum — share a verifier (same shape / constraints)
+//===----------------------------------------------------------------------===//
+
+static LogicalResult
+verifyWindowedScalarOrTsResultOp(Operation *op, Value input, int64_t window,
+                                  Type resultTy) {
+  auto inputTy = llvm::cast<TsType>(input.getType());
+  if (window <= 0)
+    return op->emitOpError("window must be positive, got ") << window;
+
+  // Need both the current value and the value `window` steps back, so the
+  // input must retain at least `window + 1` time steps.
+  uint64_t need = static_cast<uint64_t>(window) + 1;
+  uint64_t have = inputTy.getMaxLookback();
+  if (have != kInfLookback && have < need)
+    return op->emitOpError("input.maxLookback (")
+           << have << ") must be >= window+1 (" << need << ")";
+
+  // Result type: either ts<inputElemType, 1> (source form) or the input's
+  // element type itself (lowered form, after kunir-to-kungpu).
+  Type elemTy = inputTy.getElementType();
+  if (auto resTs = llvm::dyn_cast<TsType>(resultTy)) {
+    if (resTs.getElementType() != elemTy)
+      return op->emitOpError("result element type '")
+             << resTs.getElementType()
+             << "' must match input element type '" << elemTy << "'";
+    if (resTs.getMaxLookback() != 1)
+      return op->emitOpError("result maxLookback must be 1, got ")
+             << resTs.getMaxLookback();
+    return success();
+  }
+  if (resultTy != elemTy)
+    return op->emitOpError(
+               "scalar result type must equal input element type '")
+           << elemTy << "', got '" << resultTy << "'";
+  return success();
+}
+
+LogicalResult BackRefOp::verify() {
+  return verifyWindowedScalarOrTsResultOp(*this, getInput(), getWindow(),
+                                            getResult().getType());
+}
+LogicalResult FastWindowedSumOp::verify() {
+  return verifyWindowedScalarOrTsResultOp(*this, getInput(), getWindow(),
+                                            getResult().getType());
+}
+
 //===----------------------------------------------------------------------===//
 // ForEachBackWindowOp — verifier + custom assembly format
 //
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 0e7316c..785912f 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -318,6 +318,38 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       return success();
     }
 
+    // back_ref → ts.get(handle, offset = window).  Stateless, so we can
+    // fully lower here (the op does not survive into the kungpu IR).
+    if (auto br = dyn_cast<BackRefOp>(op)) {
+      auto inputTs = llvm::cast<TsType>(br.getInput().getType());
+      auto inputIt = tsMap.find(br.getInput());
+      if (inputIt == tsMap.end() || inputIt->second.kind != TsKind::Handle)
+        return op.emitError(
+            "kunir-to-kungpu: back_ref input must be a ts handle");
+      Value offset = fb.create<arith::ConstantOp>(
+          ol, fb.getI32Type(), fb.getI32IntegerAttr(br.getWindow()));
+      Value scalar = fb.create<TsGetOp>(ol, inputTs.getElementType(),
+                                          inputIt->second.value, offset);
+      tsMap[br.getResult()] = {TsKind::Scalar, scalar};
+      return success();
+    }
+
+    // fast_windowed_sum → preserved as a kunir op with scalar result and
+    // ts-handle input.  The kungpu-to-llvm pass owns the actual lowering
+    // (per-thread state allocas + the Kahan-corrected step).
+    if (auto fws = dyn_cast<FastWindowedSumOp>(op)) {
+      auto inputTs = llvm::cast<TsType>(fws.getInput().getType());
+      auto inputIt = tsMap.find(fws.getInput());
+      if (inputIt == tsMap.end() || inputIt->second.kind != TsKind::Handle)
+        return op.emitError(
+            "kunir-to-kungpu: fast_windowed_sum input must be a ts handle");
+      auto newOp = fb.create<FastWindowedSumOp>(
+          ol, /*resultType=*/inputTs.getElementType(),
+          /*input=*/inputIt->second.value, fws.getWindowAttr());
+      tsMap[fws.getResult()] = {TsKind::Scalar, newOp.getResult()};
+      return success();
+    }
+
     if (isa<CsRankOp>(op)) {
       return op.emitError("kunir-to-kungpu: cs_rank lowering not yet implemented");
     }
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 8df163c..1ad87b4 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -150,6 +150,19 @@ class IRBuilder {
                                                 length);
   }
 
+  // ── Back-reference + Fast windowed sum (high-level: ts → ts<T,1>) ─
+  Value backRefOp(Value x, int64_t window) {
+    auto inTs = llvm::cast<kunir::TsType>(x.getType());
+    auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(), 1);
+    return b_.create<kunir::BackRefOp>(b_.getUnknownLoc(), resultTy, x, window);
+  }
+  Value fastWindowedSumOp(Value x, int64_t window) {
+    auto inTs = llvm::cast<kunir::TsType>(x.getType());
+    auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(), 1);
+    return b_.create<kunir::FastWindowedSumOp>(b_.getUnknownLoc(), resultTy, x,
+                                                 window);
+  }
+
   // ── For-each-back-window region ───────────────────────────────────
   std::vector<Value>
   beginForEachBackWindow(std::vector<Value> inputs, int64_t window,
@@ -305,6 +318,12 @@ void registerIRBuilder(py::module &m) {
       .def("windowed_output", &IRBuilder::windowedOutputOp,
             py::arg("x"), py::arg("length"))
 
+      // Back-reference + Fast windowed sum
+      .def("back_ref",          &IRBuilder::backRefOp,
+            py::arg("x"), py::arg("window"))
+      .def("fast_windowed_sum", &IRBuilder::fastWindowedSumOp,
+            py::arg("x"), py::arg("window"))
+
       // Loop
       .def("begin_for_each_back_window", &IRBuilder::beginForEachBackWindow,
             py::arg("inputs"), py::arg("window"), py::arg("result_types"),
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index bd51656..ae7a3e0 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -28,6 +28,7 @@
 
 from KunQuant.Op import Builder, Input, Output
 from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum
+from KunQuant.ops.MiscOp import BackRef, FastWindowedSum
 from KunQuant.Stage import Function
 from KunQuant.jit.cuda import compileit, CudaCompilerConfig, to_mlir
 
@@ -66,6 +67,28 @@ def build_func_windowed(N: int) -> Function:
     return Function(builder.ops, name="windowed_kernel")
 
 
+def build_func_backref(N: int) -> Function:
+    """out = BackRef(a + b, N) — value of (a+b) at time t-N"""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        Output(BackRef(Add(a, bin_), N), "out")
+    return Function(builder.ops, name="backref_kernel")
+
+
+def build_func_fastwindowedsum(N: int) -> Function:
+    """ws = FastWindowedSum(a + b, N) — same windowed-sum semantics as
+    WindowedSum, but uses the stateful Kahan-corrected algorithm from
+    cpp/Kun/Ops.hpp."""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        Output(FastWindowedSum(Add(a, bin_), N), "ws")
+    return Function(builder.ops, name="fastwindowedsum_kernel")
+
+
 def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
               atol: float = 1e-5) -> int:
     """Compile a Function, launch it, validate against numpy."""
@@ -117,6 +140,85 @@ def run_libdevice(target: str, T: int, S: int) -> int:
                      target, T, S, atol=1e-4)
 
 
+def run_backref(target: str, T: int, S: int, N: int) -> int:
+    print(f"=== backref: out = (a+b)[t - {N}] ===")
+    f = build_func_backref(N)
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    mod = to_mlir(build_func_backref(N), cfg)
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(2)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "out": out})
+    cp.cuda.runtime.deviceSynchronize()
+    out_h = cp.asnumpy(out)
+
+    # Reference: out[t] = (a+b)[t-N] for t >= N; undefined for t < N.
+    c = a_h + b_h
+    diff = np.abs(out_h[N:] - c[: T - N])
+    max_abs = float(diff.max())
+    if not max_abs <= 1e-5:  # negated so a NaN diff (NaN > x is False) fails
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max abs diff {max_abs} at {idx}", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} on the {T - N} valid time steps")
+    return 0
+
+
+def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
+    print(f"=== fast_windowed_sum: ws = FastWindowedSum(a + b, N={N}) ===")
+    f = build_func_fastwindowedsum(N)
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    mod = to_mlir(build_func_fastwindowedsum(N), cfg)
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(3)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "ws": out})
+    cp.cuda.runtime.deviceSynchronize()
+    out_h = cp.asnumpy(out)
+
+    # Reference matches WindowedSum (same window, no NaN inputs).
+    c = a_h + b_h
+    cumsum = np.cumsum(c, axis=0, dtype=np.float64)
+    expected = np.empty((T, S), dtype=np.float32)
+    expected[:N - 1] = np.nan
+    expected[N - 1] = cumsum[N - 1]
+    if T > N:
+        expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+
+    diff = np.abs(out_h[N - 1:] - expected[N - 1:])
+    max_abs = float(diff.max())
+    atol = max(1e-3, 5e-7 * N)
+    if not max_abs <= atol:  # negated so a NaN diff (NaN > x is False) fails
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e} at {idx}",
+                file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e})")
+    return 0
+
+
 def run_windowed(target: str, T: int, S: int, N: int) -> int:
     print(f"=== windowed: ws = WindowedSum(a + b, N={N}) ===")
     f = build_func_windowed(N)
@@ -178,6 +280,11 @@ def main() -> int:
     rc |= run_libdevice(args.target, args.time_length, args.num_stocks)
     print()
     rc |= run_windowed(args.target, args.time_length, args.num_stocks, args.window)
+    print()
+    rc |= run_backref(args.target, args.time_length, args.num_stocks, args.window)
+    print()
+    rc |= run_fastwindowedsum(args.target, args.time_length, args.num_stocks,
+                                args.window)
     return rc
 
 

From ac21bdd594458cb19b48e581d4aba6f7fa61c857 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Fri, 8 May 2026 01:43:57 -0700
Subject: [PATCH 14/59] rename library

---
 KunQuant/jit/cuda.py                   | 22 +++++++++++-----------
 KunQuant/passes/CodegenMLIR.py         |  8 ++++----
 mlir/lib/KunCuda/CMakeLists.txt        |  6 +++---
 mlir/lib/Python/CMakeLists.txt         | 17 ++++++++++-------
 mlir/lib/Python/IRBuilder.cpp          |  8 ++++----
 mlir/lib/Python/IRBuilder.h            |  4 ++--
 mlir/lib/Python/MlirBinding.cpp        | 18 +++++++++---------
 mlir/lib/Python/PyModule.cpp           |  2 +-
 mlir/test/python/test_kun_mlir.py      | 14 +++++++-------
 mlir/test/python/test_kun_to_cuda.py   |  2 +-
 mlir/test/python/test_multi_kernel.py  |  6 +++---
 mlir/test/python/test_windowed_temp.py |  6 +++---
 12 files changed, 58 insertions(+), 55 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 7a81bf1..7d42a52 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -1,7 +1,7 @@
 """GPU JIT entry point for KunQuant.
 
 Mirror of `KunQuant.jit.cfake.compileit` but targets a CUDA backend
-through the kun_mlir / kunir pipeline.  Reuses the existing Driver pass
+through the KunMLIR / kunir pipeline.  Reuses the existing Driver pass
 list (`Driver.optimize`) so any IR rewrites the CPU path benefits from
 also apply here — only the codegen layer is replaced.
 
@@ -25,7 +25,7 @@
 from dataclasses import dataclass
 from typing import Optional
 
-import kun_mlir
+from KunQuant.jit import KunMLIR
 
 from KunQuant.Driver import optimize
 from KunQuant.Stage import Function
@@ -139,8 +139,8 @@ def _to_dtype_token(dtype: str) -> str:
                        f"lowers float on GPU)")
 
 
-def compileit(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.Executable:
-    """Compile a single KunQuant Function to a GPU `kun_mlir.Executable`.
+def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
+    """Compile a single KunQuant Function to a GPU `KunMLIR.Executable`.
 
     The Function is mutated in place by Driver.optimize() (same as the
     CPU path).  Inputs/Outputs declared via `Input(name)` / `Output(...,
@@ -161,18 +161,18 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.Executable:
     options = _gpu_pass_options(cfg)
     optimize(f, options)
 
-    # 2.  Translate the post-optimize IR to a kun_mlir module.
+    # 2.  Translate the post-optimize IR to a KunMLIR module.
     target = TargetSpec(occupancy=cfg.occupancy,
                           warps_per_cta=cfg.warps_per_cta,
                           smem_size=cfg.smem_size,
                           vector_size=cfg.vector_size)
-    ir = kun_mlir.IRBuilder()
+    ir = KunMLIR.IRBuilder()
     in_names, out_names = translate_function(
         f, target, ir, dtype=_to_dtype_token(cfg.dtype))
     mod = ir.finish()
 
-    # 3.  Hand off to the kun_mlir compile pipeline.
-    return kun_mlir.compile(
+    # 3.  Hand off to the KunMLIR compile pipeline.
+    return KunMLIR.compile(
         mod,
         graph_inputs=in_names,
         graph_outputs=out_names,
@@ -182,15 +182,15 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.Executable:
     )
 
 
-def to_mlir(f: Function, cfg: CudaCompilerConfig) -> kun_mlir.ModuleOp:
+def to_mlir(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
     """Run the same passes + translator as `compileit`, but return the
-    kun_mlir module before PTX/CUBIN.  Useful for debugging the IR."""
+    KunMLIR module before PTX/CUBIN.  Useful for debugging the IR."""
     options = _gpu_pass_options(cfg)
     optimize(f, options)
     target = TargetSpec(occupancy=cfg.occupancy,
                           warps_per_cta=cfg.warps_per_cta,
                           smem_size=cfg.smem_size,
                           vector_size=cfg.vector_size)
-    ir = kun_mlir.IRBuilder()
+    ir = KunMLIR.IRBuilder()
     translate_function(f, target, ir, dtype=_to_dtype_token(cfg.dtype))
     return ir.finish()
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 938f8d5..a6dcae8 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -1,9 +1,9 @@
-"""Translate a (post-optimize) KunQuant Function into a kun_mlir module
+"""Translate a (post-optimize) KunQuant Function into a KunMLIR module
 holding a single kunir.func inside a gpu.module.
 
 This is the GPU-side counterpart to passes.CodegenCpp.codegen_cpp; it
 runs after the same Driver.optimize() pipeline the CPU path uses, then
-walks the lowered IR and emits kunir ops via the kun_mlir.IRBuilder
+walks the lowered IR and emits kunir ops via the KunMLIR.IRBuilder
 pybind class.
 
 Scope (v0): only the ops kunir currently supports.
@@ -119,10 +119,10 @@ def _emit_reduction(op: ReductionOp, ir, val_map: Dict[OpBase, object]):
 
 def translate_function(f: Function, target: TargetSpec, ir,
                         dtype: str = "f32"):
-    """Emit `f` as a single kunir.func into the open `ir` (kun_mlir.IRBuilder).
+    """Emit `f` as a single kunir.func into the open `ir` (KunMLIR.IRBuilder).
 
     Returns the list of (input_name, output_name) declared on the func,
-    so the caller can pass them straight to kun_mlir.compile() as
+    so the caller can pass them straight to KunMLIR.compile() as
     graph_inputs / graph_outputs.
     """
     # 1.  Boundary ops in topo order — the kunir.func's I/O.
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index 7e1b995..245d10c 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -7,17 +7,17 @@ add_library(KunCudaRuntime SHARED Runtime.cpp)
 
 # Project-wide compile flags set -fvisibility=hidden + inlines-hidden to
 # minimise the size of MLIR static libs.  This shared runtime needs to
-# export its public class methods so downstream .so's (kun_mlir, host
+# export its public class methods so downstream .so's (KunMLIR, host
 # runners, …) can resolve them at load time.
 #
-# We also put the .so next to the kun_mlir Python module, mirroring the
+# We also put the .so next to the KunMLIR Python module, mirroring the
 # existing project pattern (INSTALL_RPATH=$ORIGIN at the top level): all
 # co-distributed shared libs live in one directory and find each other
 # as siblings.
 set_target_properties(KunCudaRuntime PROPERTIES
     CXX_VISIBILITY_PRESET default
     VISIBILITY_INLINES_HIDDEN OFF
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/mlir/python")
+    LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/KunQuant/jit")
 
 target_include_directories(KunCudaRuntime PUBLIC
     "${PROJECT_SOURCE_DIR}/mlir/include")
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index 3372a56..b19012c 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -10,19 +10,22 @@
 string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
 string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}")
 
-pybind11_add_module(kun_mlir SHARED
+pybind11_add_module(KunMLIR SHARED
   MlirBinding.cpp
   IRBuilder.cpp
   PyModule.cpp
 )
 
-# Co-locate the binding with libKunCudaRuntime.so so $ORIGIN
-# (CMAKE_INSTALL_RPATH set at top level) resolves dependencies as
-# siblings — same pattern as KunRunner ↔ KunRuntime.
-set_target_properties(kun_mlir PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/mlir/python")
+# Drop the .so directly into the source tree's KunQuant/jit/ so it
+# imports as `KunQuant.jit.KunMLIR` (alongside cuda.py) without any
+# PYTHONPATH gymnastics.  libKunCudaRuntime.so is co-located there too
+# (see mlir/lib/KunCuda/CMakeLists.txt) so the $ORIGIN rpath resolves
+# the sibling at load time — same pattern KunRunner ↔ KunRuntime use
+# under KunQuant/runner/.
+set_target_properties(KunMLIR PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/KunQuant/jit")
 
-target_link_libraries(kun_mlir PRIVATE
+target_link_libraries(KunMLIR PRIVATE
   # Compiler side
   MLIRKunIrDialect
   MLIRKunGpuDialect
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 1ad87b4..03d9c43 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -41,7 +41,7 @@ class IRBuilder {
     Location loc = b_.getUnknownLoc();
     pm_->module = OwningOpRef<ModuleOp>(ModuleOp::create(loc));
     b_.setInsertionPointToEnd(pm_->module.get().getBody());
-    // One gpu.module per IRBuilder — kun_mlir's pipeline expects exactly
+    // One gpu.module per IRBuilder — KunMLIR's pipeline expects exactly
     // one container for all kunir.func ops.
     gpuMod_ = b_.create<gpu::GPUModuleOp>(loc, "kungpu_kernels");
     b_.setInsertionPointToStart(&gpuMod_.getBodyRegion().front());
@@ -272,11 +272,11 @@ void registerIRBuilder(py::module &m) {
   // identity / repr.  They live as long as the IRBuilder + its resulting
   // PyModule.
   py::class_<Value>(m, "Value")
-      .def("__repr__", [](Value v) { return "<kun_mlir.Value " + valueRepr(v) + ">"; })
+      .def("__repr__", [](Value v) { return "<KunMLIR.Value " + valueRepr(v) + ">"; })
       .def("__str__",  [](Value v) { return valueRepr(v); });
 
   py::class_<Type>(m, "Type")
-      .def("__repr__", [](Type t) { return "<kun_mlir.Type " + typeRepr(t) + ">"; })
+      .def("__repr__", [](Type t) { return "<KunMLIR.Type " + typeRepr(t) + ">"; })
       .def("__str__",  [](Type t) { return typeRepr(t); });
 
   py::class_<IRBuilder>(m, "IRBuilder",
@@ -345,7 +345,7 @@ void registerIRBuilder(py::module &m) {
             "Print the module under construction (for debugging — does "
             "not consume the builder).")
       .def("finish", &IRBuilder::finish,
-            "Hand off the module to a kun_mlir.ModuleOp.  Builder is "
+            "Hand off the module to a KunMLIR.ModuleOp.  Builder is "
             "consumed.");
 }
 
diff --git a/mlir/lib/Python/IRBuilder.h b/mlir/lib/Python/IRBuilder.h
index b8c17ae..6fcda73 100644
--- a/mlir/lib/Python/IRBuilder.h
+++ b/mlir/lib/Python/IRBuilder.h
@@ -4,13 +4,13 @@
 // codegen pass) can emit kunir ops without going through textual MLIR.
 //
 // Lifecycle:
-//   ir = kun_mlir.IRBuilder()
+//   ir = KunMLIR.IRBuilder()
 //   ir.begin_func(name, in_types, in_names, out_names, target_spec, result_types)
 //   args = ir.func_args
 //   v = ir.add(args[0], args[1])
 //   ir.end_func([v])
 //   ...
-//   mod = ir.finish()                # → kun_mlir.ModuleOp
+//   mod = ir.finish()                # → KunMLIR.ModuleOp
 //
 // `Value` and `Type` are opaque wrappers around mlir::Value / mlir::Type.
 // They are valid only while the IRBuilder (and the resulting PyModule)
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index c2b54a1..46ee98e 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -1,10 +1,10 @@
 //===- MlirBinding.cpp - Python bindings for the kunir → cubin flow ----===//
 //
 // Exposes:
-//   kun_mlir.parse(text)            → ModuleOp     (loads MLIR text)
+//   KunMLIR.parse(text)            → ModuleOp     (loads MLIR text)
 //   ModuleOp.to_string() / __str__  → str          (dumps the module)
-//   kun_mlir.lower_to_ptx(mod, …)   → str          (kunir → PTX, debug only)
-//   kun_mlir.compile(mod, …)        → Executable   (kunir → loadable kernel)
+//   KunMLIR.lower_to_ptx(mod, …)   → str          (kunir → PTX, debug only)
+//   KunMLIR.compile(mod, …)        → Executable   (kunir → loadable kernel)
 //   Executable.launch({name: cupy}) → None         (cuLaunchKernel + sync)
 //
 // `compile` is the main path; `lower_to_ptx` is for inspecting the
@@ -55,7 +55,7 @@ static std::string pyLowerToPtx(PyModule &pm, const std::string &gpuArch,
 
   std::string ptx;
   if (failed(kungpu::compileKunIrToPtx(pm.module.get(), opts, ptx)))
-    throw std::runtime_error("kun_mlir.lower_to_ptx failed");
+    throw std::runtime_error("KunMLIR.lower_to_ptx failed");
   return ptx;
 }
 
@@ -167,10 +167,10 @@ pyCompile(PyModule &pm,
             const std::string &toolkitPath) {
   if (graphInputs.empty())
     throw std::runtime_error(
-        "kun_mlir.compile: graph_inputs cannot be empty");
+        "KunMLIR.compile: graph_inputs cannot be empty");
   if (graphOutputs.empty())
     throw std::runtime_error(
-        "kun_mlir.compile: graph_outputs cannot be empty");
+        "KunMLIR.compile: graph_outputs cannot be empty");
 
   kungpu::PtxCompileOptions opts;
   if (!gpuArch.empty())        opts.targetCpu      = gpuArch;
@@ -181,7 +181,7 @@ pyCompile(PyModule &pm,
 
   kun_cuda::ExecutableData data;
   if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), opts, data)))
-    throw std::runtime_error("kun_mlir.compile failed");
+    throw std::runtime_error("KunMLIR.compile failed");
   // Graph topology is a runtime concern — fill it in here, just before
   // handing off to Executable's ctor (which validates + plans).
   data.graphInputs  = graphInputs;
@@ -191,7 +191,7 @@ pyCompile(PyModule &pm,
 
 } // namespace
 
-PYBIND11_MODULE(kun_mlir, m) {
+PYBIND11_MODULE(KunMLIR, m) {
   m.doc() = "Bindings for the KunQuant MLIR compiler (kunir → PTX → CUBIN "
              "→ launch).";
 
@@ -203,7 +203,7 @@ PYBIND11_MODULE(kun_mlir, m) {
             "Return the textual MLIR form of the module.")
       .def("__str__",  &PyModule::toString)
       .def("__repr__", [](const PyModule &m) {
-        return "<kun_mlir.ModuleOp>\n" + m.toString();
+        return "<KunMLIR.ModuleOp>\n" + m.toString();
       });
 
   m.def("parse", &PyModule::parse, py::arg("text"),
diff --git a/mlir/lib/Python/PyModule.cpp b/mlir/lib/Python/PyModule.cpp
index 7089ff9..38ffe8b 100644
--- a/mlir/lib/Python/PyModule.cpp
+++ b/mlir/lib/Python/PyModule.cpp
@@ -86,7 +86,7 @@ std::unique_ptr<PyModule> PyModule::parse(const std::string &text) {
   auto pm = std::make_unique<PyModule>();
   pm->module = mlir::parseSourceString<mlir::ModuleOp>(text, pm->ctx.get());
   if (!pm->module)
-    throw std::runtime_error("kun_mlir.parse: failed to parse MLIR text");
+    throw std::runtime_error("KunMLIR.parse: failed to parse MLIR text");
   return pm;
 }
 
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index 9cd8e08..2fbfaed 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""End-to-end test for the `kun_mlir` Python bindings.
+"""End-to-end test for the `KunMLIR` Python bindings.
 
   parse → to_string → lower_to_ptx (debug only) → compile → launch
 
@@ -36,18 +36,18 @@ def main() -> int:
     ap.add_argument("-S", "--num-stocks", type=int, default=2048)
     args = ap.parse_args()
 
-    import kun_mlir
+    from KunQuant.jit import KunMLIR
     import cupy as cp
     import numpy as np
     from KunQuant.jit.cuda import find_cuda_toolkit
 
     # Force-initialise the CUDA driver + create the primary context now,
-    # so subsequent kun_mlir.compile() / Executable.launch() find one.
+    # so subsequent KunMLIR.compile() / Executable.launch() find one.
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
 
     print(f"=== parse + to_string ===")
-    mod = kun_mlir.parse(SAMPLE_KUNIR)
+    mod = KunMLIR.parse(SAMPLE_KUNIR)
     text = mod.to_string()
     assert "kunir.func @test_addsum" in text, "module text missing kunir.func"
     print("ok — module round-trips through parse/to_string")
@@ -60,15 +60,15 @@ def main() -> int:
     # at PTX text via gpu-module-to-binary{format=isa}.  Mutates `mod`
     # (replaces the gpu.module with a gpu.binary), so we re-parse for
     # the main compile step below.
-    ptx = kun_mlir.lower_to_ptx(mod, gpu_arch=args.target, opt_level=3,
+    ptx = KunMLIR.lower_to_ptx(mod, gpu_arch=args.target, opt_level=3,
                                   toolkit_path=toolkit)
     assert "test_addsum" in ptx
     print(f"ok — produced {len(ptx)} bytes of PTX text")
 
     print()
     print(f"=== compile (all-in-one) ===")
-    mod2 = kun_mlir.parse(SAMPLE_KUNIR)
-    exe = kun_mlir.compile(mod2,
+    mod2 = KunMLIR.parse(SAMPLE_KUNIR)
+    exe = KunMLIR.compile(mod2,
                             graph_inputs=["a", "b"],
                             graph_outputs=["sum"],
                             gpu_arch=args.target, opt_level=3,
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index ae7a3e0..568d509 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -3,7 +3,7 @@
 
 Builds a KunQuant Function with the high-level Op API, runs the same
 Driver.optimize() pipeline the CPU compileit uses, then compiles to a
-CUDA Executable via kun_mlir and validates against numpy.
+CUDA Executable via KunMLIR and validates against numpy.
 
 Three factors are exercised:
   * elemwise:   out = (a + b) * a - b * b           (binary elemwise only;
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
index d4d5af2..937cd79 100644
--- a/mlir/test/python/test_multi_kernel.py
+++ b/mlir/test/python/test_multi_kernel.py
@@ -52,7 +52,7 @@ def main() -> int:
     ap.add_argument("-S", "--num-stocks", type=int, default=2048)
     args = ap.parse_args()
 
-    import kun_mlir
+    from KunQuant.jit import KunMLIR
     import cupy as cp
     from KunQuant.jit.cuda import find_cuda_toolkit
 
@@ -60,8 +60,8 @@ def main() -> int:
     _ = cp.zeros((1,), dtype=cp.float32)
 
     print("=== compile two-kernel graph ===")
-    mod = kun_mlir.parse(SAMPLE_KUNIR)
-    exe = kun_mlir.compile(mod,
+    mod = KunMLIR.parse(SAMPLE_KUNIR)
+    exe = KunMLIR.compile(mod,
                             graph_inputs=["a", "b", "c"],
                             graph_outputs=["out"],
                             gpu_arch=args.target, opt_level=3,
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index 020fc0a..b5be1fb 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -81,7 +81,7 @@ def assert_planning(N: int, warps_per_cta: int, smem_size: int,
 def run_one(N: int, expected_placement: str, target: str,
               warps_per_cta: int = 4, smem_size: int = 49152,
               T: int = 64, S: int = 2048) -> int:
-    import kun_mlir
+    from KunQuant.jit import KunMLIR
     import cupy as cp
     from KunQuant.jit.cuda import find_cuda_toolkit
 
@@ -89,8 +89,8 @@ def run_one(N: int, expected_placement: str, target: str,
     assert_planning(N, warps_per_cta, smem_size, expected_placement)
 
     ir = build_ir(N, warps_per_cta=warps_per_cta, smem_size=smem_size)
-    mod = kun_mlir.parse(ir)
-    exe = kun_mlir.compile(mod,
+    mod = KunMLIR.parse(ir)
+    exe = KunMLIR.compile(mod,
                             graph_inputs=["a", "b"],
                             graph_outputs=["out"],
                             gpu_arch=target, opt_level=3,

From 9ab700fd287697ca7084e55dee7f42f3fc30235e Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Fri, 8 May 2026 02:09:00 -0700
Subject: [PATCH 15/59] add executor

---
 KunQuant/jit/cuda.py                   |  5 +-
 mlir/include/KunCuda/Runtime.h         | 68 +++++++++++++++++++++++---
 mlir/lib/KunCuda/Runtime.cpp           | 39 +++++++++++----
 mlir/lib/Python/MlirBinding.cpp        | 65 +++++++++++++++++++-----
 mlir/test/python/test_kun_mlir.py      | 10 ++--
 mlir/test/python/test_kun_to_cuda.py   | 21 +++++---
 mlir/test/python/test_multi_kernel.py  | 31 +++++++++---
 mlir/test/python/test_windowed_temp.py |  6 +--
 8 files changed, 194 insertions(+), 51 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 7d42a52..7b75435 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -7,10 +7,13 @@
 
 User entry point::
 
+    from KunQuant.jit import KunMLIR
     from KunQuant.jit.cuda import compileit, CudaCompilerConfig
 
     exe = compileit(f, CudaCompilerConfig(gpu_arch="sm_80"))
-    exe.launch({"a": cp_a, "b": cp_b, "out": cp_out})
+    executor = KunMLIR.Executor()                       # default stream
+    executor.runGraph(exe, {"a": cp_a, "b": cp_b, "out": cp_out})
+    executor.synchronize()
 
 Scope (v0):
   * Single Function in, single kunir.func out.  Multi-Function /
diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index b8b8ee9..dd47090 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -41,8 +41,9 @@
 #include <vector>
 
 extern "C" {
-typedef struct CUmod_st  *CUmodule;
-typedef struct CUfunc_st *CUfunction;
+typedef struct CUmod_st    *CUmodule;
+typedef struct CUfunc_st   *CUfunction;
+typedef struct CUstream_st *CUstream;
 } // extern "C"
 
 namespace kun_cuda {
@@ -134,7 +135,9 @@ class Executable {
   /// runtime (after slot reuse).
   int  peakIntermediateSlots() const noexcept;
 
-  /// Launch every kernel in `launchOrder` on the default stream.
+  /// Launch every kernel in `launchOrder` asynchronously on `stream`.
+  /// **Does not synchronize** — the caller (typically `Executor::runGraph`
+  /// + `Executor::synchronize`) owns waiting for completion.
   ///
   /// `args` keys must equal `graphInputs ++ graphOutputs` (order
   /// doesn't matter; the runtime hashes them into the buffer table).
@@ -146,10 +149,11 @@ class Executable {
   ///   block_x = warps_per_cta * 32
   ///   grid_x  = ceil_div(numStocks, block_x * vector_size)
   ///
-  /// Synchronous: `cuCtxSynchronize` is called once after the last
-  /// kernel.  Throws std::runtime_error on validation or driver errors.
-  void launch(int64_t timeLength, int64_t numStocks,
-              const std::vector<std::pair<std::string, uintptr_t>> &args);
+  /// Throws std::runtime_error on validation or driver errors.  This is
+  /// a low-level entry point — most users go through `Executor::runGraph`.
+  void launchOnStream(int64_t timeLength, int64_t numStocks,
+                       const std::vector<std::pair<std::string, uintptr_t>> &args,
+                       CUstream stream);
 
 private:
   /// Allocate (or re-allocate, if shape changed) the intermediate slot
@@ -171,4 +175,54 @@ class Executable {
   int64_t cachedS_ = -1;
 };
 
+//===----------------------------------------------------------------------===//
+// Executor — wraps a CUDA stream and exposes the runGraph / synchronize
+// pair, mirroring the CPU `kun::Executor` shape.
+//
+// Default constructor uses the CUDA default (NULL) stream.  The
+// stream-injecting constructor lets callers reuse a stream they already
+// own (e.g. `cupy.cuda.Stream`'s `.ptr`); the Executor does NOT take
+// ownership and never destroys the stream.
+//
+// `runGraph` is asynchronous — it queues every kernel in the executable
+// onto this stream and returns immediately.  Call `synchronize` (or wait
+// on the stream by other means) before reading results back to host.
+//
+// Thread / Executable model: an `Executable`'s intermediate slot pool
+// is mutable state shared by every `runGraph` call against it.  Driving
+// the same Executable from two Executors concurrently is unsafe — pair
+// them 1:1, or serialize the calls externally.
+//===----------------------------------------------------------------------===//
+
+class Executor {
+public:
+  /// Use the CUDA default stream.
+  Executor();
+  /// Reuse a stream the caller owns (e.g. cupy's `.ptr`).  We do not
+  /// destroy it; lifetime is the caller's responsibility.
+  explicit Executor(CUstream stream);
+  ~Executor();
+
+  Executor(const Executor &)            = delete;
+  Executor &operator=(const Executor &) = delete;
+  Executor(Executor &&)                 = delete;
+  Executor &operator=(Executor &&)      = delete;
+
+  /// Queue all kernels in `exe` on this executor's stream.  Async — does
+  /// not synchronize.  Throws std::runtime_error on validation / driver
+  /// errors.
+  void runGraph(Executable &exe,
+                int64_t timeLength, int64_t numStocks,
+                const std::vector<std::pair<std::string, uintptr_t>> &args);
+
+  /// Block until all queued work on this stream completes.
+  void synchronize();
+
+  /// Raw stream handle (default-stream Executor returns nullptr).
+  CUstream stream() const noexcept { return stream_; }
+
+private:
+  CUstream stream_ = nullptr;
+};
+
 } // namespace kun_cuda
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index 65f0f6c..c998c19 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -455,16 +455,17 @@ int Executable::peakIntermediateSlots() const noexcept {
   return plan_->peakIntermediateSlots;
 }
 
-void Executable::launch(
+void Executable::launchOnStream(
     int64_t timeLength, int64_t numStocks,
-    const std::vector<std::pair<std::string, uintptr_t>> &args) {
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    CUstream stream) {
   // 1.  Shape sanity (kernel signature is i32 i32).
   if (timeLength > std::numeric_limits<int32_t>::max() ||
       numStocks  > std::numeric_limits<int32_t>::max() ||
       timeLength < 0 || numStocks < 0)
     throw std::runtime_error(
-        "kun_cuda::launch: time_length / num_stocks out of i32 range "
-        "(kernel signature uses i32, i32)");
+        "kun_cuda::launchOnStream: time_length / num_stocks out of i32 "
+        "range (kernel signature uses i32, i32)");
 
   // 2.  Allocate intermediate slot pool if needed.
   ensureSlotPool(timeLength, numStocks);
@@ -484,7 +485,7 @@ void Executable::launch(
       idx = itOut->second;
     else
       throw std::runtime_error(
-          "kun_cuda::launch: unexpected argument '" + kv.first +
+          "kun_cuda::launchOnStream: unexpected argument '" + kv.first +
           "' (expected: " + joinNames(data_.graphInputs) + " | " +
           joinNames(data_.graphOutputs) + ")");
     bufPtrs[idx] = kv.second;
@@ -499,7 +500,7 @@ void Executable::launch(
     if (missing.empty())
       for (auto &kv : plan_->graphOutputIdx) if (kv.second == i) missing = kv.first;
     throw std::runtime_error(
-        "kun_cuda::launch: missing argument '" + missing + "'");
+        "kun_cuda::launchOnStream: missing argument '" + missing + "'");
   }
 
   // 5.  Fill intermediate slots from the pre-allocated pool.
@@ -509,10 +510,12 @@ void Executable::launch(
     bufPtrs[i] = slotBufs_[slot];
   }
 
-  // 6.  Launch each kernel in topo order.
+  // 6.  Launch each kernel in topo order on `stream`.  Async — the
+  //     caller (Executor) owns waiting via cuStreamSynchronize.
   unsigned blockX = static_cast<unsigned>(data_.warpsPerCta * 32);
   if (blockX == 0)
-    throw std::runtime_error("kun_cuda::launch: warps_per_cta is 0");
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: warps_per_cta is 0");
   uint64_t stocksPerBlock =
       static_cast<uint64_t>(blockX) * static_cast<uint64_t>(data_.vectorSize);
   unsigned gridX = static_cast<unsigned>(
@@ -541,12 +544,28 @@ void Executable::launch(
     // cubin's `.shared` section); the dynamic-smem launch parameter does
     // not apply.
     checkCu(cuLaunchKernel(cuFuncs_[kIdx], gridX, 1, 1, blockX, 1, 1,
-                             /*sharedMemBytes=*/0, /*stream=*/nullptr,
+                             /*sharedMemBytes=*/0, stream,
                              argPtrs.data(), nullptr),
              "cuLaunchKernel");
   }
+}
+
+//===----------------------------------------------------------------------===//
+// Executor — thin CUstream wrapper, mirrors the CPU `kun::Executor` shape.
+//===----------------------------------------------------------------------===//
+
+Executor::Executor() : stream_(nullptr) {}
+Executor::Executor(CUstream stream) : stream_(stream) {}
+Executor::~Executor() = default;
+
+void Executor::runGraph(
+    Executable &exe, int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args) {
+  exe.launchOnStream(timeLength, numStocks, args, stream_);
+}
 
-  checkCu(cuCtxSynchronize(), "cuCtxSynchronize");
+void Executor::synchronize() {
+  checkCu(cuStreamSynchronize(stream_), "cuStreamSynchronize");
 }
 
 } // namespace kun_cuda
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 46ee98e..6d64332 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -253,17 +253,60 @@ PYBIND11_MODULE(KunMLIR, m) {
             [](const kun_cuda::Executable &e) {
               const auto &b = e.data().cubin;
               return py::bytes(b.data(), b.size());
-            })
-      .def("launch",
-            [](kun_cuda::Executable &e, py::dict pyArgs) {
-              auto c = collectArgs(e, pyArgs);
-              e.launch(c.timeLength, c.numStocks, c.args);
-            },
-            py::arg("args"),
-            "Launch the kernel.  `args` is a {name → cupy_array} dict; "
-            "names must match input_names ++ output_names.  All arrays "
-            "must be float32, 2-D, shape (time_length, num_stocks) — TS "
-            "layout — and reside on the GPU.");
+            });
+
+  // ── Executor ────────────────────────────────────────────────────────
+  // Mirrors the CPU `kun::Executor` shape: an opaque object that wraps a
+  // CUDA stream and exposes runGraph / synchronize.  Constructor accepts
+  // either a raw int (uintptr_t — e.g. the stream's `.ptr` from cupy) or
+  // a duck-typed object with a `.ptr` attribute (so passing a
+  // `cupy.cuda.Stream` directly Just Works).  None / no arg → default
+  // CUDA stream.
+  py::class_<kun_cuda::Executor>(m, "Executor",
+        "Wraps a CUDA stream + provides `runGraph(exe, args)` (async) "
+        "and `synchronize()`.  Default constructor uses the CUDA default "
+        "stream; pass a cupy stream (or its `.ptr` integer) to share one "
+        "with caller-managed code.")
+      .def(py::init([](py::object stream_arg) {
+            uintptr_t ptr = 0;
+            if (!stream_arg.is_none()) {
+              if (py::hasattr(stream_arg, "ptr"))
+                ptr = stream_arg.attr("ptr").cast<uintptr_t>();
+              else
+                ptr = stream_arg.cast<uintptr_t>();
+            }
+            return std::make_unique<kun_cuda::Executor>(
+                reinterpret_cast<CUstream>(ptr));
+          }),
+          py::arg("stream") = py::none(),
+          "Build an Executor.  `stream=None` → default CUDA stream; "
+          "otherwise expects either an int (uintptr_t handle) or a "
+          "cupy.cuda.Stream-like object exposing `.ptr`.")
+      .def_property_readonly("stream",
+          [](const kun_cuda::Executor &e) -> uintptr_t {
+            return reinterpret_cast<uintptr_t>(e.stream());
+          },
+          "Raw stream handle as an int (0 ↔ CUDA default stream).")
+      .def("runGraph",
+          [](kun_cuda::Executor &e, kun_cuda::Executable &exe,
+              py::dict pyArgs) {
+            auto c = collectArgs(exe, pyArgs);
+            e.runGraph(exe, c.timeLength, c.numStocks, c.args);
+          },
+          py::arg("exe"), py::arg("args"),
+          "Queue every kernel in `exe` onto this executor's stream.\n"
+          "**Asynchronous** — call `.synchronize()` (or otherwise wait\n"
+          "on the stream) before reading results back to host.\n"
+          "\n"
+          "`args` is a {name → cupy_array} dict; names must equal "
+          "`exe.input_names ++ exe.output_names`.  Arrays must be "
+          "float32, 2-D, shape `(time_length, num_stocks)` (TS layout), "
+          "and reside on the GPU.\n"
+          "\n"
+          "Named to match the CPU executor API "
+          "(`KunRunner.runGraph(executor, mod, ...)`).")
+      .def("synchronize", &kun_cuda::Executor::synchronize,
+          "Block until every kernel queued on this stream completes.");
 
   m.def("compile", &pyCompile,
          py::arg("module"),
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index 2fbfaed..98c8a92 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -42,7 +42,7 @@ def main() -> int:
     from KunQuant.jit.cuda import find_cuda_toolkit
 
     # Force-initialise the CUDA driver + create the primary context now,
-    # so subsequent KunMLIR.compile() / Executable.launch() find one.
+    # so subsequent KunMLIR.compile() / Executor.runGraph() find one.
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
 
@@ -115,8 +115,12 @@ def main() -> int:
         a   = cp.asarray(a_h)
         b   = cp.asarray(b_h)
         out = cp.zeros((T, S), dtype=cp.float32)
-        exe.launch({"a": a, "b": b, "sum": out})
-        cp.cuda.runtime.deviceSynchronize()
+        executor = KunMLIR.Executor()
+        executor.runGraph(exe, {"a": a, "b": b, "sum": out})
+        # No explicit synchronize: default-stream Executor + cupy's
+        # default stream → cp.asnumpy's D2H memcpy goes on the same
+        # stream and waits for our kernels.  See test_multi_kernel.py
+        # for the case where sync IS required (non-blocking user stream).
         out_h = cp.asnumpy(out)
         expected = a_h + b_h
         if not np.allclose(out_h, expected, atol=1e-5):
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index 568d509..9157ab2 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -30,6 +30,7 @@
 from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum
 from KunQuant.ops.MiscOp import BackRef, FastWindowedSum
 from KunQuant.Stage import Function
+from KunQuant.jit import KunMLIR
 from KunQuant.jit.cuda import compileit, CudaCompilerConfig, to_mlir
 
 
@@ -110,8 +111,9 @@ def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
     b_h = rng.standard_normal((T, S), dtype=np.float32)
     out = cp.zeros((T, S), dtype=cp.float32)
 
-    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "out": out})
-    cp.cuda.runtime.deviceSynchronize()
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h),
+                              "b": cp.asarray(b_h), "out": out})
     out_h = cp.asnumpy(out)
 
     expected = expected_fn(a_h, b_h)
@@ -159,8 +161,9 @@ def run_backref(target: str, T: int, S: int, N: int) -> int:
     b_h = rng.standard_normal((T, S), dtype=np.float32)
     out = cp.zeros((T, S), dtype=cp.float32)
 
-    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "out": out})
-    cp.cuda.runtime.deviceSynchronize()
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h),
+                              "b": cp.asarray(b_h), "out": out})
     out_h = cp.asnumpy(out)
 
     # Reference: out[t] = (a+b)[t-N] for t >= N; undefined for t < N.
@@ -194,8 +197,9 @@ def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
     b_h = rng.standard_normal((T, S), dtype=np.float32)
     out = cp.zeros((T, S), dtype=cp.float32)
 
-    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "ws": out})
-    cp.cuda.runtime.deviceSynchronize()
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h),
+                              "b": cp.asarray(b_h), "ws": out})
     out_h = cp.asnumpy(out)
 
     # Reference matches WindowedSum (same window, no NaN inputs).
@@ -238,8 +242,9 @@ def run_windowed(target: str, T: int, S: int, N: int) -> int:
     b_h = rng.standard_normal((T, S), dtype=np.float32)
     out = cp.zeros((T, S), dtype=cp.float32)
 
-    exe.launch({"a": cp.asarray(a_h), "b": cp.asarray(b_h), "ws": out})
-    cp.cuda.runtime.deviceSynchronize()
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h),
+                              "b": cp.asarray(b_h), "ws": out})
     out_h = cp.asnumpy(out)
 
     c = a_h + b_h
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
index 937cd79..c1cc1a1 100644
--- a/mlir/test/python/test_multi_kernel.py
+++ b/mlir/test/python/test_multi_kernel.py
@@ -100,10 +100,15 @@ def main() -> int:
     c = cp.asarray(c_h)
     out = cp.zeros((T, S), dtype=cp.float32)
 
+    # Default-stream Executor — the simplest path.  No explicit sync
+    # needed because cp.asnumpy's D2H memcpy goes through cupy's default
+    # stream (= legacy stream 0), which is the same stream our kernels
+    # are queued on.
+    executor = KunMLIR.Executor()
     print()
-    print(f"=== launch ({T} × {S}) ===")
-    exe.launch({"a": a, "b": b, "c": c, "out": out})
-    cp.cuda.runtime.deviceSynchronize()
+    print(f"=== launch ({T} × {S}) — default stream ===")
+    print(f"  executor.stream = {executor.stream}  (0 ↔ CUDA default)")
+    executor.runGraph(exe, {"a": a, "b": b, "c": c, "out": out})
     out_h = cp.asnumpy(out)
 
     expected = (a_h + b_h) * c_h
@@ -116,24 +121,34 @@ def main() -> int:
 
     print(f"  ok — output matches (a+b)*c on every (t, s) cell ({T*S} cells)")
 
-    # === second launch with different shape — exercises slot pool re-alloc ===
+    # === second launch with different shape on a user-supplied stream ===
+    # Exercises (a) slot-pool re-alloc on shape change, (b) injecting a
+    # cupy-managed CUstream into the Executor.
     T2, S2 = T // 2, S + 64
     a2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
     b2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
     c2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
     out2 = cp.zeros((T2, S2), dtype=cp.float32)
 
+    cp_stream = cp.cuda.Stream(non_blocking=True)
+    executor2 = KunMLIR.Executor(stream=cp_stream)   # cupy stream injected
     print()
-    print(f"=== launch ({T2} × {S2}) — different shape, slot pool re-alloc ===")
-    exe.launch({"a": a2, "b": b2, "c": c2, "out": out2})
-    cp.cuda.runtime.deviceSynchronize()
+    print(f"=== launch ({T2} × {S2}) — cupy stream {hex(cp_stream.ptr)} ===")
+    print(f"  executor.stream = {hex(executor2.stream)}")
+    assert executor2.stream == cp_stream.ptr, \
+        (executor2.stream, cp_stream.ptr)
+    executor2.runGraph(exe, {"a": a2, "b": b2, "c": c2, "out": out2})
+    # Sync is REQUIRED here: cp_stream is non-blocking, so cp.asnumpy's
+    # D2H memcpy on cupy's default stream wouldn't otherwise wait for
+    # our kernels.
+    executor2.synchronize()
     out2_h = cp.asnumpy(out2)
     expected2 = (cp.asnumpy(a2) + cp.asnumpy(b2)) * cp.asnumpy(c2)
     if not np.allclose(out2_h, expected2, atol=1e-5):
         diff = np.abs(out2_h - expected2)
         print(f"  FAIL — max abs diff {diff.max()}", file=sys.stderr)
         return 1
-    print(f"  ok — re-launched with new shape, output matches")
+    print(f"  ok — re-launched on cupy stream, output matches")
     return 0
 
 
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index b5be1fb..75afad8 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -108,9 +108,9 @@ def run_one(N: int, expected_placement: str, target: str,
     b   = cp.asarray(b_h)
     out = cp.zeros((T, S), dtype=cp.float32)
 
-    exe.launch({"a": a, "b": b, "out": out})
-    cp.cuda.runtime.deviceSynchronize()
-    out_h = cp.asnumpy(out)
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": a, "b": b, "out": out})
+    out_h = cp.asnumpy(out)            # implicitly waits via stream 0
 
     expected = reference_sum_window(a_h, b_h, N)
 

From 8f4b7f61dd898c670dbcdba6983ed5174500a75e Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Fri, 8 May 2026 03:11:34 -0700
Subject: [PATCH 16/59] partition

---
 KunQuant/jit/cuda.py                 | 111 +++++++++++++++++++--------
 mlir/test/python/test_kun_to_cuda.py |  74 ++++++++++++++++++
 2 files changed, 151 insertions(+), 34 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 7b75435..be6e993 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -30,7 +30,9 @@
 
 from KunQuant.jit import KunMLIR
 
-from KunQuant.Driver import optimize
+from KunQuant.Driver import optimize, post_optimize
+from KunQuant.Op import Input, Output
+from KunQuant.passes import do_partition
 from KunQuant.Stage import Function
 from KunQuant.passes.CodegenMLIR import TargetSpec, translate_function
 
@@ -108,6 +110,13 @@ class CudaCompilerConfig:
     # Empty → upstream search: CUDA_HOME / CUDA_PATH / standard locations.
     toolkit_path:  str  = ""
 
+    # Forwarded to `do_partition` — same default as KunCompilerConfig.
+    # Larger factor ⇒ coarser partitions (fewer, bigger kernels).  After
+    # partition each sub-Function becomes one kunir.func inside the
+    # generated gpu.module; intermediate buffers between them are
+    # auto-managed by the runtime's slot pool.
+    partition_factor: int = 3
+
     # Pass-list options forwarded to optimize().  We seed reasonable GPU
     # defaults; user-supplied keys override.
     options:       Optional[dict] = None
@@ -142,43 +151,82 @@ def _to_dtype_token(dtype: str) -> str:
                        f"lowers float on GPU)")
 
 
-def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
-    """Compile a single KunQuant Function to a GPU `KunMLIR.Executable`.
+def _graph_io_names(f: Function):
+    """User-facing graph inputs/outputs.  Captured BEFORE optimize +
+    do_partition because those passes mutate `f` and may scatter the
+    Input/Output ops across multiple sub-Functions (some of which then
+    look like 'TEMP' from the partition's POV but stay user-visible at
+    the graph boundary)."""
+    ins  = [op.attrs["name"] for op in f.ops if isinstance(op, Input)]
+    outs = [op.attrs["name"] for op in f.ops if isinstance(op, Output)]
+    if not ins:
+        raise ValueError("CudaCompilerConfig: function has no Input ops")
+    if not outs:
+        raise ValueError("CudaCompilerConfig: function has no Output ops")
+    return ins, outs
 
-    The Function is mutated in place by Driver.optimize() (same as the
-    CPU path).  Inputs/Outputs declared via `Input(name)` / `Output(...,
-    name)` become the resulting Executable's graph_inputs / graph_outputs.
-    """
-    if cfg.dtype not in ("float", "double"):
-        raise ValueError(
-            f"CudaCompilerConfig.dtype must be 'float' or 'double', got "
-            f"{cfg.dtype!r}")
 
-    # Resolve the CUDA toolkit before invoking C++.  Auto-search if the
-    # user didn't pass an explicit path.  Failure here gives a useful
-    # message; failure later (in ptxas / libdevice link) is opaque.
-    toolkit_path = find_cuda_toolkit(cfg.toolkit_path)
+def _run_full_pipeline(f: Function, cfg: CudaCompilerConfig):
+    """Same pass pipeline the CPU `compileit` runs:
 
-    # 1.  Same optimizer pipeline the CPU path runs.  This is where
-    #     WindowedSum etc. decompose into ForeachBackWindow + Reduce.
+        optimize  →  do_partition  →  post_optimize
+
+    Returns the list of post-partition Functions that the translator
+    should walk (one kunir.func per Function).  Mutates `f` in place.
+    """
     options = _gpu_pass_options(cfg)
     optimize(f, options)
+    _mainf, impl = do_partition(f, cfg.partition_factor, options)
+    post_optimize(impl, options)
+    return impl
 
-    # 2.  Translate the post-optimize IR to a KunMLIR module.
+
+def _translate_partitions(impl, cfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
+    """Emit one kunir.func per partitioned Function into a single
+    KunMLIR module (single `gpu.module` with N siblings).  Cross-
+    partition buffers stitch up automatically because each impl's
+    Input/Output names match the producing/consuming partition's
+    Output/Input names."""
     target = TargetSpec(occupancy=cfg.occupancy,
                           warps_per_cta=cfg.warps_per_cta,
                           smem_size=cfg.smem_size,
                           vector_size=cfg.vector_size)
     ir = KunMLIR.IRBuilder()
-    in_names, out_names = translate_function(
-        f, target, ir, dtype=_to_dtype_token(cfg.dtype))
-    mod = ir.finish()
+    dtype = _to_dtype_token(cfg.dtype)
+    for sub in impl:
+        translate_function(sub, target, ir, dtype=dtype)
+    return ir.finish()
+
+
+def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
+    """Compile a KunQuant Function to a GPU `KunMLIR.Executable`.
+
+    Pipeline mirrors `KunQuant.jit.cfake.compileit` on the CPU path:
+
+      1. Capture user-facing Input/Output names (graph_inputs/outputs).
+      2. Run Driver.optimize on `f` in place.
+      3. do_partition splits `f` into one or more sub-Functions.
+      4. post_optimize per sub-Function (TempWindowElim + MergeLoops + …).
+      5. Translate each sub-Function into a kunir.func (siblings in one
+         gpu.module).
+      6. Hand off to KunMLIR.compile, which generates the cubin and
+         resolves cross-kernel data flow via I/O names.
+    """
+    if cfg.dtype not in ("float", "double"):
+        raise ValueError(
+            f"CudaCompilerConfig.dtype must be 'float' or 'double', got "
+            f"{cfg.dtype!r}")
+
+    toolkit_path = find_cuda_toolkit(cfg.toolkit_path)
+
+    graph_inputs, graph_outputs = _graph_io_names(f)
+    impl = _run_full_pipeline(f, cfg)
+    mod  = _translate_partitions(impl, cfg)
 
-    # 3.  Hand off to the KunMLIR compile pipeline.
     return KunMLIR.compile(
         mod,
-        graph_inputs=in_names,
-        graph_outputs=out_names,
+        graph_inputs=graph_inputs,
+        graph_outputs=graph_outputs,
         gpu_arch=cfg.gpu_arch,
         opt_level=cfg.opt_level,
         toolkit_path=toolkit_path,
@@ -187,13 +235,8 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
 
 def to_mlir(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
     """Run the same passes + translator as `compileit`, but return the
-    KunMLIR module before PTX/CUBIN.  Useful for debugging the IR."""
-    options = _gpu_pass_options(cfg)
-    optimize(f, options)
-    target = TargetSpec(occupancy=cfg.occupancy,
-                          warps_per_cta=cfg.warps_per_cta,
-                          smem_size=cfg.smem_size,
-                          vector_size=cfg.vector_size)
-    ir = KunMLIR.IRBuilder()
-    translate_function(f, target, ir, dtype=_to_dtype_token(cfg.dtype))
-    return ir.finish()
+    KunMLIR module before PTX/CUBIN.  Useful for debugging the IR.
+    Mutates `f` in place (same as `compileit`)."""
+    _graph_io_names(f)              # raises if no Input / Output ops
+    impl = _run_full_pipeline(f, cfg)
+    return _translate_partitions(impl, cfg)
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index 9157ab2..cfef71c 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -90,6 +90,21 @@ def build_func_fastwindowedsum(N: int) -> Function:
     return Function(builder.ops, name="fastwindowedsum_kernel")
 
 
+def build_func_multipartition() -> Function:
+    """A graph with three independent outputs.  Combined with
+    `partition_factor=1` this drives `do_partition` to split into
+    multiple sub-Functions, each becoming its own kunir.func — the
+    primary thing this test exercises."""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        Output(Add(a, bin_), "add_out")
+        Output(Mul(a, bin_), "mul_out")
+        Output(Sub(a, bin_), "sub_out")
+    return Function(builder.ops, name="multi")
+
+
 def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
               atol: float = 1e-5) -> int:
     """Compile a Function, launch it, validate against numpy."""
@@ -223,6 +238,63 @@ def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
     return 0
 
 
+def run_multipartition(target: str, T: int, S: int) -> int:
+    """End-to-end test of the do_partition + post_optimize path:
+    three independent outputs forced into separate partitions by
+    `partition_factor=1`, each becoming a sibling kunir.func in the
+    generated gpu.module."""
+    print("=== multipartition: 3 outputs (add/mul/sub) split via "
+           "partition_factor=1 ===")
+    f = build_func_multipartition()
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4,
+                              partition_factor=1)
+
+    mod = to_mlir(build_func_multipartition(), cfg)
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compileit(f, cfg)
+    print(f"  kernel_names           = {exe.kernel_names}")
+    print(f"  num_kernels            = {exe.num_kernels}")
+    print(f"  launch_order           = {exe.launch_order}")
+    print(f"  num_buffers            = {exe.num_buffers}")
+    print(f"  peak_intermediate_slots= {exe.peak_intermediate_slots}")
+
+    # The point of the test: the partitioner actually produced more
+    # than one kunir.func.  No intermediates because the three outputs
+    # are independent (each consumes only graph inputs).
+    assert exe.num_kernels >= 2, exe.num_kernels
+    assert exe.peak_intermediate_slots == 0, exe.peak_intermediate_slots
+    assert set(exe.input_names)  == {"a", "b"}
+    assert set(exe.output_names) == {"add_out", "mul_out", "sub_out"}
+
+    import cupy as cp
+    rng = np.random.default_rng(7)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    add_out = cp.zeros((T, S), dtype=cp.float32)
+    mul_out = cp.zeros((T, S), dtype=cp.float32)
+    sub_out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h), "b": cp.asarray(b_h),
+                              "add_out": add_out,
+                              "mul_out": mul_out,
+                              "sub_out": sub_out})
+
+    add_h = cp.asnumpy(add_out)
+    mul_h = cp.asnumpy(mul_out)
+    sub_h = cp.asnumpy(sub_out)
+    if not (np.allclose(add_h, a_h + b_h, atol=1e-5)
+            and np.allclose(mul_h, a_h * b_h, atol=1e-5)
+            and np.allclose(sub_h, a_h - b_h, atol=1e-5)):
+        print(f"  FAIL — at least one of add/mul/sub mismatch",
+                file=sys.stderr)
+        return 1
+    print(f"  ok — all 3 outputs match across {exe.num_kernels} kernels")
+    return 0
+
+
 def run_windowed(target: str, T: int, S: int, N: int) -> int:
     print(f"=== windowed: ws = WindowedSum(a + b, N={N}) ===")
     f = build_func_windowed(N)
@@ -290,6 +362,8 @@ def main() -> int:
     print()
     rc |= run_fastwindowedsum(args.target, args.time_length, args.num_stocks,
                                 args.window)
+    print()
+    rc |= run_multipartition(args.target, args.time_length, args.num_stocks)
     return rc
 
 

From 97fc8a07a129e99a517d39c34ce0ef27b31e35ca Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 12 May 2026 00:38:37 -0700
Subject: [PATCH 17/59] enable cs_rank

---
 KunQuant/jit/cuda.py                  |  39 ++-
 KunQuant/passes/CodegenMLIR.py        |  84 +++++-
 mlir/CMakeLists.txt                   |  14 +-
 mlir/include/KunCuda/Runtime.h        |  36 ++-
 mlir/include/KunIr/KunIrOps.td        |  24 +-
 mlir/lib/KunCuda/CMakeLists.txt       | 109 +++++++-
 mlir/lib/KunCuda/EmbedFile.cmake      |  48 ++++
 mlir/lib/KunCuda/Runtime.cpp          | 381 +++++++++++++++++++++-----
 mlir/lib/KunCuda/kernels/cs_rank.cu   | 123 +++++++++
 mlir/lib/KunGpu/PtxBackend.cpp        |  12 +-
 mlir/lib/KunIr/KunIrOps.cpp           |   9 +-
 mlir/lib/KunIr/KunIrToKunGpu.cpp      |   8 +-
 mlir/lib/Python/IRBuilder.cpp         |   2 -
 mlir/lib/Python/MlirBinding.cpp       |  58 +++-
 mlir/test/kunir/basic.mlir            |  11 -
 mlir/test/python/test_cs_rank_cuda.py | 265 ++++++++++++++++++
 16 files changed, 1078 insertions(+), 145 deletions(-)
 create mode 100644 mlir/lib/KunCuda/EmbedFile.cmake
 create mode 100644 mlir/lib/KunCuda/kernels/cs_rank.cu
 create mode 100644 mlir/test/python/test_cs_rank_cuda.py

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index be6e993..3076d3d 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -181,21 +181,34 @@ def _run_full_pipeline(f: Function, cfg: CudaCompilerConfig):
     return impl
 
 
-def _translate_partitions(impl, cfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
+def _translate_partitions(impl, cfg: CudaCompilerConfig):
     """Emit one kunir.func per partitioned Function into a single
     KunMLIR module (single `gpu.module` with N siblings).  Cross-
     partition buffers stitch up automatically because each impl's
     Input/Output names match the producing/consuming partition's
-    Output/Input names."""
+    Output/Input names.
+
+    Cross-sectional partitions (currently: cs_rank) bypass the kunir
+    pipeline entirely — `translate_function` returns a descriptor and
+    we collect those into `external_kernels`, which the C++ side
+    appends to the executable's kernel list without ever generating
+    LLVM IR / PTX for them.
+
+    Returns (ModuleOp, list[dict]) — the second element is the list
+    of external-kernel descriptors to forward to KunMLIR.compile.
+    """
     target = TargetSpec(occupancy=cfg.occupancy,
                           warps_per_cta=cfg.warps_per_cta,
                           smem_size=cfg.smem_size,
                           vector_size=cfg.vector_size)
     ir = KunMLIR.IRBuilder()
     dtype = _to_dtype_token(cfg.dtype)
+    externals = []
     for sub in impl:
-        translate_function(sub, target, ir, dtype=dtype)
-    return ir.finish()
+        ext = translate_function(sub, target, ir, dtype=dtype)
+        if ext is not None:
+            externals.append(ext)
+    return ir.finish(), externals
 
 
 def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
@@ -221,7 +234,7 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
 
     graph_inputs, graph_outputs = _graph_io_names(f)
     impl = _run_full_pipeline(f, cfg)
-    mod  = _translate_partitions(impl, cfg)
+    mod, externals = _translate_partitions(impl, cfg)
 
     return KunMLIR.compile(
         mod,
@@ -230,13 +243,23 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
         gpu_arch=cfg.gpu_arch,
         opt_level=cfg.opt_level,
         toolkit_path=toolkit_path,
+        external_kernels=externals,
+        # Forwarded for the no-JIT-kernel case: when every partition
+        # is external (e.g. a graph that is just `cs_rank(a)`), the
+        # MLIR module is empty and `data.warpsPerCta` would otherwise
+        # default to 1 — but the cs_rank launch uses it to size
+        # blockDim, so feed the config value through.
+        warps_per_cta=cfg.warps_per_cta,
     )
 
 
 def to_mlir(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
     """Run the same passes + translator as `compileit`, but return the
-    KunMLIR module before PTX/CUBIN.  Useful for debugging the IR.
-    Mutates `f` in place (same as `compileit`)."""
+    KunMLIR module before PTX/CUBIN.  External (cs_rank) partitions
+    are absent from the returned module — they never become kunir
+    ops.  Useful for debugging the IR.  Mutates `f` in place (same
+    as `compileit`)."""
     _graph_io_names(f)              # raises if no Input / Output ops
     impl = _run_full_pipeline(f, cfg)
-    return _translate_partitions(impl, cfg)
+    mod, _externals = _translate_partitions(impl, cfg)
+    return mod
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index a6dcae8..7e14a90 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -22,7 +22,7 @@
 
 from KunQuant.Op import (
     OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
-    ReductionOp, Rank,
+    ReductionOp, SimpleCrossSectionalOp,
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Sign,
@@ -41,7 +41,10 @@
     Max: "max", Min: "min",
 }
 _UNARY = {
-    Abs: "abs", Log: "log", Sign: "sign", Rank: "cs_rank",
+    Abs: "abs", Log: "log", Sign: "sign",
+    # NOTE: `Rank` is intentionally absent.  Cross-sectional rank
+    # partitions are routed to a pre-compiled CUmodule by
+    # `_maybe_external_partition` below; they never become kunir ops.
 }
 _REDUCE = {
     ReduceAdd: "reduce_add", ReduceMul: "reduce_mul",
@@ -63,6 +66,25 @@ def __init__(self, *, occupancy: int = 1, warps_per_cta: int = 4,
 
 # ── Helpers ─────────────────────────────────────────────────────────
 
+def _kunir_symbol(name: str) -> str:
+    """Coerce a partition name into a valid kunir / PTX symbol.
+
+    The partitioner derives a partition's name from the names of its
+    Output ops; when a partition is "intermediate-only" (every output
+    is consumed by a downstream partition, none is a user-facing
+    Output), those names come from `OpBase.hash_hex` which starts with
+    a digit half the time.  Digits are fine for buffer-table keys
+    (CPU runtime indexes by name) but ptxas rejects them as
+    `.entry` symbols.
+
+    Prefix any such name with a single `_` so the kunir.func symbol
+    is always a valid identifier, while leaving `input_names` /
+    `output_names` (the public buffer-table keys) untouched.
+    """
+    if name and name[0].isdigit():
+        return "_" + name
+    return name
+
 def _index_loop_members(f: Function) -> Tuple[
         Dict[ForeachBackWindow, List[OpBase]],
         Dict[ForeachBackWindow, List[ReductionOp]]]:
@@ -117,14 +139,62 @@ def _emit_reduction(op: ReductionOp, ir, val_map: Dict[OpBase, object]):
 
 # ── Main entry point ────────────────────────────────────────────────
 
+def _maybe_external_partition(f: Function, dtype: str):
+    """If `f` is a partition the GPU runtime handles as a pre-compiled
+    external kernel (bundled PTX loaded as a separate CUmodule), return
+    a descriptor dict that KunMLIR.compile() should append to the
+    executable's kernel list.  Otherwise return None.
+
+    The descriptor matches what KunMLIR.compile's `external_kernels=`
+    parameter expects:
+        {"name": <str>, "kind": <str>,
+         "inputs": [<str>...], "outputs": [<str>...]}
+
+    Detection mirrors CodegenCpp's "simple cross-sectional fast path"
+    (CodegenCpp.codegen_cpp's `len(f.ops) == 3` check): a partition
+    whose only compute op is a `SimpleCrossSectionalOp` (Rank, Scale,
+    …).  The partitioner places every CrossSectionalOp into its own
+    partition without other compute, so this shape is what we get.
+
+    The `kind` string is `cs_<lowercased class name>_f{32,64}`, e.g.
+    `cs_rank_f32`, `cs_scale_f64`.  The C++ binding maps it to a
+    `KernelKind` enum; unknown kinds raise there with a clear error,
+    so adding a new SimpleCrossSectionalOp on the Python side does
+    not silently succeed without a matching bundled PTX kernel.
+    """
+    compute = [op for op in f.ops
+                if not isinstance(op, (Input, Output))]
+    if len(compute) != 1 or not isinstance(compute[0], SimpleCrossSectionalOp):
+        return None
+    inputs  = [op for op in f.ops if isinstance(op, Input)]
+    outputs = [op for op in f.ops if isinstance(op, Output)]
+    if len(inputs) != 1 or len(outputs) != 1:
+        return None  # surprising shape, let the regular path emit an error
+    if dtype not in ("f32", "f64"):
+        return None
+    op_kind = compute[0].__class__.__name__.lower()
+    return {
+        "name":    f.name or f"cs_{op_kind}",
+        "kind":    f"cs_{op_kind}_{dtype}",
+        "inputs":  [op.attrs["name"] for op in inputs],
+        "outputs": [op.attrs["name"] for op in outputs],
+    }
+
+
 def translate_function(f: Function, target: TargetSpec, ir,
                         dtype: str = "f32"):
     """Emit `f` as a single kunir.func into the open `ir` (KunMLIR.IRBuilder).
 
-    Returns the list of (input_name, output_name) declared on the func,
-    so the caller can pass them straight to KunMLIR.compile() as
-    graph_inputs / graph_outputs.
+    If `f` is an externally-dispatched partition (e.g. a single cs_rank
+    op handled by the bundled cs_rank.ptx CUmodule), emit nothing into
+    the IRBuilder and return its descriptor dict so the caller can pass
+    it to KunMLIR.compile()'s `external_kernels=` list.  Otherwise
+    return `None` after emitting a kunir.func.
     """
+    ext = _maybe_external_partition(f, dtype)
+    if ext is not None:
+        return ext
+
     # 1.  Boundary ops in topo order — the kunir.func's I/O.
     inputs:  List[Input]  = [op for op in f.ops if isinstance(op, Input)]
     outputs: List[Output] = [op for op in f.ops if isinstance(op, Output)]
@@ -147,7 +217,7 @@ def translate_function(f: Function, target: TargetSpec, ir,
     ts_1   = ir.ts_type(dtype, 1)
 
     func_args = ir.begin_func(
-        name=f.name or "kernel",
+        name=_kunir_symbol(f.name or "kernel"),
         input_types=[ts_inf] * len(inputs),
         input_names=in_names,
         output_names=out_names,
@@ -190,7 +260,7 @@ def translate_function(f: Function, target: TargetSpec, ir,
     # 5.  Close the function with Outputs in declared order.
     return_values = [val_map[o.inputs[0]] for o in outputs]
     ir.end_func(return_values)
-    return in_names, out_names
+    return None
 
 
 def _emit_loop(loop: ForeachBackWindow, ir, val_map, ts_1,
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index b3a1598..fc04575 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -8,7 +8,19 @@ include_directories(${MLIR_INCLUDE_DIRS})
 include_directories(${KUN_MLIR_SOURCE_DIR}/include)
 include_directories(${KUN_MLIR_BINARY_DIR}/include)
 
-add_definitions(${LLVM_DEFINITIONS})
+# LLVMConfig.cmake exports LLVM_DEFINITIONS as a single space-separated
+# *string* (e.g. "-D_GNU_SOURCE -D_GLIBCXX_USE_CXX11_ABI=1 ...").
+# Plain `add_definitions(${LLVM_DEFINITIONS})` works most of the time,
+# but an unquoted expansion is split on `;` only (not on whitespace), so
+# the whole string arrives as ONE argument and some CMake versions cache
+# it as a single COMPILE_DEFINITIONS entry, which then re-emits as one
+# malformed `-D_GLIBCXX_USE_CXX11_ABI="1 -D__STDC_CONSTANT_MACROS ..."`
+# flag — duplicating the macro and producing the
+# "_GLIBCXX_USE_CXX11_ABI redefined" warning.
+# Tokenise the string explicitly first, then add the tokens as compile options.
+separate_arguments(_KUN_LLVM_DEFS UNIX_COMMAND "${LLVM_DEFINITIONS}")
+add_compile_options(${_KUN_LLVM_DEFS})
+unset(_KUN_LLVM_DEFS)
 
 # The MLIR backend requires LLVM to have been built with the NVPTX target
 # — we emit PTX and load cubins.  Fail loudly here so the diagnostic is
diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index dd47090..8ff5d47 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -57,11 +57,25 @@ struct GraphPlan;
 // Compile-time output (all names — runtime resolves them to indices)
 //===----------------------------------------------------------------------===//
 
+/// Kernel dispatch kind.  `Jit` kernels live in the cubin produced by
+/// the MLIR pipeline and are launched with the project-wide stock-major
+/// grid (block_x = warps_per_cta * 32, grid_x = ceil(S / (block_x * vector_size))).
+/// `ExtCsRank*` kernels are pre-compiled PTX bundled inside
+/// libKunCudaRuntime; the executor lazy-loads them as a second
+/// CUmodule and launches them with a time-major grid + dynamic shared
+/// memory sized to the cross-section (one CTA per timestep).
+enum class KernelKind : int32_t {
+  Jit          = 0,
+  ExtCsRankF32 = 1,
+  ExtCsRankF64 = 2,
+};
+
 /// Per-kernel metadata, in name form.  This is what the compiler can
 /// produce by walking a single lowered llvm.func — no graph topology
 /// reasoning required.
 struct KernelMeta {
-  std::string kernelName;                    ///< symbol in the cubin
+  std::string kernelName;                    ///< cubin symbol (Jit); label only for ExtCsRank* (their PTX symbol is fixed)
+  KernelKind kind = KernelKind::Jit;         ///< Jit by default; external kinds presumably come from the frontend's external_kernels list (confirm)
   std::vector<std::string> inputNames;       ///< kungpu.input_names, in argv order
   std::vector<std::string> outputNames;      ///< kungpu.output_names, in argv order
 };
@@ -149,11 +163,19 @@ class Executable {
   ///   block_x = warps_per_cta * 32
   ///   grid_x  = ceil_div(numStocks, block_x * vector_size)
   ///
+  /// `devMaxSmemBytes` is the device's MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+  /// cached by the caller (`Executor`) so the runtime can validate
+  /// `num_stocks * sizeof(T)` against the GPU's smem cap before
+  /// invoking cuLaunchKernel for external cs_rank kernels.  Pass 0 if
+  /// there are no external kernels in the executable (the check is a
+  /// no-op in that case).
+  ///
   /// Throws std::runtime_error on validation or driver errors.  This is
   /// a low-level entry point — most users go through `Executor::runGraph`.
   void launchOnStream(int64_t timeLength, int64_t numStocks,
                        const std::vector<std::pair<std::string, uintptr_t>> &args,
-                       CUstream stream);
+                       CUstream stream,
+                       int devMaxSmemBytes);
 
 private:
   /// Allocate (or re-allocate, if shape changed) the intermediate slot
@@ -166,6 +188,10 @@ class Executable {
   std::unique_ptr<GraphPlan> plan_;          ///< pImpl — defined in Runtime.cpp
 
   CUmodule cuModule_ = nullptr;
+  /// Module holding the pre-compiled cs_rank PTX.  Loaded at
+  /// construction time iff any kernel has `kind != Jit`; null
+  /// otherwise.
+  CUmodule csRankModule_ = nullptr;
   std::vector<CUfunction> cuFuncs_;          ///< parallel to data_.kernels
 
   // Lazily allocated intermediate buffers, one CUdeviceptr per slot
@@ -220,9 +246,15 @@ class Executor {
 
   /// Raw stream handle (default-stream Executor returns nullptr).
   CUstream stream() const noexcept { return stream_; }
+  /// Cached MAX_SHARED_MEMORY_PER_BLOCK_OPTIN of the device this
+  /// Executor's CUcontext is bound to, queried once at construction.
+  /// Used to validate cs_rank dynamic-smem requests at launch time
+  /// without a per-launch driver call.
+  int devMaxSmemBytes() const noexcept { return devMaxSmemBytes_; }
 
 private:
   CUstream stream_ = nullptr;
+  int devMaxSmemBytes_ = 0;
 };
 
 } // namespace kun_cuda
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 4dcd4d2..cedfec5 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -86,27 +86,13 @@ def KunIr_SignOp : KunIr_UnaryElemwiseOp<"sign"> {
 //===----------------------------------------------------------------------===//
 // Cross-sectional ops
 //
-// Operates time-step-by-time-step across all stocks. Result has maxLookback=1.
+// kunir has no first-class cross-sectional op.  Cross-sectional kernels
+// (currently only cs_rank) are routed by the Python frontend
+// (KunQuant.passes.CodegenMLIR._maybe_external_partition) directly to a
+// pre-compiled CUmodule bundled with the runtime — they never appear
+// in kunir IR or the MLIR pipeline.
 //===----------------------------------------------------------------------===//
 
-def KunIr_CsRankOp : KunIr_Op<"cs_rank", [
-    Pure,
-    InferTypeOpInterface,
-    NativeOpTrait<"KunIrElemwiseTsResultType">
-]> {
-  let summary = "Cross-sectional rank of a time series";
-  let description = [{
-    For each time step t, computes the rank of each stock's value among
-    all stocks at time t. Output values are in the range [0, 1].
-    The input may have any maxLookback; the result always has maxLookback = 1.
-  }];
-  let arguments = (ins KunIr_AnyTs:$input);
-  let results = (outs KunIr_AnyTs:$result);
-  let hasVerifier = 1;
-  let assemblyFormat =
-    "$input `:` type($input) attr-dict";
-}
-
 //===----------------------------------------------------------------------===//
 // WindowedOutput op
 //
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index 245d10c..434d1b6 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -3,7 +3,97 @@
 # struct, the Executable class (cuModuleLoadData / cuLaunchKernel) and
 # nothing else.
 
-add_library(KunCudaRuntime SHARED Runtime.cpp)
+# ── Locate the CUDA toolkit ──────────────────────────────────────────
+# Standard CMake CUDA discovery.  Honours, in order:
+#   1. -DCUDAToolkit_ROOT=<path> on the cmake command line
+#   2. $CUDAToolkit_ROOT, $CUDA_PATH, $CUDA_HOME env vars
+#   3. Standard install locations (/usr/local/cuda, …)
+# See the CMake docs for FindCUDAToolkit — same module the rest of the
+# ecosystem uses, no custom validation needed.
+#
+# We then point CMake's CUDA-language support at the same nvcc the
+# find_package result exposed, so `enable_language(CUDA)` doesn't pick
+# up a different toolkit off PATH.
+find_package(CUDAToolkit REQUIRED)
+if(NOT CMAKE_CUDA_COMPILER)
+  set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}" CACHE FILEPATH
+      "nvcc used by CMake's CUDA-language support.")
+endif()
+enable_language(CUDA)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+# sm_75 (Turing) baseline.  cs_rank — and any other bundled kernel
+# we add to kernels/ — uses no arch-specific instructions, so sm_75
+# covers every NVIDIA GPU CUDA 13 still supports.  Later GPUs run
+# the PTX via driver-JIT (cached system-wide in ~/.nv/ComputeCache).
+set(CMAKE_CUDA_ARCHITECTURES 75)
+
+get_filename_component(_kun_toolkit_root "${CUDAToolkit_BIN_DIR}" DIRECTORY)
+message(STATUS
+    "KunCudaRuntime: CUDA toolkit = ${_kun_toolkit_root} "
+    "(version ${CUDAToolkit_VERSION})")
+
+# ── Recipe: compile `.cu` → `.ptx` → embed as a C array ──────────────
+# Drop a new `kernels/<name>.cu` and the foreach below picks it up —
+# the kernel becomes available to Runtime.cpp via
+# `#include "<name>_ptx.inc"` exposing `kun_<name>_ptx[]`.
+#
+# nvcc stamps a `.version` directive on every emitted PTX matching
+# its own toolkit release, which the deployed CUDA driver may not
+# yet understand.  Our kernels here all use baseline features
+# available since PTX 7.x, so EmbedFile.cmake's PTX_VERSION knob
+# rewrites the `.version` directive to 7.8 before embedding —
+# accepted by every driver from R520 (CUDA 11.8) onwards.  Same
+# header-rewrite trick Triton uses when its LLVM emits new ISA.
+set(_kun_ptx_embed_includes "")
+set(_kun_embed_cmake "${CMAKE_CURRENT_SOURCE_DIR}/EmbedFile.cmake")
+
+function(kun_add_bundled_ptx_kernel cu_path)
+  get_filename_component(_stem "${cu_path}" NAME_WE)
+  set(_obj_tgt "KunPtxObj_${_stem}")
+  set(_inc     "${CMAKE_CURRENT_BINARY_DIR}/${_stem}_ptx.inc")
+  set(_symbol  "kun_${_stem}_ptx")
+
+  # Native CUDA OBJECT lib with CUDA_PTX_COMPILATION → CMake invokes
+  # nvcc with `-ptx`; output is one .ptx per source.
+  add_library(${_obj_tgt} OBJECT "${cu_path}")
+  set_target_properties(${_obj_tgt} PROPERTIES
+      CUDA_PTX_COMPILATION ON)
+  target_compile_options(${_obj_tgt} PRIVATE
+      $<$<COMPILE_LANGUAGE:CUDA>:-O3>)
+
+  # Embed step — `$<TARGET_OBJECTS:tgt>` expands to the .ptx path(s).
+  # An OBJECT-library name in DEPENDS only orders the build; list the
+  # .ptx file itself too so the .inc is regenerated when the .cu changes.
+  add_custom_command(
+      OUTPUT  "${_inc}"
+      COMMAND "${CMAKE_COMMAND}"
+              -DINPUT=$<TARGET_OBJECTS:${_obj_tgt}>
+              -DOUTPUT=${_inc}
+              -DSYMBOL=${_symbol}
+              -DPTX_VERSION=7.8
+              -P "${_kun_embed_cmake}"
+      DEPENDS ${_obj_tgt} $<TARGET_OBJECTS:${_obj_tgt}> "${_kun_embed_cmake}"
+      COMMENT "Embedding ${_stem}.ptx as ${_symbol}[] (downgrading to ISA 7.8)"
+      VERBATIM
+      COMMAND_EXPAND_LISTS)
+
+  # Append the generated header to the caller's list.  The function sees
+  # a copy of the parent's variable, so append locally and write it back.
+  set(_local "${_kun_ptx_embed_includes}")
+  list(APPEND _local "${_inc}")
+  set(_kun_ptx_embed_includes "${_local}" PARENT_SCOPE)
+endfunction()
+
+# All bundled kernels live in kernels/.  CONFIGURE_DEPENDS makes
+# ninja/make re-run cmake when a new .cu is added — no manual re-config.
+file(GLOB _kun_cu_sources CONFIGURE_DEPENDS
+     "${CMAKE_CURRENT_SOURCE_DIR}/kernels/*.cu")
+foreach(_cu IN LISTS _kun_cu_sources)
+  kun_add_bundled_ptx_kernel("${_cu}")
+endforeach()
+
+add_library(KunCudaRuntime SHARED Runtime.cpp ${_kun_ptx_embed_includes})
 
 # Project-wide compile flags set -fvisibility=hidden + inlines-hidden to
 # minimise the size of MLIR static libs.  This shared runtime needs to
@@ -22,14 +112,15 @@ set_target_properties(KunCudaRuntime PROPERTIES
 target_include_directories(KunCudaRuntime PUBLIC
     "${PROJECT_SOURCE_DIR}/mlir/include")
 
-# CUDA Driver API (cuda.h + libcuda stub).  Resolved via CMake's
-# standard FindCUDAToolkit — pass -DCUDAToolkit_ROOT=<path> or set
-# $CUDA_PATH if the toolkit is not in a default search location.
-# The stub is only used at link time; the runtime loader resolves
-# libcuda.so.1 from the NVIDIA driver.  This dep is PRIVATE:
-# downstream consumers see only Runtime.h, which never includes
-# <cuda.h>.
-find_package(CUDAToolkit REQUIRED)
+# Runtime.cpp #include's the generated <kernel>_ptx.inc files.
+target_include_directories(KunCudaRuntime PRIVATE
+    "${CMAKE_CURRENT_BINARY_DIR}")
+
+# CUDA Driver API.  `CUDA::cuda_driver` is FindCUDAToolkit's imported
+# target wrapping `lib64/stubs/libcuda.so` with the right INCLUDE
+# INTERFACE — gets cuda.h + the link-time stub in one go.  This dep
+# is PRIVATE: downstream consumers see only Runtime.h, which never
+# includes <cuda.h>.
 target_link_libraries(KunCudaRuntime PRIVATE CUDA::cuda_driver)
 
 # Like the Python module, this shared library has no CPython symbols to
diff --git a/mlir/lib/KunCuda/EmbedFile.cmake b/mlir/lib/KunCuda/EmbedFile.cmake
new file mode 100644
index 0000000..07eed3e
--- /dev/null
+++ b/mlir/lib/KunCuda/EmbedFile.cmake
@@ -0,0 +1,48 @@
+# Embed a binary file as a C++ unsigned-char array.
+#
+# Usage: cmake -DINPUT=foo.ptx -DOUTPUT=foo_ptx.inc -DSYMBOL=kun_cs_rank_ptx
+#              [-DPTX_VERSION=7.8]   # optional: rewrite the `.version` directive
+#              -P EmbedFile.cmake
+#
+# Produces (in OUTPUT):
+#   static const unsigned char SYMBOL[] = { 0x12, 0x34, ..., 0x00 };
+#   static const unsigned int  SYMBOL_len = N;   // size of INPUT, excludes the NUL
+#
+# If PTX_VERSION is set, the input is read as text and its
+# `.version X.Y` directive is replaced before encoding — useful when nvcc
+# emits a newer ISA than the deployed CUDA driver supports.
+
+if(NOT INPUT OR NOT OUTPUT OR NOT SYMBOL)
+  message(FATAL_ERROR "EmbedFile.cmake: INPUT, OUTPUT and SYMBOL must all be set")
+endif()
+
+if(PTX_VERSION)
+  file(READ "${INPUT}" text_content)
+  # `\t` must be a real tab here: inside [] CMake's regex treats a backslash literally.
+  string(REGEX REPLACE "\\.version[ \t]+[0-9.]+" ".version ${PTX_VERSION}"
+                       text_content "${text_content}")
+  set(_patched "${OUTPUT}.raw.ptx")
+  file(WRITE "${_patched}" "${text_content}")
+  file(READ "${_patched}" hex_content HEX)
+  file(REMOVE "${_patched}")
+else()
+  file(READ "${INPUT}" hex_content HEX)
+endif()
+string(LENGTH "${hex_content}" hex_len)
+math(EXPR n_bytes "${hex_len} / 2")
+
+# "abcd" → "0xab,0xcd,"
+string(REGEX REPLACE "(..)" "0x\\1," byte_list "${hex_content}")
+# Terminate with a NUL (not counted in _len): the CUDA driver reads PTX images as C strings.
+set(byte_list "${byte_list}0x00")
+# Insert a newline every 16 bytes to keep the generated file diffable.
+string(REGEX REPLACE "(0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,)"
+                     "\\1\n  " byte_list "${byte_list}")
+
+file(WRITE "${OUTPUT}"
+"// Generated from \"${INPUT}\".  Do not edit by hand.
+static const unsigned char ${SYMBOL}[] = {
+  ${byte_list}
+};
+static const unsigned int ${SYMBOL}_len = ${n_bytes};
+")
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index c998c19..fdffa05 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -26,6 +26,10 @@
 #include <stdexcept>
 #include <unordered_map>
 
+// Pre-compiled cs_rank PTX, embedded by EmbedFile.cmake.  Exposes
+// `kun_cs_rank_ptx[]` (bytes) and `kun_cs_rank_ptx_len`.
+#include "cs_rank_ptx.inc"
+
 namespace kun_cuda {
 
 //===----------------------------------------------------------------------===//
@@ -347,6 +351,242 @@ SlotPlan planSlots(const std::vector<int> &launchOrder,
   return plan;
 }
 
+//===----------------------------------------------------------------------===//
+// Launch helpers — pure functions used by launchOnStream below.
+//===----------------------------------------------------------------------===//
+
+/// Translate the user-supplied {name → device_ptr} args dict into a
+/// flat buffer-index → pointer array, plug in the executable-owned
+/// intermediate-slot pointers, and verify every graph_input /
+/// graph_output the plan expects was provided.  Throws on unknown or
+/// missing names.
+static std::vector<uintptr_t> resolveBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    const std::vector<uintptr_t> &slotBufs) {
+  std::vector<uintptr_t> bufPtrs(plan.numBuffers, 0);
+  std::vector<bool>      filled(plan.numBuffers, false);
+
+  for (const auto &kv : args) {
+    auto itIn  = plan.graphInputIdx.find(kv.first);
+    auto itOut = plan.graphOutputIdx.find(kv.first);
+    int idx = -1;
+    if (itIn != plan.graphInputIdx.end())
+      idx = itIn->second;
+    else if (itOut != plan.graphOutputIdx.end())
+      idx = itOut->second;
+    else
+      throw std::runtime_error(
+          "kun_cuda::launchOnStream: unexpected argument '" + kv.first +
+          "' (expected: " + joinNames(data.graphInputs) + " | " +
+          joinNames(data.graphOutputs) + ")");
+    bufPtrs[idx] = kv.second;
+    filled[idx] = true;
+  }
+
+  // Confirm every graph_input + graph_output was supplied.
+  for (int i = 0; i < plan.numGraphInputs + plan.numGraphOutputs; ++i) {
+    if (filled[i]) continue;
+    std::string missing;
+    for (auto &kv : plan.graphInputIdx)  if (kv.second == i) missing = kv.first;
+    if (missing.empty())
+      for (auto &kv : plan.graphOutputIdx) if (kv.second == i) missing = kv.first;
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: missing argument '" + missing + "'");
+  }
+
+  // Intermediates: index into the pre-allocated slot pool.
+  for (int i = plan.numGraphInputs + plan.numGraphOutputs;
+        i < plan.numBuffers; ++i) {
+    int slot = plan.intermediateBufToSlot[i];
+    bufPtrs[i] = slotBufs[slot];
+  }
+  return bufPtrs;
+}
+
+/// Stock-major launch: block_x = warps_per_cta*32, grid_x =
+/// ceil(numStocks / (block_x * vector_size)), no dynamic smem.
+static void launchJitKernel(CUfunction fn,
+                              int64_t numStocks,
+                              int64_t warpsPerCta, int64_t vectorSize,
+                              void **args, CUstream stream) {
+  unsigned blockX = static_cast<unsigned>(warpsPerCta * 32);
+  uint64_t stocksPerBlock =
+      static_cast<uint64_t>(blockX) * static_cast<uint64_t>(vectorSize);
+  unsigned gridX = static_cast<unsigned>(
+      (static_cast<uint64_t>(numStocks) + stocksPerBlock - 1) /
+      stocksPerBlock);
+  // sharedMemBytes = 0 — JIT'd kernels declare static smem via
+  // llvm.mlir.global addr_space=3; the dynamic-smem launch parameter
+  // does not apply.
+  checkCu(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1,
+                           /*sharedMemBytes=*/0, stream, args, nullptr),
+           "cuLaunchKernel");
+}
+
+/// External cs_rank launch: block_x = warps_per_cta*32, grid_x =
+/// time_length (one CTA per timestep), sharedMemBytes = numStocks *
+/// sizeof(T).  Checks the request against the cached device cap so
+/// we fail with a clear, GPU-aware message instead of letting
+/// cuLaunchKernel emit its generic error.
+static void launchExtCsRankKernel(CUfunction fn, KernelKind kind,
+                                    const std::string &kernelName,
+                                    int64_t timeLength, int64_t numStocks,
+                                    int64_t warpsPerCta,
+                                    int devMaxSmemBytes,
+                                    void **args, CUstream stream) {
+  size_t elemSize = (kind == KernelKind::ExtCsRankF64) ? 8u : 4u;
+  uint64_t smemBytes64 =
+      static_cast<uint64_t>(numStocks) * static_cast<uint64_t>(elemSize);
+
+  if (devMaxSmemBytes <= 0)
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: external cs_rank kernel '" + kernelName +
+        "' requires Executor's devMaxSmemBytes to be set; got 0.  "
+        "Construct the Executable through Executor::runGraph, or pass "
+        "devMaxSmemBytes when calling launchOnStream directly.");
+  if (smemBytes64 > static_cast<uint64_t>(devMaxSmemBytes))
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: cs_rank dynamic smem "
+        "(num_stocks=" + std::to_string(numStocks) +
+        " * sizeof(T)=" + std::to_string(elemSize) + " = " +
+        std::to_string(smemBytes64) +
+        " bytes) exceeds this GPU's MAX_SHARED_MEMORY_PER_BLOCK_OPTIN (" +
+        std::to_string(devMaxSmemBytes) +
+        " bytes).  Reduce num_stocks or run on a GPU with a larger smem budget.");
+
+  if (timeLength <= 0)
+    return; // empty time chunk — nothing to launch
+
+  unsigned blockX    = static_cast<unsigned>(warpsPerCta * 32);
+  unsigned gridX     = static_cast<unsigned>(timeLength);
+  unsigned smemBytes = static_cast<unsigned>(smemBytes64);
+  checkCu(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1,
+                           smemBytes, stream, args, nullptr),
+           "cuLaunchKernel(cs_rank)");
+}
+
+//===----------------------------------------------------------------------===//
+// Kernel-module / kernel-symbol helpers — read ExecutableData, mutate
+// the CUmodule and CUfunction handles the ctor is populating.
+//===----------------------------------------------------------------------===//
+
+/// Load the JIT'd cubin if non-empty; otherwise sanity-check that no
+/// kernel actually needs it (every `kind == Jit` requires a cubin).
+static void loadJitCubin(const ExecutableData &data, CUmodule &outModule) {
+  if (!data.cubin.empty()) {
+    checkCu(cuModuleLoadData(&outModule, data.cubin.data()),
+             "cuModuleLoadData");
+    return;
+  }
+  for (const auto &k : data.kernels)
+    if (k.kind == KernelKind::Jit)
+      throw std::runtime_error(
+          "kun_cuda::Executable: JIT kernel '" + k.kernelName +
+          "' declared but no cubin supplied — this is a compile-side bug");
+}
+
+/// Lazy-load the bundled cs_rank PTX as a second CUmodule iff any
+/// kernel uses it.  The driver JITs PTX → SASS on first load (cached
+/// system-wide in ~/.nv/ComputeCache), so this is sub-ms after the
+/// first run on a given GPU.
+static void loadCsRankPtxIfNeeded(const std::vector<KernelMeta> &kernels,
+                                    CUmodule &outModule) {
+  for (const auto &k : kernels) {
+    if (k.kind != KernelKind::Jit) {
+      checkCu(cuModuleLoadData(&outModule, kun_cs_rank_ptx),
+               "cuModuleLoadData(cs_rank.ptx)");
+      return;
+    }
+  }
+}
+
+/// Pick the right CUmodule + symbol name for a kernel and resolve it.
+static CUfunction resolveOneKernelSymbol(const KernelMeta &k,
+                                          CUmodule jitModule,
+                                          CUmodule csRankModule) {
+  CUmodule mod = nullptr;
+  const char *symbol = nullptr;
+  switch (k.kind) {
+    case KernelKind::Jit:
+      mod = jitModule;
+      symbol = k.kernelName.c_str();
+      break;
+    case KernelKind::ExtCsRankF32:
+      mod = csRankModule;
+      symbol = "kun_cs_rank_f32";
+      break;
+    case KernelKind::ExtCsRankF64:
+      mod = csRankModule;
+      symbol = "kun_cs_rank_f64";
+      break;
+  }
+  CUfunction fn = nullptr;
+  checkCu(cuModuleGetFunction(&fn, mod, symbol),
+           "cuModuleGetFunction");
+  return fn;
+}
+
+/// Opt every external (non-Jit) function into the device's full
+/// dynamic-smem budget up-front.  The attribute is purely a permission
+/// cap — raising it doesn't change the carveout or per-launch smem
+/// cost, so we do it eagerly here rather than per-launch.  No-op if
+/// there are no external kernels.
+static void optInExternalSmemMax(const std::vector<KernelMeta> &kernels,
+                                   const std::vector<CUfunction> &funcs) {
+  bool anyExternal = false;
+  for (const auto &k : kernels)
+    if (k.kind != KernelKind::Jit) { anyExternal = true; break; }
+  if (!anyExternal)
+    return;
+
+  CUdevice dev = 0;
+  checkCu(cuCtxGetDevice(&dev), "cuCtxGetDevice");
+  int maxOptIn = 0;
+  checkCu(cuDeviceGetAttribute(
+              &maxOptIn,
+              CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev),
+           "cuDeviceGetAttribute(MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)");
+  for (size_t i = 0; i < funcs.size(); ++i) {
+    if (kernels[i].kind == KernelKind::Jit) continue;
+    checkCu(cuFuncSetAttribute(
+                funcs[i],
+                CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+                maxOptIn),
+             "cuFuncSetAttribute(MAX_DYNAMIC_SHARED_SIZE_BYTES)");
+  }
+}
+
+/// Per-kernel-kind I/O arity check.  External cs_rank kernels have a
+/// fixed signature `(T_in, T_out)` — the kernel signature is set in
+/// stone by `kernels/cs_rank.cu`, so we know the wiring is wrong (not
+/// just unusual) the moment we see any other shape.  Static property
+/// of the graph, so done at construction.
+static void validateKernelIO(const std::vector<KernelMeta> &kernels,
+                               const std::vector<std::vector<int>> &kernelInputBufs,
+                               const std::vector<std::vector<int>> &kernelOutputBufs) {
+  for (size_t i = 0; i < kernels.size(); ++i) {
+    const auto &k    = kernels[i];
+    const size_t nIn = kernelInputBufs[i].size();
+    const size_t nOut = kernelOutputBufs[i].size();
+    switch (k.kind) {
+      case KernelKind::Jit:
+        // JIT kernels can have any arity — they're whatever the MLIR
+        // pipeline emitted.
+        break;
+      case KernelKind::ExtCsRankF32:
+      case KernelKind::ExtCsRankF64:
+        if (nIn != 1 || nOut != 1)
+          throw std::runtime_error(
+              "kun_cuda::Executable: cs_rank kernel '" + k.kernelName +
+              "' must have exactly 1 input and 1 output (have " +
+              std::to_string(nIn) + " / " + std::to_string(nOut) + ")");
+        break;
+    }
+  }
+}
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -395,14 +635,24 @@ Executable::Executable(ExecutableData &&data) : data_(std::move(data)) {
   plan_->intermediateBufToSlot = std::move(slots.intermediateBufToSlot);
   plan_->peakIntermediateSlots = slots.peakIntermediateSlots;
 
-  // ── Load the cubin and resolve every kernel symbol ───────────────
-  checkCu(cuModuleLoadData(&cuModule_, data_.cubin.data()),
-           "cuModuleLoadData");
+  // ── Per-kernel I/O arity validation ──────────────────────────────
+  // Catches mis-wired external kernels (which have a fixed signature)
+  // at construction time, well before the launch path.
+  validateKernelIO(data_.kernels,
+                    plan_->kernelInputBufs, plan_->kernelOutputBufs);
+
+  // ── Load cubin(s) + resolve every kernel symbol ──────────────────
+  loadJitCubin(data_, cuModule_);
+  loadCsRankPtxIfNeeded(data_.kernels, csRankModule_);
+
   cuFuncs_.resize(data_.kernels.size(), nullptr);
-  for (size_t i = 0; i < data_.kernels.size(); ++i)
-    checkCu(cuModuleGetFunction(&cuFuncs_[i], cuModule_,
-                                 data_.kernels[i].kernelName.c_str()),
-             "cuModuleGetFunction");
+  for (size_t i = 0; i < data_.kernels.size(); ++i) {
+    cuFuncs_[i] = resolveOneKernelSymbol(data_.kernels[i],
+                                          cuModule_, csRankModule_);
+  }
+
+  // ── Opt external kernels into the device's full dynamic smem cap ──
+  optInExternalSmemMax(data_.kernels, cuFuncs_);
 }
 
 Executable::~Executable() {
@@ -411,6 +661,8 @@ Executable::~Executable() {
   freeSlotPool();
   if (cuModule_)
     cuModuleUnload(cuModule_);
+  if (csRankModule_)
+    cuModuleUnload(csRankModule_);
 }
 
 void Executable::freeSlotPool() {
@@ -458,69 +710,29 @@ int Executable::peakIntermediateSlots() const noexcept {
 void Executable::launchOnStream(
     int64_t timeLength, int64_t numStocks,
     const std::vector<std::pair<std::string, uintptr_t>> &args,
-    CUstream stream) {
-  // 1.  Shape sanity (kernel signature is i32 i32).
+    CUstream stream,
+    int devMaxSmemBytes) {
+  // ── Shape sanity (kernel signature is i32, i32) ──────────────────
   if (timeLength > std::numeric_limits<int32_t>::max() ||
       numStocks  > std::numeric_limits<int32_t>::max() ||
       timeLength < 0 || numStocks < 0)
     throw std::runtime_error(
         "kun_cuda::launchOnStream: time_length / num_stocks out of i32 "
         "range (kernel signature uses i32, i32)");
-
-  // 2.  Allocate intermediate slot pool if needed.
-  ensureSlotPool(timeLength, numStocks);
-
-  // 3.  Resolve user args into the flat buffer table.  Two hash lookups
-  //     per user arg, that's it.
-  std::vector<uintptr_t> bufPtrs(plan_->numBuffers, 0);
-  std::vector<bool>      filled(plan_->numBuffers, false);
-
-  for (const auto &kv : args) {
-    auto itIn = plan_->graphInputIdx.find(kv.first);
-    auto itOut = plan_->graphOutputIdx.find(kv.first);
-    int idx = -1;
-    if (itIn != plan_->graphInputIdx.end())
-      idx = itIn->second;
-    else if (itOut != plan_->graphOutputIdx.end())
-      idx = itOut->second;
-    else
-      throw std::runtime_error(
-          "kun_cuda::launchOnStream: unexpected argument '" + kv.first +
-          "' (expected: " + joinNames(data_.graphInputs) + " | " +
-          joinNames(data_.graphOutputs) + ")");
-    bufPtrs[idx] = kv.second;
-    filled[idx] = true;
-  }
-
-  // 4.  Confirm every graph input + output was supplied.
-  for (int i = 0; i < plan_->numGraphInputs + plan_->numGraphOutputs; ++i) {
-    if (filled[i]) continue;
-    std::string missing;
-    for (auto &kv : plan_->graphInputIdx)  if (kv.second == i) missing = kv.first;
-    if (missing.empty())
-      for (auto &kv : plan_->graphOutputIdx) if (kv.second == i) missing = kv.first;
+  if (data_.warpsPerCta <= 0)
     throw std::runtime_error(
-        "kun_cuda::launchOnStream: missing argument '" + missing + "'");
-  }
+        "kun_cuda::launchOnStream: warps_per_cta is " +
+        std::to_string(data_.warpsPerCta));
 
-  // 5.  Fill intermediate slots from the pre-allocated pool.
-  for (int i = plan_->numGraphInputs + plan_->numGraphOutputs;
-        i < plan_->numBuffers; ++i) {
-    int slot = plan_->intermediateBufToSlot[i];
-    bufPtrs[i] = slotBufs_[slot];
-  }
+  // ── Grow / reuse the intermediate slot pool for this shape ───────
+  ensureSlotPool(timeLength, numStocks);
 
-  // 6.  Launch each kernel in topo order on `stream`.  Async — the
-  //     caller (Executor) owns waiting via cuStreamSynchronize.
-  unsigned blockX = static_cast<unsigned>(data_.warpsPerCta * 32);
-  if (blockX == 0)
-    throw std::runtime_error(
-        "kun_cuda::launchOnStream: warps_per_cta is 0");
-  uint64_t stocksPerBlock =
-      static_cast<uint64_t>(blockX) * static_cast<uint64_t>(data_.vectorSize);
-  unsigned gridX = static_cast<unsigned>(
-      (static_cast<uint64_t>(numStocks) + stocksPerBlock - 1) / stocksPerBlock);
+  // ── Map user args + slot pool into a flat buffer-index → ptr ─────
+  const std::vector<uintptr_t> bufPtrs =
+      resolveBufferPointers(*plan_, data_, args, slotBufs_);
 
+  // ── Launch each kernel in topo order on `stream`.  Async — the
+  //    caller (Executor) waits via cuStreamSynchronize. ─────────────
   int32_t timeLenI32   = static_cast<int32_t>(timeLength);
   int32_t numStocksI32 = static_cast<int32_t>(numStocks);
 
@@ -528,25 +740,28 @@ void Executable::launchOnStream(
     const auto &ins  = plan_->kernelInputBufs[kIdx];
     const auto &outs = plan_->kernelOutputBufs[kIdx];
 
+    // Build the argv: (i32 T, i32 S, ins..., outs...) — same shape
+    // for Jit and external kernels.
     std::vector<CUdeviceptr> ptrs;
     ptrs.reserve(ins.size() + outs.size());
     for (int b : ins)  ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
     for (int b : outs) ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
-
     std::vector<void *> argPtrs;
     argPtrs.reserve(2 + ptrs.size());
     argPtrs.push_back(&timeLenI32);
     argPtrs.push_back(&numStocksI32);
     for (auto &p : ptrs) argPtrs.push_back(&p);
 
-    // sharedMemBytes = 0 — shared memory is static (declared as
-    // `llvm.mlir.global addr_space=3` and allocated by ptxas into the
-    // cubin's `.shared` section); the dynamic-smem launch parameter does
-    // not apply.
-    checkCu(cuLaunchKernel(cuFuncs_[kIdx], gridX, 1, 1, blockX, 1, 1,
-                             /*sharedMemBytes=*/0, stream,
-                             argPtrs.data(), nullptr),
-             "cuLaunchKernel");
+    const auto &meta = data_.kernels[kIdx];
+    if (meta.kind == KernelKind::Jit) {
+      launchJitKernel(cuFuncs_[kIdx], numStocks,
+                       data_.warpsPerCta, data_.vectorSize,
+                       argPtrs.data(), stream);
+    } else {
+      launchExtCsRankKernel(cuFuncs_[kIdx], meta.kind, meta.kernelName,
+                              timeLength, numStocks, data_.warpsPerCta,
+                              devMaxSmemBytes, argPtrs.data(), stream);
+    }
   }
 }
 
@@ -554,14 +769,36 @@ void Executable::launchOnStream(
 // Executor — thin CUstream wrapper, mirrors the CPU `kun::Executor` shape.
 //===----------------------------------------------------------------------===//
 
-Executor::Executor() : stream_(nullptr) {}
-Executor::Executor(CUstream stream) : stream_(stream) {}
+namespace {
+/// Query the current CUcontext's device for
+/// MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.  Returns 0 if no context is
+/// current — the Executor accepts that and the launch path will only
+/// trip the check if the executable actually has external cs_rank
+/// kernels (in which case the user must have a context anyway).
+int queryDevMaxSmemBytes() {
+  CUcontext cur = nullptr;
+  if (cuCtxGetCurrent(&cur) != CUDA_SUCCESS || !cur) return 0;
+  CUdevice dev = 0;
+  if (cuCtxGetDevice(&dev) != CUDA_SUCCESS) return 0;
+  int v = 0;
+  if (cuDeviceGetAttribute(
+          &v, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev)
+      != CUDA_SUCCESS)
+    return 0;
+  return v;
+}
+} // namespace
+
+Executor::Executor()
+    : stream_(nullptr), devMaxSmemBytes_(queryDevMaxSmemBytes()) {}
+Executor::Executor(CUstream stream)
+    : stream_(stream), devMaxSmemBytes_(queryDevMaxSmemBytes()) {}
 Executor::~Executor() = default;
 
 void Executor::runGraph(
     Executable &exe, int64_t timeLength, int64_t numStocks,
     const std::vector<std::pair<std::string, uintptr_t>> &args) {
-  exe.launchOnStream(timeLength, numStocks, args, stream_);
+  exe.launchOnStream(timeLength, numStocks, args, stream_, devMaxSmemBytes_);
 }
 
 void Executor::synchronize() {
diff --git a/mlir/lib/KunCuda/kernels/cs_rank.cu b/mlir/lib/KunCuda/kernels/cs_rank.cu
new file mode 100644
index 0000000..abdeca1
--- /dev/null
+++ b/mlir/lib/KunCuda/kernels/cs_rank.cu
@@ -0,0 +1,123 @@
+// cs_rank.cu — cross-sectional rank kernel, pre-compiled to PTX and
+// embedded into libKunCudaRuntime as a separate CUmodule.
+//
+// Signature matches the project's launch convention so the executor can
+// pass the same `(i32 time_length, i32 num_stocks, in_ptr, out_ptr)`
+// arg tuple it uses for JIT'd kernels.  Grid/block/smem are chosen by
+// the executor at launch (time-major grid, one CTA per timestep,
+// dynamic smem = num_stocks * sizeof(T)).
+//
+// Algorithm — pairwise O(N^2):
+//   For each stock i with non-NaN value v,
+//     less  = #{ j : !isnan(u[j]) && u[j]  < v }
+//     equal = #{ j : !isnan(u[j]) && u[j] == v }    (includes i itself)
+//     valid = #{ j : !isnan(u[j]) }
+//   Output is the average-rank normalised to (0, 1]:
+//     out = (2*less + equal + 1) / (2 * valid)
+//   This matches cpp/Kun/Rank.hpp's equal_range formula exactly:
+//     sum = (start + end + 1) * (end - start) / 2
+//     out = sum / (end - start) / valid
+//   with start = less, end = less + equal.
+//
+// NaN policy: NaN inputs produce NaN outputs and don't contribute to
+// any count.
+
+#include <cuda_runtime.h>
+#include <math_constants.h>
+
+// Dynamic shared memory base.  Declared at file scope (no anonymous
+// namespace) so it gets a stable, internal symbol rather than nvcc's
+// "extern .shared" with mangled-namespace linkage — the latter survives
+// to the PTX as an unresolved extern, which the driver JIT cannot link
+// when this PTX is loaded standalone via cuModuleLoadData.  Both
+// kun_cs_rank_f32 and kun_cs_rank_f64 reinterpret_cast<T*>(raw_smem)
+// from a single base — fine, since each kernel launch supplies its own
+// physical smem allocation.
+extern __shared__ unsigned char kun_cs_rank_smem[];
+
+namespace {
+
+template <typename T>
+__device__ static inline bool kun_isnan(T x);
+
+template <>
+__device__ inline bool kun_isnan<float>(float x) { return isnan(x); }
+
+template <>
+__device__ inline bool kun_isnan<double>(double x) { return isnan(x); }
+
+template <typename T>
+__device__ static inline T kun_nan();
+
+template <>
+__device__ inline float kun_nan<float>() { return CUDART_NAN_F; }
+
+template <>
+__device__ inline double kun_nan<double>() { return CUDART_NAN; }
+
+// Templated body — one CTA per timestep, threads cooperate across the
+// cross-section.
+template <typename T>
+__device__ static void cs_rank_body(const T* __restrict__ in,
+                                    T* __restrict__ out,
+                                    int time_length,
+                                    int num_stocks) {
+    int t = blockIdx.x;
+    if (t >= time_length) return;
+
+    T* smem = reinterpret_cast<T*>(kun_cs_rank_smem);
+
+    const T* row_in  = in  + static_cast<size_t>(t) * num_stocks;
+    T*       row_out = out + static_cast<size_t>(t) * num_stocks;
+
+    // 1) Cooperative load of the entire cross-section into smem.
+    for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+        smem[i] = row_in[i];
+    }
+    __syncthreads();
+
+    // 2) Per-stock pairwise count.  Each thread owns a stride of stocks.
+    for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+        T v = smem[i];
+        if (kun_isnan<T>(v)) {
+            row_out[i] = kun_nan<T>();
+            continue;
+        }
+
+        int less  = 0;
+        int equal = 0;
+        int valid = 0;
+        for (int j = 0; j < num_stocks; ++j) {
+            T u = smem[j];
+            int is_valid = !kun_isnan<T>(u);
+            valid += is_valid;
+            less  += (is_valid & (u <  v));
+            equal += (is_valid & (u == v));
+        }
+
+        if (valid == 0) {
+            row_out[i] = kun_nan<T>();
+            continue;
+        }
+        // Average-rank percentile, matching the CPU reference.
+        T num = static_cast<T>(2 * less + equal + 1);
+        T den = static_cast<T>(2 * valid);
+        row_out[i] = num / den;
+    }
+}
+
+} // anonymous namespace
+
+extern "C" __global__
+void kun_cs_rank_f32(int time_length, int num_stocks,
+                     const float* __restrict__ in,
+                     float* __restrict__ out) {
+    cs_rank_body<float>(in, out, time_length, num_stocks);
+}
+
+extern "C" __global__
+void kun_cs_rank_f64(int time_length, int num_stocks,
+                     const double* __restrict__ in,
+                     double* __restrict__ out) {
+    cs_rank_body<double>(in, out, time_length, num_stocks);
+}
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
index 52ef710..91632b8 100644
--- a/mlir/lib/KunGpu/PtxBackend.cpp
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -178,10 +178,15 @@ LogicalResult compileKunIrToExecutable(ModuleOp module,
     kernels.push_back(std::move(km));
     return WalkResult::advance();
   });
+
+  // No JIT kernels at all is legal — every partition could be an
+  // externally-dispatched kernel (e.g. a graph that is only cs_rank).
+  // In that case skip cubin generation entirely; the caller (pyCompile)
+  // is expected to inject external KernelMetas and to provide
+  // warpsPerCta out-of-band.
+  out = ::kun_cuda::ExecutableData{};
   if (kernels.empty())
-    return module.emitError(
-        "compileKunIrToExecutable: no llvm.func with kungpu metadata "
-        "found in the lowered module");
+    return success();
 
   // 3.  Validate target spec is graph-wide.
   auto [warpsPerCta, vectorSize] = targetSpecs.front();
@@ -206,7 +211,6 @@ LogicalResult compileKunIrToExecutable(ModuleOp module,
 
   // 5.  Populate `out`.  graphInputs / graphOutputs are caller-supplied
   //     after this returns — leave them empty.
-  out = ::kun_cuda::ExecutableData{};
   out.cubin.assign(cubin.begin(), cubin.end());
   out.warpsPerCta = warpsPerCta;
   out.vectorSize  = vectorSize;
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 2dd3402..fb5d155 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -64,13 +64,12 @@ LogicalResult MaxOp::verify() { return verifyBinaryElemwise(*this, getLhs(), get
 LogicalResult MinOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
 
 //===----------------------------------------------------------------------===//
-// Unary elemwise ops + CsRankOp — verify only
+// Unary elemwise ops — verify only
 //===----------------------------------------------------------------------===//
 
-LogicalResult AbsOp::verify()    { return success(); }
-LogicalResult LogOp::verify()    { return success(); }
-LogicalResult SignOp::verify()   { return success(); }
-LogicalResult CsRankOp::verify() { return success(); }
+LogicalResult AbsOp::verify()  { return success(); }
+LogicalResult LogOp::verify()  { return success(); }
+LogicalResult SignOp::verify() { return success(); }
 
 //===----------------------------------------------------------------------===//
 // WindowedOutputOp
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 785912f..359f7ce 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -11,7 +11,9 @@
 //   - All inputs to kunir.for_each_back_window must be ts handles (function
 //     arguments or kunir.windowed_output results).
 //   - Each yield operand of for_each_back_window must come from a reduce_* op.
-//   - kunir.cs_rank is not yet supported.
+//   - Cross-sectional kernels (cs_rank) never enter kunir — the Python
+//     frontend (CodegenMLIR._maybe_external_partition) routes them
+//     directly to a pre-compiled CUmodule bundled with the runtime.
 //
 //===----------------------------------------------------------------------===//
 
@@ -165,6 +167,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   funcOp.setFunctionTypeAttr(
       TypeAttr::get(FunctionType::get(ctx, newArgTys, newRetTys)));
 
+
   // ------------------------------------------------------------------
   // 2. Snapshot original ops and find the original return.
   // ------------------------------------------------------------------
@@ -350,9 +353,6 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       return success();
     }
 
-    if (isa<CsRankOp>(op)) {
-      return op.emitError("kunir-to-kungpu: cs_rank lowering not yet implemented");
-    }
     return op.emitError("kunir-to-kungpu: unhandled op in outer block");
   };
 
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 03d9c43..fc251e3 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -139,7 +139,6 @@ class IRBuilder {
   Value absOp(Value x)  { return makeUn<kunir::AbsOp>(x); }
   Value logOp(Value x)  { return makeUn<kunir::LogOp>(x); }
   Value signOp(Value x) { return makeUn<kunir::SignOp>(x); }
-  Value csRankOp(Value x) { return makeUn<kunir::CsRankOp>(x); }
 
   // ── Windowed buffer materialization ───────────────────────────────
   Value windowedOutputOp(Value x, int64_t length) {
@@ -312,7 +311,6 @@ void registerIRBuilder(py::module &m) {
       .def("abs",    &IRBuilder::absOp,    py::arg("x"))
       .def("log",    &IRBuilder::logOp,    py::arg("x"))
       .def("sign",   &IRBuilder::signOp,   py::arg("x"))
-      .def("cs_rank", &IRBuilder::csRankOp, py::arg("x"))
 
       // Windowed materialization
       .def("windowed_output", &IRBuilder::windowedOutputOp,
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 6d64332..09e7db0 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -157,6 +157,30 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
   return out;
 }
 
+/// Parse one Python `external_kernels=[...]` entry into a KernelMeta.
+/// Expected dict shape:
+///   {"name": str, "kind": str, "inputs": [str...], "outputs": [str...]}
+/// where `kind` is one of "cs_rank_f32" / "cs_rank_f64".
+static kun_cuda::KernelMeta parseExternalKernel(py::handle obj) {
+  py::dict d = obj.cast<py::dict>();
+  kun_cuda::KernelMeta km;
+  km.kernelName = d["name"].cast<std::string>();
+  std::string kind = d["kind"].cast<std::string>();
+  if (kind == "cs_rank_f32")
+    km.kind = kun_cuda::KernelKind::ExtCsRankF32;
+  else if (kind == "cs_rank_f64")
+    km.kind = kun_cuda::KernelKind::ExtCsRankF64;
+  else
+    throw std::runtime_error(
+        "KunMLIR.compile: unknown external kernel kind '" + kind +
+        "' (supported: cs_rank_f32, cs_rank_f64)");
+  for (py::handle n : d["inputs"].cast<py::iterable>())
+    km.inputNames.push_back(n.cast<std::string>());
+  for (py::handle n : d["outputs"].cast<py::iterable>())
+    km.outputNames.push_back(n.cast<std::string>());
+  return km;
+}
+
 static std::unique_ptr<kun_cuda::Executable>
 pyCompile(PyModule &pm,
             const std::vector<std::string> &graphInputs,
@@ -164,7 +188,9 @@ pyCompile(PyModule &pm,
             const std::string &gpuArch,
             const std::string &targetTriple,
             const std::string &targetFeatures, unsigned optLevel,
-            const std::string &toolkitPath) {
+            const std::string &toolkitPath,
+            py::list externalKernels,
+            int warpsPerCta) {
   if (graphInputs.empty())
     throw std::runtime_error(
         "KunMLIR.compile: graph_inputs cannot be empty");
@@ -182,6 +208,34 @@ pyCompile(PyModule &pm,
   kun_cuda::ExecutableData data;
   if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), opts, data)))
     throw std::runtime_error("KunMLIR.compile failed");
+
+  // Append external (pre-compiled, runtime-dispatched) kernels.  The
+  // MLIR pipeline never saw them; they're fabricated here from the
+  // descriptor list the Python frontend collected.
+  for (py::handle obj : externalKernels)
+    data.kernels.push_back(parseExternalKernel(obj));
+
+  if (data.kernels.empty())
+    throw std::runtime_error(
+        "KunMLIR.compile: no kernels (neither MLIR-emitted nor "
+        "external) — refusing to build an empty Executable");
+
+  // No JIT kernels → `compileKunIrToExecutable` left warpsPerCta at
+  // its default of 1.  Override with the caller-supplied value so the
+  // external launch path's blockDim is right.  When there are JIT
+  // kernels they fix warpsPerCta via their kungpu.target_spec, and we
+  // trust that over the parameter (and ignore the parameter).
+  bool anyJit = false;
+  for (const auto &k : data.kernels)
+    if (k.kind == kun_cuda::KernelKind::Jit) { anyJit = true; break; }
+  if (!anyJit) {
+    if (warpsPerCta <= 0)
+      throw std::runtime_error(
+          "KunMLIR.compile: warps_per_cta must be positive when every "
+          "kernel is external; got " + std::to_string(warpsPerCta));
+    data.warpsPerCta = warpsPerCta;
+  }
+
   // Graph topology is a runtime concern — fill it in here, just before
   // handing off to Executable's ctor (which validates + plans).
   data.graphInputs  = graphInputs;
@@ -317,6 +371,8 @@ PYBIND11_MODULE(KunMLIR, m) {
          py::arg("target_features") = "",
          py::arg("opt_level")      = 3u,
          py::arg("toolkit_path")   = "",
+         py::arg("external_kernels") = py::list(),
+         py::arg("warps_per_cta")    = 0,
          "Compile a kunir module all the way to a loaded Executable.\n"
          "\n"
          "Pipeline: kunir → LLVM dialect → upstream `gpu-module-to-binary`\n"
diff --git a/mlir/test/kunir/basic.mlir b/mlir/test/kunir/basic.mlir
index 5a84451..364854c 100644
--- a/mlir/test/kunir/basic.mlir
+++ b/mlir/test/kunir/basic.mlir
@@ -59,17 +59,6 @@ kunir.func @test_windowed_output(%input: !kunir.ts<f32, inf>)
   kunir.return %out : !kunir.ts<f32, 10>
 }
 
-// CHECK-LABEL: kunir.func @test_cs_rank
-kunir.func @test_cs_rank(%input: !kunir.ts<f32, inf>)
-    inputs {%input = "input"}
-    outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
-    -> !kunir.ts<f32, 1> {
-  // CHECK: kunir.cs_rank
-  %ranked = kunir.cs_rank %input : !kunir.ts<f32, inf>
-  kunir.return %ranked : !kunir.ts<f32, 1>
-}
-
 // CHECK-LABEL: kunir.func @test_for_each_back_window_single
 kunir.func @test_for_each_back_window_single(%close: !kunir.ts<f32, 10>)
     inputs {%close = "close"}
diff --git a/mlir/test/python/test_cs_rank_cuda.py b/mlir/test/python/test_cs_rank_cuda.py
new file mode 100644
index 0000000..0bd26eb
--- /dev/null
+++ b/mlir/test/python/test_cs_rank_cuda.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+"""End-to-end test for the cs_rank GPU dispatch path.
+
+Cross-sectional rank (`KunQuant.Op.Rank`) is special on the GPU: it
+has no kunir representation at all.  CodegenMLIR detects a cs_rank
+partition in the Python IR and routes it as an "external kernel"
+descriptor straight to the C++ binding, which fabricates a KernelMeta
+tagged with `KernelKind::ExtCsRankF*`.  At Executable construction the
+runtime loads a pre-compiled, sm_75-baseline PTX bundled inside
+libKunCudaRuntime as a second CUmodule and resolves
+`kun_cs_rank_f{32,64}` from it.  This test exercises the whole
+plumbing end-to-end:
+
+  * Python frontend skips the kunir pipeline for cs_rank partitions,
+  * the executor lazy-loads the bundled PTX and resolves the right
+    symbol,
+  * the launch uses a time-major grid + dynamic shared memory (one
+    CTA per timestep, smem = S * sizeof(T)) rather than the default
+    stock-major grid the JIT'd kernels use,
+  * the result matches the CPU `equal_range`-based reference exactly,
+    including NaN passthrough and tie averaging,
+  * a graph that mixes cs_rank with regular JIT kernels stitches up
+    correctly (cs_rank produces an intermediate consumed by an
+    elementwise kernel).
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+
+import numpy as np
+
+from KunQuant.Op import Builder, Input, Output, Rank
+from KunQuant.ops import Add
+from KunQuant.Stage import Function
+from KunQuant.jit import KunMLIR
+from KunQuant.jit.cuda import compileit, CudaCompilerConfig, to_mlir
+
+
+# ── CPU reference (matches cpp/Kun/Rank.hpp's equal_range formula) ──
+
+def _ref_cs_rank(arr: np.ndarray) -> np.ndarray:
+    """Per-row average-rank percentile in (0, 1], NaN preserved.
+
+    Matches cpp/Kun/Rank.hpp exactly:
+      sum   = (start + end + 1) * (end - start) / 2
+      out   = sum / (end - start) / num_valid
+    where [start, end) is the equal-range of the value in the sorted
+    non-NaN array.  Algebraically this equals
+      (2 * less + equal + 1) / (2 * num_valid)
+    which is what the GPU kernel computes.
+    """
+    T, S = arr.shape
+    out = np.full((T, S), np.nan, dtype=arr.dtype)
+    for t in range(T):
+        row = arr[t]
+        valid_mask = ~np.isnan(row)
+        v = row[valid_mask]
+        nv = len(v)
+        if nv == 0:
+            continue
+        sorted_v = np.sort(v)
+        ranks = np.empty(nv, dtype=arr.dtype)
+        for i, x in enumerate(v):
+            lo = np.searchsorted(sorted_v, x, side='left')
+            hi = np.searchsorted(sorted_v, x, side='right')
+            # avg rank (1-indexed) within the equal-range, divided by nv
+            ranks[i] = (lo + hi + 1) / 2.0 / nv
+        out[t][valid_mask] = ranks
+    return out
+
+
+# ── Function builders ────────────────────────────────────────────────
+
+def _build_cs_rank_only() -> Function:
+    """r = cs_rank(a) — a single cs_rank partition, no other compute."""
+    b = Builder()
+    with b:
+        Output(Rank(Input('a')), 'r')
+    return Function(b.ops, name='cs_rank_only')
+
+
+def _build_cs_rank_mixed() -> Function:
+    """out = a + cs_rank(b) — forces the partitioner to produce
+    two kernels: an external cs_rank partition, then a JIT'd
+    elementwise Add that consumes its result."""
+    b = Builder()
+    with b:
+        a = Input('a')
+        bin_ = Input('b')
+        Output(Add(a, Rank(bin_)), 'out')
+    return Function(b.ops, name='cs_rank_mixed')
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+def _dtype_pair(dtype_token: str):
+    """Map CudaCompilerConfig.dtype → (numpy dtype, label)."""
+    if dtype_token == 'float':  return np.float32, 'float32'
+    if dtype_token == 'double': return np.float64, 'float64'
+    raise ValueError(dtype_token)
+
+
+def _run_cs_rank_only(target: str, dtype_token: str, T: int, S: int,
+                       *, with_nan: bool, with_ties: bool, seed: int) -> int:
+    """Compile r = cs_rank(a), launch, and compare to the CPU
+    reference.  Returns 0 on success and 1 on a mismatch.  The kernel
+    kind cannot be introspected from Python, so nothing here asserts
+    that the partition was tagged as external (kernel_names is only
+    printed for inspection); we lean on the correctness check to
+    prove the external path is wired up."""
+    import cupy as cp
+
+    np_dt, label = _dtype_pair(dtype_token)
+    print(f"=== cs_rank ({label}) T={T} S={S} "
+           f"nan={with_nan} ties={with_ties} ===")
+
+    f = _build_cs_rank_only()
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4,
+                              dtype=dtype_token)
+    mod = to_mlir(_build_cs_rank_only(), cfg)
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compileit(f, cfg)
+    print(f"  kernel_names={exe.kernel_names}  "
+          f"num_buffers={exe.num_buffers}  "
+          f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    rng = np.random.default_rng(seed)
+    a_h = rng.standard_normal((T, S)).astype(np_dt)
+    if with_ties:
+        # Force a moderate tie population: quantize ~30% of cells.
+        tie_mask = rng.random((T, S)) < 0.3
+        a_h[tie_mask] = np.round(a_h[tie_mask] * 2) / 2  # snap to 0.5-grid
+    if with_nan:
+        # Sprinkle ~10% NaNs.  Also make row 0 all-NaN so the
+        # reference's nv == 0 branch (an all-NaN output row) is hit.
+        nan_mask = rng.random((T, S)) < 0.1
+        a_h[nan_mask] = np.nan
+        a_h[0, :] = np.nan
+
+    a_d = cp.asarray(a_h)
+    out_d = cp.zeros((T, S), dtype=np_dt)
+    ex = KunMLIR.Executor()
+    ex.runGraph(exe, {'a': a_d, 'r': out_d})
+    ex.synchronize()
+    out_h = cp.asnumpy(out_d)
+
+    ref = _ref_cs_rank(a_h)
+    # NaN cells in the reference must remain NaN on the GPU.
+    nan_ref = np.isnan(ref)
+    if not np.array_equal(np.isnan(out_h), nan_ref):
+        print("  FAIL — NaN pattern mismatch", file=sys.stderr)
+        return 1
+    # Numeric cells must agree within a small absolute tolerance (the
+    # formula is the same algebraic expression on both sides).
+    atol = 1e-6 if np_dt == np.float32 else 1e-12
+    diff = np.abs(out_h[~nan_ref] - ref[~nan_ref])
+    max_abs = float(diff.max()) if diff.size else 0.0
+    if max_abs > atol:
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e}",
+                file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e}) "
+          f"on {diff.size} numeric cells")
+    return 0
+
+
+def _run_cs_rank_mixed(target: str, T: int, S: int, *, seed: int) -> int:
+    """out = a + cs_rank(b) — proves cs_rank's intermediate buffer is
+    handed off correctly to a downstream JIT'd kernel.  Forces
+    partition_factor=1 so the graph splits into >= 2 kernels."""
+    import cupy as cp
+
+    print(f"=== cs_rank-mixed (float32) T={T} S={S} ===")
+    f = _build_cs_rank_mixed()
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4,
+                              dtype='float', partition_factor=1)
+    mod = to_mlir(_build_cs_rank_mixed(), cfg)
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compileit(f, cfg)
+    print(f"  kernel_names={exe.kernel_names}  "
+          f"num_kernels={exe.num_kernels}  "
+          f"num_buffers={exe.num_buffers}  "
+          f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    # The whole point: at least 2 kernels (cs_rank + downstream Add),
+    # and at least one intermediate slot threading them together.
+    assert exe.num_kernels >= 2, exe.num_kernels
+    assert exe.peak_intermediate_slots >= 1, exe.peak_intermediate_slots
+
+    rng = np.random.default_rng(seed)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out_d = cp.zeros((T, S), dtype=cp.float32)
+
+    ex = KunMLIR.Executor()
+    ex.runGraph(exe, {'a': cp.asarray(a_h),
+                       'b': cp.asarray(b_h),
+                       'out': out_d})
+    ex.synchronize()
+    out_h = cp.asnumpy(out_d)
+
+    ref = a_h + _ref_cs_rank(b_h)
+    diff = np.abs(out_h - ref)
+    max_abs = float(diff.max())
+    if max_abs > 1e-5:
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} at {idx}", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} across {exe.num_kernels} kernels")
+    return 0
+
+
+# ── Entry ────────────────────────────────────────────────────────────
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("-T", "--time-length", type=int, default=8)
+    ap.add_argument("-S", "--num-stocks", type=int, default=257)
+    args = ap.parse_args()
+
+    import cupy as cp
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    rc = 0
+    # f32 — golden path
+    rc |= _run_cs_rank_only(args.target, 'float',
+                              args.time_length, args.num_stocks,
+                              with_nan=False, with_ties=False, seed=1)
+    print()
+    # f32 with ties — exercises the equal-range averaging
+    rc |= _run_cs_rank_only(args.target, 'float',
+                              args.time_length, args.num_stocks,
+                              with_nan=False, with_ties=True, seed=2)
+    print()
+    # f32 with NaN + ties — exercises every branch
+    rc |= _run_cs_rank_only(args.target, 'float',
+                              args.time_length, args.num_stocks,
+                              with_nan=True, with_ties=True, seed=3)
+    print()
+    # NOTE: the cs_rank kernel itself is templated and `kun_cs_rank_f64`
+    # is built into the embedded PTX, but the rest of the runtime
+    # (Runtime.cpp's slot pool, MlirBinding.cpp's CAI typestr check)
+    # is still float32-only.  Lifting that requires plumbing dtype
+    # through Executable / ExecutableData and is out of scope here.
+    # When that lands, this test can re-enable:
+    #
+    #   rc |= _run_cs_rank_only(args.target, 'double',
+    #                            args.time_length, args.num_stocks,
+    #                            with_nan=True, with_ties=True, seed=4)
+    print()
+    # Mixed cs_rank + Add — proves intermediate buffer flow works
+    rc |= _run_cs_rank_mixed(args.target,
+                              args.time_length, args.num_stocks, seed=5)
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 05173a72edc20ab1cdd38a27654bbef7f3b7632d Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 12 May 2026 01:57:56 -0700
Subject: [PATCH 18/59] remove cuda interface array support

---
 mlir/lib/Python/MlirBinding.cpp          | 180 ++++++++++++----
 mlir/lib/Python/dlpack.h                 |  87 ++++++++
 mlir/test/python/test_validation_cuda.py | 252 +++++++++++++++++++++++
 3 files changed, 483 insertions(+), 36 deletions(-)
 create mode 100644 mlir/lib/Python/dlpack.h
 create mode 100644 mlir/test/python/test_validation_cuda.py

diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 09e7db0..e388ef8 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -19,6 +19,7 @@
 
 #include "PyModule.h"     // shared MLIRContext + ModuleOp wrapper
 #include "IRBuilder.h"    // pybind class for programmatic kunir construction
+#include "dlpack.h"       // vendored DLPack ABI (consumer-only)
 
 #include "KunCuda/Runtime.h"
 #include "KunGpu/PtxBackend.h"
@@ -60,47 +61,122 @@ static std::string pyLowerToPtx(PyModule &pm, const std::string &gpuArch,
 }
 
 //===----------------------------------------------------------------------===//
-// pybind glue: read CAI dict → kun_cuda::DeviceArray, build name list
+// pybind glue: read a Python GPU array via DLPack → device pointer + shape
 //===----------------------------------------------------------------------===//
 
-/// Read CAI from one Python GPU array.  Validates dtype + ndim; shape is
-/// returned to the caller for cross-array consistency checks.
+/// Result of reading one GPU array argument.  `ptr` is the device pointer
+/// the kernel will consume; `(timeLength, numStocks)` is the resolved
+/// 2-D shape used for cross-arg consistency checks.
 struct CudaArrayInfo {
   uintptr_t ptr;
-  int64_t timeLength;   ///< shape[0]
-  int64_t numStocks;    ///< shape[1]
+  int64_t   timeLength;   ///< shape[0]
+  int64_t   numStocks;    ///< shape[1]
 };
 
-static CudaArrayInfo readCudaArray(py::handle obj,
-                                     const std::string &paramName) {
-  if (!py::hasattr(obj, "__cuda_array_interface__")) {
-    throw std::runtime_error(
-        "'" + paramName +
-        "' has no __cuda_array_interface__ — pass a CuPy ndarray (or any "
-        "GPU array implementing CAI).");
-  }
-  py::dict cai = obj.attr("__cuda_array_interface__").cast<py::dict>();
-
-  py::tuple data = cai["data"].cast<py::tuple>();
-  uintptr_t ptr  = data[0].cast<uintptr_t>();
+/// DLPack-spec encoding of the executor's CUDA stream, ready to hand to
+/// `obj.__dlpack__(stream=…)`.  The protocol uses int sentinels for the
+/// two default streams and the actual `CUstream` pointer otherwise:
+///
+///   None  ⇒  producer assumes the legacy default stream (-1 ⇒ no sync)
+///   1     ⇒  legacy default stream
+///   2     ⇒  per-thread default stream
+///   other ⇒  CUstream pointer cast to int
+///
+/// We're never "no sync" — every launch must serialise on the executor's
+/// stream — so `stream_ == nullptr` (default-stream executor) maps to 1.
+static py::object dlpackStreamArg(CUstream stream) {
+  if (stream == nullptr)
+    return py::int_(1);
+  return py::int_(reinterpret_cast<uintptr_t>(stream));
+}
 
-  std::vector<int64_t> shape;
-  for (py::handle s : cai["shape"].cast<py::tuple>())
-    shape.push_back(s.cast<int64_t>());
-  if (shape.size() != 2) {
+/// Throws if `(shape, stridesBytes)` doesn't describe a C-contiguous
+/// 2-D buffer of `elemSize`-byte elements.  A null `stridesBytes` means
+/// row-major; the stride of an extent-1 dimension is never dereferenced.
+static void requireRowMajorContiguous2D(const std::string &paramName,
+                                          const int64_t *shape,
+                                          const int64_t *stridesBytes,
+                                          int64_t elemSize) {
+  if (!stridesBytes) return;
+  const int64_t innerStride = elemSize;
+  const int64_t outerStride = elemSize * shape[1];
+  if ((shape[0] != 1 && stridesBytes[0] != outerStride) ||
+      (shape[1] != 1 && stridesBytes[1] != innerStride)) {
     std::stringstream ss;
-    ss << "'" << paramName << "' must be 2-D (got " << shape.size() << "-D)";
+    ss << "'" << paramName << "' is not C-contiguous: strides=("
+       << stridesBytes[0] << ", " << stridesBytes[1] << ") bytes, "
+       << "expected (" << outerStride << ", " << innerStride
+       << ") for shape (" << shape[0] << ", " << shape[1] << ")";
     throw std::runtime_error(ss.str());
   }
+}
+
+/// Read `__dlpack__(stream=…)` — the cross-framework GPU array protocol
+/// implemented by CuPy / PyTorch / JAX / TensorFlow.  Validates every
+/// field the kernel relies on and threads the executor's stream so the
+/// producer can insert the needed cross-stream sync.
+///
+/// Memory lifecycle: `__dlpack__()` returns a PyCapsule named "dltensor"
+/// owning a `DLManagedTensor`; when the capsule is GC'd, its destructor
+/// calls the producer's `deleter`.  We grab the fields we need and let
+/// the capsule fall out of scope at function exit — the underlying
+/// tensor stays alive because the user is still holding `obj`.
+static CudaArrayInfo readDLPack(py::handle obj, const std::string &paramName,
+                                  const py::object &streamArg) {
+  if (!py::hasattr(obj, "__dlpack__"))
+    throw std::runtime_error(
+        "'" + paramName + "' does not implement __dlpack__ — pass a CuPy "
+        "ndarray, a PyTorch CUDA tensor, a JAX device array, or any other "
+        "object exporting the DLPack protocol.");
+
+  py::object capsule = obj.attr("__dlpack__")(py::arg("stream") = streamArg);
+  void *raw = PyCapsule_GetPointer(capsule.ptr(), "dltensor");
+  if (!raw) {
+    PyErr_Clear();
+    throw std::runtime_error(
+        "'" + paramName + "' __dlpack__() did not return a PyCapsule named "
+        "'dltensor' (consumed capsule?  wrong producer?)");
+  }
+  const DLManagedTensor *mt = reinterpret_cast<const DLManagedTensor *>(raw);
+  const DLTensor &t = mt->dl_tensor;
+
+  // ── device: only CUDA (managed counts as CUDA-addressable) ──────────
+  if (t.device.device_type != kDLCUDA &&
+      t.device.device_type != kDLCUDAManaged)
+    throw std::runtime_error(
+        "'" + paramName + "' is on DLPack device type " +
+        std::to_string(static_cast<int>(t.device.device_type)) +
+        " — only CUDA (=2) and CUDAManaged (=13) are supported");
 
-  std::string typestr = cai["typestr"].cast<std::string>();
-  if (typestr != "<f4" && typestr != "|f4" && typestr != "=f4") {
-    throw std::runtime_error("'" + paramName +
-                              "' must be float32 little-endian (typestr "
-                              "'<f4'); got '" +
-                              typestr + "'");
+  // ── ndim ────────────────────────────────────────────────────────────
+  if (t.ndim != 2)
+    throw std::runtime_error(
+        "'" + paramName + "' must be 2-D (got " +
+        std::to_string(t.ndim) + "-D)");
+
+  // ── dtype: kDLFloat, 32-bit, 1 lane ─────────────────────────────────
+  if (t.dtype.code != kDLFloat || t.dtype.bits != 32 || t.dtype.lanes != 1)
+    throw std::runtime_error(
+        "'" + paramName + "' DLPack dtype is (code=" +
+        std::to_string(static_cast<int>(t.dtype.code)) +
+        ", bits=" + std::to_string(static_cast<int>(t.dtype.bits)) +
+        ", lanes=" + std::to_string(static_cast<int>(t.dtype.lanes)) +
+        ") — need float32 (kDLFloat, 32, 1)");
+
+  // ── strides: NULL = row-major contiguous; else validate.  DLPack
+  //    strides are in *elements*, not bytes — convert before checking.
+  if (t.strides) {
+    int64_t sb[2] = {t.strides[0] * 4, t.strides[1] * 4};
+    requireRowMajorContiguous2D(paramName, t.shape, sb, /*elemSize=*/4);
   }
-  return CudaArrayInfo{ptr, shape[0], shape[1]};
+
+  // ── data pointer: null-check the base, *then* apply byte_offset ─────
+  if (t.data == nullptr)
+    throw std::runtime_error(
+        "'" + paramName + "' DLPack data pointer is null");
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(t.data) + t.byte_offset;
+
+  return CudaArrayInfo{ptr, t.shape[0], t.shape[1]};
 }
 
 /// Walk the user's {name → cuda_array} dict, validate that every named
@@ -115,7 +191,10 @@ struct CollectedArgs {
 };
 
 static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
-                                   py::dict pyArgs) {
+                                   py::dict pyArgs,
+                                   const py::object &streamArg) {
+  // Graph inputs come first, then outputs — same as the buffer-table
+  // layout the runtime expects.
   std::vector<std::string> ordered;
   ordered.reserve(exe.graphInputs().size() + exe.graphOutputs().size());
   for (auto &n : exe.graphInputs())  ordered.push_back(n);
@@ -126,19 +205,43 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
   CollectedArgs out;
   out.args.reserve(ordered.size());
 
+  // Reject extras up-front so the error message points at the offending
+  // name (the per-name loop below would otherwise just complain about a
+  // missing graph_input/output, which is misleading when the real issue
+  // is a typo'd key).
+  if (pyArgs.size() > ordered.size()) {
+    for (auto kv : pyArgs) {
+      std::string key = py::cast<std::string>(kv.first);
+      bool known = false;
+      for (auto &n : ordered) if (n == key) { known = true; break; }
+      if (!known) {
+        std::string expected;
+        for (size_t j = 0; j < ordered.size(); ++j) {
+          if (j) expected += ", ";
+          expected += ordered[j];
+        }
+        throw std::runtime_error(
+            "launch: unexpected argument '" + key +
+            "' (kernel expects: " + expected + ")");
+      }
+    }
+  }
+
   bool first = true;
-  for (const std::string &name : ordered) {
+  for (size_t i = 0; i < ordered.size(); ++i) {
+    const std::string &name = ordered[i];
+
     py::object key = py::str(name);
     if (!pyArgs.contains(key)) {
       std::string expected;
-      for (size_t i = 0; i < ordered.size(); ++i) {
-        if (i) expected += ", ";
-        expected += ordered[i];
+      for (size_t j = 0; j < ordered.size(); ++j) {
+        if (j) expected += ", ";
+        expected += ordered[j];
       }
       throw std::runtime_error("launch: missing argument '" + name +
                                 "' (kernel expects: " + expected + ")");
     }
-    CudaArrayInfo info = readCudaArray(pyArgs[key], name);
+    CudaArrayInfo info = readDLPack(pyArgs[key], name, streamArg);
     if (first) {
       out.timeLength = info.timeLength;
       out.numStocks  = info.numStocks;
@@ -344,7 +447,12 @@ PYBIND11_MODULE(KunMLIR, m) {
       .def("runGraph",
           [](kun_cuda::Executor &e, kun_cuda::Executable &exe,
               py::dict pyArgs) {
-            auto c = collectArgs(exe, pyArgs);
+            // Thread the executor's stream into __dlpack__(stream=…)
+            // so producers (CuPy / PyTorch / JAX / TF) can insert the
+            // cross-stream sync needed for data-readiness on our
+            // launch stream.
+            py::object streamArg = dlpackStreamArg(e.stream());
+            auto c = collectArgs(exe, pyArgs, streamArg);
             e.runGraph(exe, c.timeLength, c.numStocks, c.args);
           },
           py::arg("exe"), py::arg("args"),
diff --git a/mlir/lib/Python/dlpack.h b/mlir/lib/Python/dlpack.h
new file mode 100644
index 0000000..b133e18
--- /dev/null
+++ b/mlir/lib/Python/dlpack.h
@@ -0,0 +1,87 @@
+//===- dlpack.h - Minimal vendored DLPack ABI (consumer-only) ----------===//
+//
+// Trimmed subset of dmlc/dlpack v0.8.  We only need to *consume* a
+// `DLManagedTensor` produced by CuPy / PyTorch / JAX via the
+// `__dlpack__()` protocol, so this header omits the producer-side
+// helpers and the newer versioned form.  Vendored to keep the build
+// dependency-free; full spec lives at https://github.com/dmlc/dlpack.
+//
+// Original license: Apache-2.0.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// `DLDeviceType` — where the tensor data sits.  We accept kDLCUDA and
+/// kDLCUDAManaged; everything else (CPU, ROCm, Metal, …) is rejected.
+typedef enum {
+  kDLCPU = 1,
+  kDLCUDA = 2,
+  kDLCUDAHost = 3,
+  kDLOpenCL = 4,
+  kDLVulkan = 7,
+  kDLMetal = 8,
+  kDLVPI = 9,
+  kDLROCM = 10,
+  kDLROCMHost = 11,
+  kDLExtDev = 12,
+  kDLCUDAManaged = 13,
+  kDLOneAPI = 14,
+  kDLWebGPU = 15,
+  kDLHexagon = 16,
+} DLDeviceType;
+
+typedef struct {
+  DLDeviceType device_type;
+  int32_t      device_id;
+} DLDevice;
+
+/// `DLDataTypeCode` — element-kind dimension of the dtype triple.
+typedef enum {
+  kDLInt = 0,
+  kDLUInt = 1,
+  kDLFloat = 2,
+  kDLOpaqueHandle = 3,
+  kDLBfloat = 4,
+  kDLComplex = 5,
+  kDLBool = 6,
+} DLDataTypeCode;
+
+/// (code, bits, lanes).  E.g. f32 = {kDLFloat, 32, 1}; f64 = {kDLFloat, 64, 1}.
+typedef struct {
+  uint8_t  code;
+  uint8_t  bits;
+  uint16_t lanes;
+} DLDataType;
+
+/// Plain tensor descriptor — pointer + shape + dtype + device.
+typedef struct {
+  void       *data;
+  DLDevice    device;
+  int32_t     ndim;
+  DLDataType  dtype;
+  int64_t    *shape;
+  int64_t    *strides;       ///< NULL → row-major contiguous
+  uint64_t    byte_offset;
+} DLTensor;
+
+/// The wrapper exchanged via the unversioned PyCapsule named "dltensor".
+/// The capsule's PyCapsule_Destructor calls `deleter(self)` when it is
+/// GC'd — unless the consumer renamed the capsule to "used_dltensor",
+/// in which case the destructor is a no-op and the consumer must call
+/// `deleter` itself.
+typedef struct DLManagedTensor {
+  DLTensor dl_tensor;
+  void    *manager_ctx;
+  void   (*deleter)(struct DLManagedTensor *self);
+} DLManagedTensor;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/mlir/test/python/test_validation_cuda.py b/mlir/test/python/test_validation_cuda.py
new file mode 100644
index 0000000..ccdeafd
--- /dev/null
+++ b/mlir/test/python/test_validation_cuda.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+"""Negative tests for the KunMLIR launch-time validation path.
+
+The runtime consumes every input/output via DLPack (the protocol
+implemented by CuPy / PyTorch / JAX / TensorFlow).  This file
+exercises:
+
+  * DLPack field validation — wrong dtype, wrong ndim, non-contiguous
+    strided view, host-only ndarray (numpy refuses the stream argument),
+    object that does not implement `__dlpack__` at all.
+  * Graph-arg checks — missing kwarg, unknown kwarg, cross-arg shape
+    mismatch.
+  * cs_rank dynamic-smem cap — pick `num_stocks` exceeding the device's
+    MAX_SHARED_MEMORY_PER_BLOCK_OPTIN and assert the runtime fails with
+    the GPU-aware message instead of letting cuLaunchKernel emit a
+    generic CUDA_ERROR_INVALID_VALUE; the at-cap case must still launch.
+  * DLPack-only producer — verify it works when the object hides CAI
+    behind a wrapper, since DLPack is the path we rely on for non-CuPy
+    frameworks.
+
+`_expect_fail` returns 0 if the right error fires, 1 otherwise.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+
+import numpy as np
+
+
+# ── Fixture helpers ──────────────────────────────────────────────────
+
+def _build_elemwise_exe(cfg):
+    """Add(a, b) → out.  Workhorse for arg-validation tests."""
+    from KunQuant.Op import Builder, Input, Output
+    from KunQuant.ops import Add
+    from KunQuant.Stage import Function
+    from KunQuant.jit.cuda import compileit
+    b = Builder()
+    with b:
+        a = Input("a"); bb = Input("b")
+        Output(Add(a, bb), "out")
+    f = Function(b.ops, name="addk")
+    return compileit(f, cfg)
+
+
+def _build_cs_rank_exe(cfg):
+    """cs_rank(a) → r.  Used for the smem-cap test."""
+    from KunQuant.Op import Builder, Input, Output, Rank
+    from KunQuant.Stage import Function
+    from KunQuant.jit.cuda import compileit
+    b = Builder()
+    with b:
+        Output(Rank(Input("a")), "r")
+    f = Function(b.ops, name="csr")
+    return compileit(f, cfg)
+
+
+def _expect_fail(label, fn, needle):
+    print(f"  {label} ...", end=" ", flush=True)
+    try:
+        fn()
+    except Exception as e:
+        msg = str(e)
+        if needle in msg:
+            print(f"ok (raised: {msg.splitlines()[0][:100]})")
+            return 0
+        print(f"FAIL — wrong message: {msg!r}", file=sys.stderr)
+        return 1
+    print("FAIL — no exception raised", file=sys.stderr)
+    return 1
+
+
+# ── DLPack / arg-validation test set ────────────────────────────────
+
+def run_validation_tests(target):
+    import cupy as cp
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import CudaCompilerConfig
+
+    print("=== DLPack + arg validation ===")
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = _build_elemwise_exe(cfg)
+    ex  = KunMLIR.Executor()
+    T, S = 4, 32
+
+    rc = 0
+    a   = cp.zeros((T, S), dtype=cp.float32)
+    b   = cp.zeros((T, S), dtype=cp.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    # 1. Object that does not implement __dlpack__ (a plain int)
+    rc |= _expect_fail(
+        "object without __dlpack__ rejected",
+        lambda: ex.runGraph(exe, {"a": 0xdeadbeef, "b": b, "out": out}),
+        "does not implement __dlpack__")
+
+    # 2. Host numpy array — numpy is a CPU-only producer; it refuses
+    #    `stream != None` (our binding always passes the executor's
+    #    CUDA stream).  The error comes from numpy, not from us, but
+    #    the effect is what we want: host arrays can't sneak into a
+    #    GPU launch.
+    rc |= _expect_fail(
+        "host numpy array rejected (CPU producer)",
+        lambda: ex.runGraph(exe, {"a": np.zeros((T, S), dtype=np.float32),
+                                    "b": b, "out": out}),
+        "stream")
+
+    # 3. Wrong dtype: float64
+    rc |= _expect_fail(
+        "f64 dtype rejected",
+        lambda: ex.runGraph(exe, {"a": cp.zeros((T, S), dtype=cp.float64),
+                                    "b": b, "out": out}),
+        "need float32")
+
+    # 4. Wrong ndim: 1-D
+    rc |= _expect_fail(
+        "1-D array rejected",
+        lambda: ex.runGraph(exe, {"a": cp.zeros((T*S,), dtype=cp.float32),
+                                    "b": b, "out": out}),
+        "must be 2-D")
+
+    # 5. Wrong ndim: 3-D
+    rc |= _expect_fail(
+        "3-D array rejected",
+        lambda: ex.runGraph(exe, {"a": cp.zeros((T, S, 1), dtype=cp.float32),
+                                    "b": b, "out": out}),
+        "must be 2-D")
+
+    # 6. Non-contiguous strided view (transpose).  (T, S) and (S, T) are
+    #    different shapes, so build matching transposed b/out too.
+    a_t = a.T                                          # (S, T) view of (T, S)
+    b_t = cp.zeros((S, T), dtype=cp.float32)
+    out_t = cp.zeros((S, T), dtype=cp.float32)
+    rc |= _expect_fail(
+        "non-contiguous transposed view rejected",
+        lambda: ex.runGraph(exe, {"a": a_t, "b": b_t, "out": out_t}),
+        "not C-contiguous")
+
+    # 7. Missing graph_output
+    rc |= _expect_fail(
+        "missing graph_output rejected",
+        lambda: ex.runGraph(exe, {"a": a, "b": b}),    # no 'out'
+        "missing argument 'out'")
+
+    # 8. Shape mismatch between args
+    rc |= _expect_fail(
+        "shape mismatch rejected",
+        lambda: ex.runGraph(exe, {"a": a,
+                                    "b": cp.zeros((T, S+1), dtype=cp.float32),
+                                    "out": out}),
+        "shape mismatch")
+
+    # 9. Unknown kwarg (the hot-path skip kicks in for size == ordered,
+    #    so add a real extra to trip the strict check).
+    rc |= _expect_fail(
+        "unknown argument rejected",
+        lambda: ex.runGraph(exe, {"a": a, "b": b, "out": out, "bogus": a}),
+        "unexpected argument 'bogus'")
+
+    # 10. DLPack-only producer — wrap a cupy ndarray and hide every
+    #     attribute except __dlpack__ / __dlpack_device__.  Verifies the
+    #     binding works for objects that don't quack like CuPy (e.g.
+    #     JAX, TF, custom buffers).
+    class DLOnly:
+        def __init__(self, arr):
+            self._arr = arr
+        def __dlpack__(self, stream=None):
+            return self._arr.__dlpack__(stream=stream)
+        def __dlpack_device__(self):
+            return self._arr.__dlpack_device__()
+
+    print("  dlpack-only producer happy path ...", end=" ", flush=True)
+    try:
+        ex.runGraph(exe, {"a": DLOnly(a), "b": DLOnly(b), "out": DLOnly(out)})
+        ex.synchronize()
+        print("ok")
+    except Exception as e:
+        print(f"FAIL — DLPack-only happy path raised: {e}", file=sys.stderr)
+        rc |= 1
+
+    return rc
+
+
+# ── cs_rank smem-cap test ────────────────────────────────────────────
+
+def run_smem_cap_tests(target):
+    import cupy as cp
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import CudaCompilerConfig
+
+    print("=== cs_rank smem cap ===")
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = _build_cs_rank_exe(cfg)
+    ex  = KunMLIR.Executor()
+    rc = 0
+
+    # CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97
+    dev = cp.cuda.Device(0)
+    try:
+        max_smem = cp.cuda.runtime.deviceGetAttribute(97, dev.id)
+    except Exception:
+        max_smem = 49152    # conservative fallback
+    too_many = max_smem // 4 + 1   # one stock past the float32 cap
+    print(f"  device max_smem={max_smem} bytes; using num_stocks={too_many} "
+          f"(needs {too_many*4} bytes)")
+
+    T = 2
+    a   = cp.zeros((T, too_many), dtype=cp.float32)
+    out = cp.zeros((T, too_many), dtype=cp.float32)
+    rc |= _expect_fail(
+        "smem cap exceeded → clear error",
+        lambda: ex.runGraph(exe, {"a": a, "r": out}),
+        "MAX_SHARED_MEMORY_PER_BLOCK_OPTIN")
+
+    # At-cap case must still launch (off-by-one regression guard).
+    at_limit = max_smem // 4
+    a2   = cp.zeros((T, at_limit), dtype=cp.float32)
+    out2 = cp.zeros((T, at_limit), dtype=cp.float32)
+    print(f"  at-cap launch (num_stocks={at_limit}) ...", end=" ", flush=True)
+    try:
+        ex.runGraph(exe, {"a": a2, "r": out2})
+        ex.synchronize()
+        print("ok")
+    except Exception as e:
+        print(f"FAIL — at-cap should succeed but got: {e}", file=sys.stderr)
+        rc |= 1
+    return rc
+
+
+# ── main ─────────────────────────────────────────────────────────────
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default="sm_120")
+    args = ap.parse_args()
+
+    import cupy as cp
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    rc = 0
+    rc |= run_validation_tests(args.target)
+    print()
+    rc |= run_smem_cap_tests(args.target)
+    print()
+    print("=== all tests done ===")
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From a4830f2dfdb3c92e42046873005efe8afea13a38 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 12 May 2026 21:22:48 -0700
Subject: [PATCH 19/59] nanobind

---
 mlir/lib/Python/CMakeLists.txt  |  10 ++-
 mlir/lib/Python/IRBuilder.cpp   |  68 +++++++--------
 mlir/lib/Python/IRBuilder.h     |   6 +-
 mlir/lib/Python/MlirBinding.cpp | 143 ++++++++++++++++----------------
 4 files changed, 117 insertions(+), 110 deletions(-)

diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index b19012c..fc2be19 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -1,16 +1,18 @@
 # Python binding module for the kunir → PTX → CUBIN → launch flow.
 #
-# pybind11 is added at the top-level KunQuant CMakeLists, so the
-# `pybind11_add_module` macro is already in scope.
+# nanobind is added at the top-level KunQuant CMakeLists, so the
+# `nanobind_add_module` macro is already in scope.
 
-# pybind11 modules have undefined symbols (PyObject_*, PyExc_*, …) that
+# nanobind modules have undefined symbols (PyObject_*, PyExc_*, …) that
 # the Python interpreter resolves at module-load time.  MLIR's
 # HandleLLVMOptions adds `-Wl,-z,defs` globally, which is incompatible
 # with that policy.  Strip it locally for this subdirectory.
 string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
 string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}")
 
-pybind11_add_module(KunMLIR SHARED
+# STABLE_ABI: single .abi3.so on CPython ≥ 3.12; falls back to per-version
+# on older Pythons.  Matches the runner binding (cpp/Python).
+nanobind_add_module(KunMLIR STABLE_ABI
   MlirBinding.cpp
   IRBuilder.cpp
   PyModule.cpp
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index fc251e3..9dc62e1 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -3,7 +3,9 @@
 #include "IRBuilder.h"
 #include "PyModule.h"
 
-#include <pybind11/stl.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/unique_ptr.h>
+#include <nanobind/stl/vector.h>
 
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -24,7 +26,7 @@
 #include <string>
 #include <vector>
 
-namespace py = pybind11;
+namespace nb = nanobind;
 using namespace mlir;
 
 namespace kun_mlir_py {
@@ -266,77 +268,77 @@ std::string typeRepr(Type t) {
 
 } // namespace
 
-void registerIRBuilder(py::module &m) {
+void registerIRBuilder(nb::module_ &m) {
   // Opaque MLIR Value / Type wrappers.  No mutating methods — just an
   // identity / repr.  They live as long as the IRBuilder + its resulting
   // PyModule.
-  py::class_<Value>(m, "Value")
+  nb::class_<Value>(m, "Value")
       .def("__repr__", [](Value v) { return "<KunMLIR.Value " + valueRepr(v) + ">"; })
       .def("__str__",  [](Value v) { return valueRepr(v); });
 
-  py::class_<Type>(m, "Type")
+  nb::class_<Type>(m, "Type")
       .def("__repr__", [](Type t) { return "<KunMLIR.Type " + typeRepr(t) + ">"; })
       .def("__str__",  [](Type t) { return typeRepr(t); });
 
-  py::class_<IRBuilder>(m, "IRBuilder",
+  nb::class_<IRBuilder>(m, "IRBuilder",
         "Stateful builder that constructs a kunir module programmatically.\n"
         "Wrap your translator around this — it's the canonical alternative "
         "to round-tripping through MLIR text via parse().")
-      .def(py::init<>())
+      .def(nb::init<>())
 
       // Type
       .def("ts_type", &IRBuilder::tsType,
-            py::arg("elem_dtype"), py::arg("lookback"),
+            nb::arg("elem_dtype"), nb::arg("lookback"),
             "Build a !kunir.ts<elem_dtype, lookback>.  lookback==0 → 'inf'.")
 
       // Function
       .def("begin_func", &IRBuilder::beginFunc,
-            py::arg("name"),
-            py::arg("input_types"), py::arg("input_names"),
-            py::arg("output_names"),
-            py::arg("occupancy"), py::arg("warps_per_cta"),
-            py::arg("smem_size"), py::arg("vector_size"),
-            py::arg("result_types"),
+            nb::arg("name"),
+            nb::arg("input_types"), nb::arg("input_names"),
+            nb::arg("output_names"),
+            nb::arg("occupancy"), nb::arg("warps_per_cta"),
+            nb::arg("smem_size"), nb::arg("vector_size"),
+            nb::arg("result_types"),
             "Open a new kunir.func.  Returns its argument Values.")
-      .def("end_func", &IRBuilder::endFunc, py::arg("return_values"),
+      .def("end_func", &IRBuilder::endFunc, nb::arg("return_values"),
             "Close the current kunir.func with a kunir.return.")
 
       // Elemwise
-      .def("add",    &IRBuilder::addOp,    py::arg("lhs"), py::arg("rhs"))
-      .def("sub",    &IRBuilder::subOp,    py::arg("lhs"), py::arg("rhs"))
-      .def("mul",    &IRBuilder::mulOp,    py::arg("lhs"), py::arg("rhs"))
-      .def("div",    &IRBuilder::divOp,    py::arg("lhs"), py::arg("rhs"))
-      .def("max",    &IRBuilder::maxOp,    py::arg("lhs"), py::arg("rhs"))
-      .def("min",    &IRBuilder::minOp,    py::arg("lhs"), py::arg("rhs"))
-      .def("abs",    &IRBuilder::absOp,    py::arg("x"))
-      .def("log",    &IRBuilder::logOp,    py::arg("x"))
-      .def("sign",   &IRBuilder::signOp,   py::arg("x"))
+      .def("add",    &IRBuilder::addOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("sub",    &IRBuilder::subOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("mul",    &IRBuilder::mulOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("div",    &IRBuilder::divOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("max",    &IRBuilder::maxOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("min",    &IRBuilder::minOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("abs",    &IRBuilder::absOp,    nb::arg("x"))
+      .def("log",    &IRBuilder::logOp,    nb::arg("x"))
+      .def("sign",   &IRBuilder::signOp,   nb::arg("x"))
 
       // Windowed materialization
       .def("windowed_output", &IRBuilder::windowedOutputOp,
-            py::arg("x"), py::arg("length"))
+            nb::arg("x"), nb::arg("length"))
 
       // Back-reference + Fast windowed sum
       .def("back_ref",          &IRBuilder::backRefOp,
-            py::arg("x"), py::arg("window"))
+            nb::arg("x"), nb::arg("window"))
       .def("fast_windowed_sum", &IRBuilder::fastWindowedSumOp,
-            py::arg("x"), py::arg("window"))
+            nb::arg("x"), nb::arg("window"))
 
       // Loop
       .def("begin_for_each_back_window", &IRBuilder::beginForEachBackWindow,
-            py::arg("inputs"), py::arg("window"), py::arg("result_types"),
+            nb::arg("inputs"), nb::arg("window"), nb::arg("result_types"),
             "Open a for_each_back_window region.  Returns block args (one "
             "per loop input, type ts<elem,1>).")
       .def("end_for_each_back_window", &IRBuilder::endForEachBackWindow,
-            py::arg("yield_values"),
+            nb::arg("yield_values"),
             "Close the current for_each_back_window with a kunir.yield, "
             "returning the loop op's results.")
 
       // Reductions
-      .def("reduce_add", &IRBuilder::reduceAddOp, py::arg("x"))
-      .def("reduce_mul", &IRBuilder::reduceMulOp, py::arg("x"))
-      .def("reduce_max", &IRBuilder::reduceMaxOp, py::arg("x"))
-      .def("reduce_min", &IRBuilder::reduceMinOp, py::arg("x"))
+      .def("reduce_add", &IRBuilder::reduceAddOp, nb::arg("x"))
+      .def("reduce_mul", &IRBuilder::reduceMulOp, nb::arg("x"))
+      .def("reduce_max", &IRBuilder::reduceMaxOp, nb::arg("x"))
+      .def("reduce_min", &IRBuilder::reduceMinOp, nb::arg("x"))
 
       // Finalize / debug
       .def("to_string", &IRBuilder::toString,
diff --git a/mlir/lib/Python/IRBuilder.h b/mlir/lib/Python/IRBuilder.h
index 6fcda73..edbeb74 100644
--- a/mlir/lib/Python/IRBuilder.h
+++ b/mlir/lib/Python/IRBuilder.h
@@ -20,9 +20,9 @@
 
 #pragma once
 
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace kun_mlir_py {
-/// Register the IRBuilder + Value + Type pybind classes on `m`.
-void registerIRBuilder(::pybind11::module &m);
+/// Register the IRBuilder + Value + Type nanobind classes on `m`.
+void registerIRBuilder(::nanobind::module_ &m);
 } // namespace kun_mlir_py
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index e388ef8..edb9ca4 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -14,11 +14,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/vector.h>
+#include <nanobind/stl/unique_ptr.h>
 
 #include "PyModule.h"     // shared MLIRContext + ModuleOp wrapper
-#include "IRBuilder.h"    // pybind class for programmatic kunir construction
+#include "IRBuilder.h"    // nanobind class for programmatic kunir construction
 #include "dlpack.h"       // vendored DLPack ABI (consumer-only)
 
 #include "KunCuda/Runtime.h"
@@ -32,7 +34,7 @@
 #include <string>
 #include <vector>
 
-namespace py = pybind11;
+namespace nb = nanobind;
 
 using kun_mlir_py::PyModule;
 
@@ -61,7 +63,7 @@ static std::string pyLowerToPtx(PyModule &pm, const std::string &gpuArch,
 }
 
 //===----------------------------------------------------------------------===//
-// pybind glue: read a Python GPU array via DLPack → device pointer + shape
+// nanobind glue: read a Python GPU array via DLPack → device pointer + shape
 //===----------------------------------------------------------------------===//
 
 /// Result of reading one GPU array argument.  `ptr` is the device pointer
@@ -84,10 +86,10 @@ struct CudaArrayInfo {
 ///
 /// We're never "no sync" — every launch must serialise on the executor's
 /// stream — so `stream_ == nullptr` (default-stream executor) maps to 1.
-static py::object dlpackStreamArg(CUstream stream) {
+static nb::object dlpackStreamArg(CUstream stream) {
   if (stream == nullptr)
-    return py::int_(1);
-  return py::int_(reinterpret_cast<uintptr_t>(stream));
+    return nb::int_(1);
+  return nb::int_(reinterpret_cast<uintptr_t>(stream));
 }
 
 /// Throws if `(shape, stridesBytes)` doesn't describe a C-contiguous
@@ -121,15 +123,15 @@ static void requireRowMajorContiguous2D(const std::string &paramName,
 /// calls the producer's `deleter`.  We grab the fields we need and let
 /// the capsule fall out of scope at function exit — the underlying
 /// tensor stays alive because the user is still holding `obj`.
-static CudaArrayInfo readDLPack(py::handle obj, const std::string &paramName,
-                                  const py::object &streamArg) {
-  if (!py::hasattr(obj, "__dlpack__"))
+static CudaArrayInfo readDLPack(nb::handle obj, const std::string &paramName,
+                                  const nb::object &streamArg) {
+  if (!nb::hasattr(obj, "__dlpack__"))
     throw std::runtime_error(
         "'" + paramName + "' does not implement __dlpack__ — pass a CuPy "
         "ndarray, a PyTorch CUDA tensor, a JAX device array, or any other "
         "object exporting the DLPack protocol.");
 
-  py::object capsule = obj.attr("__dlpack__")(py::arg("stream") = streamArg);
+  nb::object capsule = obj.attr("__dlpack__")(nb::arg("stream") = streamArg);
   void *raw = PyCapsule_GetPointer(capsule.ptr(), "dltensor");
   if (!raw) {
     PyErr_Clear();
@@ -191,8 +193,8 @@ struct CollectedArgs {
 };
 
 static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
-                                   py::dict pyArgs,
-                                   const py::object &streamArg) {
+                                   nb::dict pyArgs,
+                                   const nb::object &streamArg) {
   // Graph inputs come first, then outputs — same as the buffer-table
   // layout the runtime expects.
   std::vector<std::string> ordered;
@@ -211,7 +213,7 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
   // is a typo'd key).
   if (pyArgs.size() > ordered.size()) {
     for (auto kv : pyArgs) {
-      std::string key = py::cast<std::string>(kv.first);
+      std::string key = nb::cast<std::string>(kv.first);
       bool known = false;
       for (auto &n : ordered) if (n == key) { known = true; break; }
       if (!known) {
@@ -231,7 +233,7 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
   for (size_t i = 0; i < ordered.size(); ++i) {
     const std::string &name = ordered[i];
 
-    py::object key = py::str(name);
+    nb::object key = nb::str(name.c_str());
     if (!pyArgs.contains(key)) {
       std::string expected;
       for (size_t j = 0; j < ordered.size(); ++j) {
@@ -264,11 +266,11 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
 /// Expected dict shape:
 ///   {"name": str, "kind": str, "inputs": [str...], "outputs": [str...]}
 /// where `kind` is one of "cs_rank_f32" / "cs_rank_f64".
-static kun_cuda::KernelMeta parseExternalKernel(py::handle obj) {
-  py::dict d = obj.cast<py::dict>();
+static kun_cuda::KernelMeta parseExternalKernel(nb::handle obj) {
+  nb::dict d = nb::cast<nb::dict>(obj);
   kun_cuda::KernelMeta km;
-  km.kernelName = d["name"].cast<std::string>();
-  std::string kind = d["kind"].cast<std::string>();
+  km.kernelName = nb::cast<std::string>(d["name"]);
+  std::string kind = nb::cast<std::string>(d["kind"]);
   if (kind == "cs_rank_f32")
     km.kind = kun_cuda::KernelKind::ExtCsRankF32;
   else if (kind == "cs_rank_f64")
@@ -277,10 +279,12 @@ static kun_cuda::KernelMeta parseExternalKernel(py::handle obj) {
     throw std::runtime_error(
         "KunMLIR.compile: unknown external kernel kind '" + kind +
         "' (supported: cs_rank_f32, cs_rank_f64)");
-  for (py::handle n : d["inputs"].cast<py::iterable>())
-    km.inputNames.push_back(n.cast<std::string>());
-  for (py::handle n : d["outputs"].cast<py::iterable>())
-    km.outputNames.push_back(n.cast<std::string>());
+  nb::iterable inputs  = nb::cast<nb::iterable>(d["inputs"]);
+  nb::iterable outputs = nb::cast<nb::iterable>(d["outputs"]);
+  for (nb::handle n : inputs)
+    km.inputNames.push_back(nb::cast<std::string>(n));
+  for (nb::handle n : outputs)
+    km.outputNames.push_back(nb::cast<std::string>(n));
   return km;
 }
 
@@ -292,7 +296,7 @@ pyCompile(PyModule &pm,
             const std::string &targetTriple,
             const std::string &targetFeatures, unsigned optLevel,
             const std::string &toolkitPath,
-            py::list externalKernels,
+            nb::list externalKernels,
             int warpsPerCta) {
   if (graphInputs.empty())
     throw std::runtime_error(
@@ -315,7 +319,7 @@ pyCompile(PyModule &pm,
   // Append external (pre-compiled, runtime-dispatched) kernels.  The
   // MLIR pipeline never saw them; they're fabricated here from the
   // descriptor list the Python frontend collected.
-  for (py::handle obj : externalKernels)
+  for (nb::handle obj : externalKernels)
     data.kernels.push_back(parseExternalKernel(obj));
 
   if (data.kernels.empty())
@@ -348,14 +352,14 @@ pyCompile(PyModule &pm,
 
 } // namespace
 
-PYBIND11_MODULE(KunMLIR, m) {
+NB_MODULE(KunMLIR, m) {
   m.doc() = "Bindings for the KunQuant MLIR compiler (kunir → PTX → CUBIN "
              "→ launch).";
 
   // Programmatic kunir construction (Value/Type opaque wrappers, IRBuilder).
   kun_mlir_py::registerIRBuilder(m);
 
-  py::class_<PyModule>(m, "ModuleOp")
+  nb::class_<PyModule>(m, "ModuleOp")
       .def("to_string", &PyModule::toString,
             "Return the textual MLIR form of the module.")
       .def("__str__",  &PyModule::toString)
@@ -363,34 +367,34 @@ PYBIND11_MODULE(KunMLIR, m) {
         return "<KunMLIR.ModuleOp>\n" + m.toString();
       });
 
-  m.def("parse", &PyModule::parse, py::arg("text"),
+  m.def("parse", &PyModule::parse, nb::arg("text"),
          "Parse an MLIR text fragment into a ModuleOp.");
 
   m.def("lower_to_ptx", &pyLowerToPtx,
-         py::arg("module"),
-         py::arg("gpu_arch")       = "sm_80",
-         py::arg("target_triple")  = "nvptx64-nvidia-cuda",
-         py::arg("target_features") = "",
-         py::arg("opt_level")      = 3u,
-         py::arg("toolkit_path")   = "",
+         nb::arg("module"),
+         nb::arg("gpu_arch")       = "sm_80",
+         nb::arg("target_triple")  = "nvptx64-nvidia-cuda",
+         nb::arg("target_features") = "",
+         nb::arg("opt_level")      = 3u,
+         nb::arg("toolkit_path")   = "",
          "Lower kunir → PTX text via the upstream `gpu-module-to-binary` "
          "pass with `format=isa`.  Debug / inspection only — the main "
          "compile path goes straight to cubin.");
 
-  py::class_<kun_cuda::Executable>(m, "Executable")
-      .def_property_readonly("input_names",   &kun_cuda::Executable::graphInputs,
+  nb::class_<kun_cuda::Executable>(m, "Executable")
+      .def_prop_ro("input_names",   &kun_cuda::Executable::graphInputs,
             "Graph-level input names — match this against the keys of the "
             "args dict you pass to launch().")
-      .def_property_readonly("output_names",  &kun_cuda::Executable::graphOutputs,
+      .def_prop_ro("output_names",  &kun_cuda::Executable::graphOutputs,
             "Graph-level output names — match this against the keys of the "
             "args dict you pass to launch().")
-      .def_property_readonly("warps_per_cta", &kun_cuda::Executable::warpsPerCta)
-      .def_property_readonly("vector_size",   &kun_cuda::Executable::vectorSize)
-      .def_property_readonly("num_kernels",
+      .def_prop_ro("warps_per_cta", &kun_cuda::Executable::warpsPerCta)
+      .def_prop_ro("vector_size",   &kun_cuda::Executable::vectorSize)
+      .def_prop_ro("num_kernels",
             [](const kun_cuda::Executable &e) {
               return e.numKernels();
             })
-      .def_property_readonly("kernel_names",
+      .def_prop_ro("kernel_names",
             [](const kun_cuda::Executable &e) {
               std::vector<std::string> r;
               r.reserve(e.data().kernels.size());
@@ -398,18 +402,18 @@ PYBIND11_MODULE(KunMLIR, m) {
                 r.push_back(km.kernelName);
               return r;
             })
-      .def_property_readonly("launch_order",  &kun_cuda::Executable::launchOrder,
+      .def_prop_ro("launch_order",  &kun_cuda::Executable::launchOrder,
             "Topo-sorted indices into kernel_names; the order kernels run "
             "on the single CUDA stream.")
-      .def_property_readonly("peak_intermediate_slots",
+      .def_prop_ro("peak_intermediate_slots",
             &kun_cuda::Executable::peakIntermediateSlots,
             "Number of intermediate buffers allocated by the runtime — "
             "shape `(time_length, num_stocks)` each.")
-      .def_property_readonly("num_buffers",   &kun_cuda::Executable::numBuffers)
-      .def_property_readonly("cubin",
+      .def_prop_ro("num_buffers",   &kun_cuda::Executable::numBuffers)
+      .def_prop_ro("cubin",
             [](const kun_cuda::Executable &e) {
               const auto &b = e.data().cubin;
-              return py::bytes(b.data(), b.size());
+              return nb::bytes(b.data(), b.size());
             });
 
   // ── Executor ────────────────────────────────────────────────────────
@@ -419,43 +423,42 @@ PYBIND11_MODULE(KunMLIR, m) {
   // a duck-typed object with a `.ptr` attribute (so passing a
   // `cupy.cuda.Stream` directly Just Works).  None / no arg → default
   // CUDA stream.
-  py::class_<kun_cuda::Executor>(m, "Executor",
+  nb::class_<kun_cuda::Executor>(m, "Executor",
         "Wraps a CUDA stream + provides `run_graph(exe, args)` (async) "
         "and `synchronize()`.  Default constructor uses the CUDA default "
         "stream; pass a cupy stream (or its `.ptr` integer) to share one "
         "with caller-managed code.")
-      .def(py::init([](py::object stream_arg) {
+      .def("__init__", [](kun_cuda::Executor *self, nb::object stream_arg) {
             uintptr_t ptr = 0;
             if (!stream_arg.is_none()) {
-              if (py::hasattr(stream_arg, "ptr"))
-                ptr = stream_arg.attr("ptr").cast<uintptr_t>();
+              if (nb::hasattr(stream_arg, "ptr"))
+                ptr = nb::cast<uintptr_t>(stream_arg.attr("ptr"));
               else
-                ptr = stream_arg.cast<uintptr_t>();
+                ptr = nb::cast<uintptr_t>(stream_arg);
             }
-            return std::make_unique<kun_cuda::Executor>(
-                reinterpret_cast<CUstream>(ptr));
-          }),
-          py::arg("stream") = py::none(),
+            new (self) kun_cuda::Executor(reinterpret_cast<CUstream>(ptr));
+          },
+          nb::arg("stream") = nb::none(),
           "Build an Executor.  `stream=None` → default CUDA stream; "
           "otherwise expects either an int (uintptr_t handle) or a "
           "cupy.cuda.Stream-like object exposing `.ptr`.")
-      .def_property_readonly("stream",
+      .def_prop_ro("stream",
           [](const kun_cuda::Executor &e) -> uintptr_t {
             return reinterpret_cast<uintptr_t>(e.stream());
           },
           "Raw stream handle as an int (0 ↔ CUDA default stream).")
       .def("runGraph",
           [](kun_cuda::Executor &e, kun_cuda::Executable &exe,
-              py::dict pyArgs) {
+              nb::dict pyArgs) {
             // Thread the executor's stream into __dlpack__(stream=…)
             // so producers (CuPy / PyTorch / JAX / TF) can insert the
             // cross-stream sync needed for data-readiness on our
             // launch stream.
-            py::object streamArg = dlpackStreamArg(e.stream());
+            nb::object streamArg = dlpackStreamArg(e.stream());
             auto c = collectArgs(exe, pyArgs, streamArg);
             e.runGraph(exe, c.timeLength, c.numStocks, c.args);
           },
-          py::arg("exe"), py::arg("args"),
+          nb::arg("exe"), nb::arg("args"),
           "Queue every kernel in `exe` onto this executor's stream.\n"
           "**Asynchronous** — call `.synchronize()` (or otherwise wait\n"
           "on the stream) before reading results back to host.\n"
@@ -471,16 +474,16 @@ PYBIND11_MODULE(KunMLIR, m) {
           "Block until every kernel queued on this stream completes.");
 
   m.def("compile", &pyCompile,
-         py::arg("module"),
-         py::arg("graph_inputs"),
-         py::arg("graph_outputs"),
-         py::arg("gpu_arch")       = "sm_80",
-         py::arg("target_triple")  = "nvptx64-nvidia-cuda",
-         py::arg("target_features") = "",
-         py::arg("opt_level")      = 3u,
-         py::arg("toolkit_path")   = "",
-         py::arg("external_kernels") = py::list(),
-         py::arg("warps_per_cta")    = 0,
+         nb::arg("module"),
+         nb::arg("graph_inputs"),
+         nb::arg("graph_outputs"),
+         nb::arg("gpu_arch")       = "sm_80",
+         nb::arg("target_triple")  = "nvptx64-nvidia-cuda",
+         nb::arg("target_features") = "",
+         nb::arg("opt_level")      = 3u,
+         nb::arg("toolkit_path")   = "",
+         nb::arg("external_kernels") = nb::list(),
+         nb::arg("warps_per_cta")    = 0,
          "Compile a kunir module all the way to a loaded Executable.\n"
          "\n"
          "Pipeline: kunir → LLVM dialect → upstream `gpu-module-to-binary`\n"

From b8c04b80a563da0b2e50d483000f3fb702e619d3 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 13 May 2026 01:32:22 -0700
Subject: [PATCH 20/59] time slice (mlir part)

---
 KunQuant/jit/cuda.py                         |  11 +-
 KunQuant/passes/CodegenMLIR.py               |   7 +-
 mlir/include/KunGpu/KunGpuOps.td             |  42 ++++
 mlir/include/KunGpu/KunGpuUtils.h            |  11 +
 mlir/include/KunIr/KunIrOps.td               |  15 +-
 mlir/lib/KunGpu/KunGpuToLLVM.cpp             | 206 +++++++++++++++++--
 mlir/lib/KunIr/KunIrOps.cpp                  |  20 +-
 mlir/lib/KunIr/KunIrToKunGpu.cpp             |  13 +-
 mlir/lib/Python/IRBuilder.cpp                |  13 +-
 mlir/test/kungpu/basic.mlir                  |  10 +-
 mlir/test/kungpu/kunir_to_llvm_pipeline.mlir |   9 +-
 mlir/test/kungpu/lower_to_llvm.mlir          | 140 +++++++++++--
 mlir/test/kungpu/memory_planning.mlir        |   6 +-
 mlir/test/kunir/basic.mlir                   |  18 +-
 mlir/test/kunir/func.mlir                    |  14 +-
 mlir/test/kunir/lower_to_kungpu.mlir         |  21 +-
 mlir/test/python/test_kun_mlir.py            |   2 +-
 mlir/test/python/test_multi_kernel.py        |   4 +-
 mlir/test/python/test_windowed_temp.py       |   2 +-
 19 files changed, 479 insertions(+), 85 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 3076d3d..958d706 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -33,6 +33,7 @@
 from KunQuant.Driver import optimize, post_optimize
 from KunQuant.Op import Input, Output
 from KunQuant.passes import do_partition
+from KunQuant.passes.InferWindow import infer_window
 from KunQuant.Stage import Function
 from KunQuant.passes.CodegenMLIR import TargetSpec, translate_function
 
@@ -205,7 +206,15 @@ def _translate_partitions(impl, cfg: CudaCompilerConfig):
     dtype = _to_dtype_token(cfg.dtype)
     externals = []
     for sub in impl:
-        ext = translate_function(sub, target, ir, dtype=dtype)
+        # Per-partition warmup: max windowed-chain depth from any input
+        # to any output of THIS partition.  Earlier partitions have already
+        # written their (post-warmup) values into the shared device buffers
+        # by the time this kernel runs, so we don't accumulate their
+        # unreliable counts here.  infer_window walks back to Input ops
+        # of the partition; cross-partition deps stop at those Inputs.
+        per_kernel_unreliable = max(infer_window(sub).values(), default=0)
+        ext = translate_function(sub, target, ir, dtype=dtype,
+                                   unreliable_count=per_kernel_unreliable)
         if ext is not None:
             externals.append(ext)
     return ir.finish(), externals
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 7e14a90..891fc8f 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -182,7 +182,7 @@ def _maybe_external_partition(f: Function, dtype: str):
 
 
 def translate_function(f: Function, target: TargetSpec, ir,
-                        dtype: str = "f32"):
+                        dtype: str = "f32", unreliable_count: int = 0):
     """Emit `f` as a single kunir.func into the open `ir` (KunMLIR.IRBuilder).
 
     If `f` is an externally-dispatched partition (e.g. a single cs_rank
@@ -190,6 +190,10 @@ def translate_function(f: Function, target: TargetSpec, ir,
     the IRBuilder and return its descriptor dict so the caller can pass
     it to KunMLIR.compile()'s `external_kernels=` list.  Otherwise
     return `None` after emitting a kunir.func.
+
+    `unreliable_count` is the partition-local warmup depth — the caller
+    (`KunQuant.jit.cuda`) computes it via `infer_window(f)` on this
+    post-partition Function and feeds it in.
     """
     ext = _maybe_external_partition(f, dtype)
     if ext is not None:
@@ -223,6 +227,7 @@ def translate_function(f: Function, target: TargetSpec, ir,
         output_names=out_names,
         occupancy=target.occupancy, warps_per_cta=target.warps_per_cta,
         smem_size=target.smem_size, vector_size=target.vector_size,
+        unreliable_count=unreliable_count,
         result_types=[ts_1] * len(outputs),
     )
 
diff --git a/mlir/include/KunGpu/KunGpuOps.td b/mlir/include/KunGpu/KunGpuOps.td
index 7310d9a..f81ddab 100644
--- a/mlir/include/KunGpu/KunGpuOps.td
+++ b/mlir/include/KunGpu/KunGpuOps.td
@@ -39,6 +39,48 @@ def KunGpu_TimeLengthOp : KunGpu_Op<"time_length", [Pure]> {
   let assemblyFormat = "attr-dict";
 }
 
+//===----------------------------------------------------------------------===//
+// Time-chunk loop bounds
+//
+// The outer time loop is split into multiple chunks across the y dimension
+// of the launch grid (one CTA per (stock_tile, time_chunk)).  These ops
+// encode the per-chunk loop bounds; lowering reads gpu.block_id y for
+// chunk_idx.  When num_chunks == 1 callers should set chunk_size =
+// time_length so only chunk 0 (the full range) runs.
+//===----------------------------------------------------------------------===//
+
+def KunGpu_TimeLbOp : KunGpu_Op<"time_lb", [Pure]> {
+  let summary = "Time-loop lower bound (inclusive) for the current chunk";
+  let description = [{
+    Returns:
+      chunk_idx == 0  →  0
+      otherwise       →  chunk_idx * chunk_size - warmup
+    The warmup overlap lets chunks ≥ 1 prime their windowed rolling state
+    over the trailing `warmup` steps of the previous chunk before they
+    start writing reliable outputs.  `chunk_idx` is `gpu.block_id y`.
+
+    No operands — chunk_size / warmup are runtime scalars that the
+    kungpu-to-llvm pass prepends to the gpu.func signature; lowering
+    reads them from fixed arg positions, mirroring `kungpu.time_length`.
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+def KunGpu_TimeUbOp : KunGpu_Op<"time_ub", [Pure]> {
+  let summary = "Time-loop upper bound (exclusive) for the current chunk";
+  let description = [{
+    Returns `min((chunk_idx + 1) * chunk_size, time_length)`.  `chunk_idx`
+    is `gpu.block_id y`.  The last chunk gets clipped to `time_length` so
+    `time_length` need not be a multiple of `chunk_size`.
+
+    No operands — see TimeLbOp; both chunk_size and time_length are read
+    from gpu.func args at lowering time.
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // Time-series memory ops
 //
diff --git a/mlir/include/KunGpu/KunGpuUtils.h b/mlir/include/KunGpu/KunGpuUtils.h
index 46eb6c8..1f0c270 100644
--- a/mlir/include/KunGpu/KunGpuUtils.h
+++ b/mlir/include/KunGpu/KunGpuUtils.h
@@ -22,6 +22,7 @@ namespace kungpu {
 constexpr llvm::StringLiteral kFuncTargetSpecAttr  = "kungpu.target_spec";
 constexpr llvm::StringLiteral kFuncInputNamesAttr  = "kungpu.input_names";
 constexpr llvm::StringLiteral kFuncOutputNamesAttr = "kungpu.output_names";
+constexpr llvm::StringLiteral kFuncUnreliableCountAttr = "kungpu.unreliable_count";
 
 inline ::kunir::TargetSpecAttr getFuncTargetSpec(::mlir::Operation *fn) {
   return fn->getAttrOfType<::kunir::TargetSpecAttr>(kFuncTargetSpecAttr);
@@ -47,4 +48,14 @@ inline void setFuncOutputNames(::mlir::Operation *fn,
   fn->setAttr(kFuncOutputNamesAttr, names);
 }
 
+inline int64_t getFuncUnreliableCount(::mlir::Operation *fn) {
+  auto attr = fn->getAttrOfType<::mlir::IntegerAttr>(kFuncUnreliableCountAttr);
+  return attr ? attr.getInt() : 0;
+}
+inline void setFuncUnreliableCount(::mlir::Operation *fn, int64_t v) {
+  fn->setAttr(kFuncUnreliableCountAttr,
+              ::mlir::IntegerAttr::get(
+                  ::mlir::IntegerType::get(fn->getContext(), 64), v));
+}
+
 } // namespace kungpu
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index cedfec5..8829abe 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -271,6 +271,7 @@ def KunIr_FastWindowedSumOp : KunIr_Op<"fast_windowed_sum", [Pure]> {
 //       inputs {%arg0 = "name0", ...}
 //       outputs {"out0", ...}            // non-void: one string per result
 //       target {occupancy = V, warps_per_cta = V, smem_size = V}
+//       unreliable_count = N
 //       -> (result_type0, ...) {
 //     body
 //   }
@@ -278,13 +279,19 @@ def KunIr_FastWindowedSumOp : KunIr_Op<"fast_windowed_sum", [Pure]> {
 //   kunir.func @name(%arg0: type0, %arg1: type1)
 //       inputs {%arg0 = "input0"}
 //       outputs {%arg1 = "output0"}      // void: %argN = "name" form
-//       target {...} {
+//       target {...}
+//       unreliable_count = N {
 //     body
 //   }
 //
 // Constraints (void case):   len(inputs) + len(outputs) == len(block_args)
 // Constraints (non-void):    len(inputs) == len(block_args),
 //                            len(outputs) == num_results
+//
+// `unreliable_count` is the per-partition warmup depth: max over all
+// outputs of (sum of windowed op windows from any input to that output).
+// Required because callers (runtime) need it to size the time-axis
+// chunk grid and to thread the kernel's warmup arg.
 //===----------------------------------------------------------------------===//
 
 def KunIr_FuncOp : KunIr_Op<"func", [
@@ -298,7 +305,8 @@ def KunIr_FuncOp : KunIr_Op<"func", [
     TypeAttr:$function_type,
     ArrayAttr:$input_names,
     ArrayAttr:$output_names,
-    KunIr_TargetSpecAttr:$target_spec
+    KunIr_TargetSpecAttr:$target_spec,
+    I64Attr:$unreliable_count
   );
   let regions = (region SizedRegion<1>:$body);
   let hasCustomAssemblyFormat = 1;
@@ -309,7 +317,8 @@ def KunIr_FuncOp : KunIr_Op<"func", [
                    "::mlir::FunctionType":$type,
                    "::mlir::ArrayAttr":$inputNames,
                    "::mlir::ArrayAttr":$outputNames,
-                   "::kunir::TargetSpecAttr":$targetSpec)>
+                   "::kunir::TargetSpecAttr":$targetSpec,
+                   "int64_t":$unreliableCount)>
   ];
   let extraClassDeclaration = [{
     // getFunctionType() is generated by tablegen and returns mlir::Type.
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 88cd177..24ed311 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -85,6 +85,32 @@ struct WTDesc {
 };
 using WTDescMap = llvm::DenseMap<Value, WTDesc>;
 
+// Per-function cache for chunk-related values shared across multiple
+// output-store rewrites.  Each gpu.func builds (at most) one mask
+// index_cast and one write_start SSA value; subsequent ts.put rewrites
+// against an output arg reuse them, so we don't lean on a downstream
+// CSE pass.
+//
+// Both cached values are index-typed (not i32) because they're combined
+// with the index-typed loop induction variable: writeStart as an
+// arith.cmpi operand, mask as the output-index shift.  The runtime scalar
+// args (mask, chunk_size, warmup) are i32; the helpers below insert the
+// i32 → index cast once at function entry.
+//
+//   mask         : index, cast once from arg[2] (i32).  Used to shift
+//                  output indices: out[t - mask, sid].
+//   writeStart   : (block_id y == 0) ? mask : block_id y * chunk_size.
+//                  Output stores below this time-index are suppressed —
+//                  they fall in the warmup-overlap region.
+//
+// Both are emitted at the very top of the function entry block so they
+// dominate every store site, regardless of how deeply nested.
+struct ChunkContext {
+  Value mask;
+  Value writeStart;
+};
+using ChunkCtxMap = llvm::DenseMap<Operation *, ChunkContext>;
+
 //===----------------------------------------------------------------------===//
 // Helper: stock_id = blockIdx.x * blockDim.x + threadIdx.x  (index-typed)
 // Defined here so phase 1 (`convertFuncSignature` below) can reuse it
@@ -129,7 +155,13 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
   }
 
   FunctionType oldFT = fn.getFunctionTypeTyped();
-  SmallVector<Type> newArgTypes = {i32Ty, i32Ty};
+  // Prepend (time_length, num_stocks, mask, chunk_size, warmup) — all
+  // i32.  time_length / num_stocks shape the linear gmem indexing;
+  // mask / chunk_size / warmup feed the multi-chunk time-axis path
+  // (kungpu.time_lb / time_ub / output-store gating).  64-bit math is
+  // slow on GPUs, so we keep them as i32 and cast to index only at the
+  // few places that need it.
+  SmallVector<Type> newArgTypes = {i32Ty, i32Ty, i32Ty, i32Ty, i32Ty};
   for (Type t : oldFT.getInputs())
     newArgTypes.push_back(t);
 
@@ -144,6 +176,7 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
   setFuncTargetSpec (newFunc, fn.getTargetSpecAttr());
   setFuncInputNames (newFunc, fn.getInputNames());
   setFuncOutputNames(newFunc, fn.getOutputNames());
+  setFuncUnreliableCount(newFunc, fn.getUnreliableCount());
 
   // gpu.func's auto-created entry block is replaced with the kunir.func
   // body.  Block-arg types initially still match the kunir.func signature;
@@ -151,8 +184,11 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
   // gpu.func type (ts → !llvm.ptr).
   newFunc.getBody().takeBody(fn.getBody());
   Block &entry = newFunc.getBody().front();
-  entry.insertArgument(0u, i32Ty, loc);
-  entry.insertArgument(1u, i32Ty, loc);
+  entry.insertArgument(0u, i32Ty, loc); // time_length
+  entry.insertArgument(1u, i32Ty, loc); // num_stocks
+  entry.insertArgument(2u, i32Ty, loc); // mask
+  entry.insertArgument(3u, i32Ty, loc); // chunk_size
+  entry.insertArgument(4u, i32Ty, loc); // warmup
 
   SmallVector<kunir::ReturnOp> returns;
   newFunc.walk([&](kunir::ReturnOp r) { returns.push_back(r); });
@@ -253,6 +289,121 @@ struct TimeLengthPattern : OpConversionPattern<TimeLengthOp> {
   }
 };
 
+// time_lb = (block_id y == 0) ? 0 : block_id y * chunk_size - warmup
+// All arithmetic happens in i32 (64-bit ops are slow on GPU); a single
+// index_cast at the end produces the index-typed scf.for bound.
+// chunk_size / warmup come from gpu.func args[3] / args[4]; the op has
+// no operands at the kungpu level.
+struct TimeLbPattern : OpConversionPattern<TimeLbOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(TimeLbOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    auto i32Ty = rewriter.getI32Type();
+    auto idxTy = rewriter.getIndexType();
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    Value chunkSize = fn.getBody().front().getArgument(3);
+    Value warmup    = fn.getBody().front().getArgument(4);
+    Value cyIdx = rewriter.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::y);
+    Value cy = rewriter.create<arith::IndexCastOp>(loc, i32Ty, cyIdx);
+    Value c0 = rewriter.create<arith::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value isFirst = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::eq, cy, c0);
+    Value off = rewriter.create<arith::MulIOp>(loc, cy, chunkSize);
+    Value offMinusW = rewriter.create<arith::SubIOp>(loc, off, warmup);
+    Value lbI32 = rewriter.create<arith::SelectOp>(loc, isFirst, c0, offMinusW);
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, idxTy, lbI32);
+    return success();
+  }
+};
+
+// time_ub = min((block_id y + 1) * chunk_size, time_length)
+// chunk_size / time_length come from gpu.func args[3] / args[0]; both
+// are i32 so the math stays in i32 with one final cast to index.
+struct TimeUbPattern : OpConversionPattern<TimeUbOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(TimeUbOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    auto i32Ty = rewriter.getI32Type();
+    auto idxTy = rewriter.getIndexType();
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    Value timeLen   = fn.getBody().front().getArgument(0);
+    Value chunkSize = fn.getBody().front().getArgument(3);
+    Value cyIdx = rewriter.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::y);
+    Value cy = rewriter.create<arith::IndexCastOp>(loc, i32Ty, cyIdx);
+    Value c1 = rewriter.create<arith::ConstantOp>(
+        loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value next = rewriter.create<arith::AddIOp>(loc, cy, c1);
+    Value end = rewriter.create<arith::MulIOp>(loc, next, chunkSize);
+    Value ubI32 = rewriter.create<arith::MinUIOp>(loc, end, timeLen);
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, idxTy, ubI32);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Chunk-context lazy helpers.  See ChunkContext above.
+//
+// mask / chunk_size / warmup come in as i32 func args (positions 2 / 3 /
+// 4 after time_length / num_stocks).  mask is cast to index once per
+// function and cached; writeStart is built independently from the raw i32
+// mask / chunk_size args.  Both emissions land at the top of the function
+// entry block so the SSA values dominate every store-site in the kernel.
+//===----------------------------------------------------------------------===//
+
+static Value getOrCreateMask(Operation *op, ChunkCtxMap &map,
+                              ConversionPatternRewriter &rewriter) {
+  auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+  ChunkContext &ctx = map[fn.getOperation()];
+  if (ctx.mask) return ctx.mask;
+  // arg layout: (i32 time_length, i32 num_stocks, i32 mask, i32 chunk_size,
+  //              i32 warmup, ts...)
+  Value maskI32 = fn.getBody().front().getArgument(2);
+  Location loc = fn.getLoc();
+
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPointToStart(&fn.getBody().front());
+  ctx.mask = rewriter.create<arith::IndexCastOp>(loc, rewriter.getIndexType(),
+                                                    maskI32);
+  return ctx.mask;
+}
+
+static Value getOrCreateWriteStart(Operation *op, ChunkCtxMap &map,
+                                     ConversionPatternRewriter &rewriter) {
+  auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+  ChunkContext &ctx = map[fn.getOperation()];
+  if (ctx.writeStart) return ctx.writeStart;
+
+  // Compute in i32 (cheap on GPU) then cast once to index, since the
+  // result is compared against the scf.for IV (index-typed).  We read
+  // the i32 mask and chunk_size args directly — not the cached index
+  // mask — so the mask helper and this helper don't depend on each
+  // other and either order is fine.
+  Block &entry = fn.getBody().front();
+  Value maskI32      = entry.getArgument(2);
+  Value chunkSizeI32 = entry.getArgument(3);
+  Location loc = fn.getLoc();
+
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPointToStart(&entry);
+  auto i32Ty = rewriter.getI32Type();
+  auto idxTy = rewriter.getIndexType();
+  Value cyIdx = rewriter.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::y);
+  Value cy = rewriter.create<arith::IndexCastOp>(loc, i32Ty, cyIdx);
+  Value c0 = rewriter.create<arith::ConstantOp>(
+      loc, i32Ty, rewriter.getI32IntegerAttr(0));
+  Value isFirst = rewriter.create<arith::CmpIOp>(
+      loc, arith::CmpIPredicate::eq, cy, c0);
+  Value off = rewriter.create<arith::MulIOp>(loc, cy, chunkSizeI32);
+  Value wsI32 = rewriter.create<arith::SelectOp>(loc, isFirst, maskI32, off);
+  ctx.writeStart = rewriter.create<arith::IndexCastOp>(loc, idxTy, wsI32);
+  return ctx.writeStart;
+}
+
 struct StockIdPattern : OpConversionPattern<StockIdOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
@@ -439,9 +590,11 @@ struct TsGetPattern : OpConversionPattern<TsGetOp> {
 
 struct TsPutPattern : OpConversionPattern<TsPutOp> {
   WTDescMap &descMap;
+  ChunkCtxMap &chunkCtx;
 
-  TsPutPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m)
-      : OpConversionPattern(tc, ctx), descMap(m) {}
+  TsPutPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m,
+                ChunkCtxMap &c)
+      : OpConversionPattern(tc, ctx), descMap(m), chunkCtx(c) {}
 
   LogicalResult
   matchAndRewrite(TsPutOp op, OpAdaptor adaptor,
@@ -486,13 +639,33 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
       rewriter.create<LLVM::StoreOp>(loc, newPos, desc.posPtr);
       rewriter.eraseOp(op);
     } else {
-      // ── global ts: write at current time ──────────────────────────
-      Value timeIdx = getCurrentTimeIdx(op);
-      Value gep = gmemGEPWithOffset(rewriter, loc, elemTy, ptrTy, tsPtr,
-                                     timeIdx, /*offsetIdx=*/Value(),
-                                     getNumStocksI64(rewriter, op, loc),
+      // ── global ts: write at current time, gated by per-chunk write_start,
+      //    output index shifted by `mask` so the output array's time dim is
+      //    `time_length - mask`.
+      //
+      //   if (t >= write_start)
+      //     out[t - mask, sid] = v
+      //
+      // The `t >= write_start` comparison is uniform across the CTA (all
+      // threads share the same scf.for IV), so the lowered branch is a
+      // single uniform predicate — no warp divergence at chunk boundaries.
+      Value timeIdx    = getCurrentTimeIdx(op);
+      Value writeStart = getOrCreateWriteStart(op, chunkCtx, rewriter);
+      Value mask       = getOrCreateMask(op, chunkCtx, rewriter);
+
+      Value doWrite = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::sge, timeIdx, writeStart);
+      auto ifOp = rewriter.create<scf::IfOp>(
+          loc, /*resultTypes=*/TypeRange{}, doWrite,
+          /*withElseRegion=*/false);
+
+      OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
+      Value tOut = ib.create<arith::SubIOp>(loc, timeIdx, mask);
+      Value gep = gmemGEPWithOffset(ib, loc, elemTy, ptrTy, tsPtr,
+                                     tOut, /*offsetIdx=*/Value(),
+                                     getNumStocksI64(ib, op, loc),
                                      idxTy, i64Ty);
-      rewriter.create<LLVM::StoreOp>(loc, v, gep);
+      ib.create<LLVM::StoreOp>(loc, v, gep);
       rewriter.eraseOp(op);
     }
     return success();
@@ -722,7 +895,8 @@ struct ConvertKunGpuToLLVMPass
                            LLVM::LLVMDialect, gpu::GPUDialect>();
     target.addLegalOp<ModuleOp, UnrealizedConversionCastOp>();
     target.addIllegalOp<WindowedTempOp, TsGetOp, TsPutOp,
-                        TimeLengthOp, StockIdOp, BlockStockCountOp>();
+                        TimeLengthOp, TimeLbOp, TimeUbOp,
+                        StockIdOp, BlockStockCountOp>();
     target.addIllegalOp<kunir::FastWindowedSumOp>();
     // gpu.func is legal only after its signature has been converted from
     // (...kunir.ts) to (...!llvm.ptr) by the FunctionOpInterface pattern
@@ -734,15 +908,17 @@ struct ConvertKunGpuToLLVMPass
     // gpu.return is void in our IR — always legal.
 
     WTDescMap descMap;
+    ChunkCtxMap chunkCtx;
     int smemCounter = 0;
 
     RewritePatternSet patterns(ctx);
     populateFunctionOpInterfaceTypeConversionPattern<gpu::GPUFuncOp>(
         patterns, typeConv);
-    patterns.add<TimeLengthPattern, StockIdPattern, BlockStockCountPattern>(
-        typeConv, ctx);
+    patterns.add<TimeLengthPattern, TimeLbPattern, TimeUbPattern,
+                  StockIdPattern, BlockStockCountPattern>(typeConv, ctx);
     patterns.add<WindowedTempPattern>(typeConv, ctx, descMap, smemCounter);
-    patterns.add<TsGetPattern, TsPutPattern>(typeConv, ctx, descMap);
+    patterns.add<TsGetPattern>(typeConv, ctx, descMap);
+    patterns.add<TsPutPattern>(typeConv, ctx, descMap, chunkCtx);
     patterns.add<FastWindowedSumPattern>(typeConv, ctx);
 
     if (failed(applyPartialConversion(module, target, std::move(patterns))))
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index fb5d155..771184d 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -429,12 +429,14 @@ Value ReduceMinOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value ele
 void FuncOp::build(OpBuilder &b, OperationState &result,
                    StringRef name, FunctionType type,
                    ArrayAttr inputNames, ArrayAttr outputNames,
-                   TargetSpecAttr targetSpec) {
+                   TargetSpecAttr targetSpec, int64_t unreliableCount) {
   result.addAttribute(getSymNameAttrName(result.name), b.getStringAttr(name));
   result.addAttribute(getFunctionTypeAttrName(result.name), TypeAttr::get(type));
   result.addAttribute(getInputNamesAttrName(result.name), inputNames);
   result.addAttribute(getOutputNamesAttrName(result.name), outputNames);
   result.addAttribute(getTargetSpecAttrName(result.name), targetSpec);
+  result.addAttribute(getUnreliableCountAttrName(result.name),
+                        b.getI64IntegerAttr(unreliableCount));
   Region *body = result.addRegion();
   Block *block = new Block;
   for (Type inputType : type.getInputs())
@@ -497,6 +499,11 @@ LogicalResult FuncOp::verify() {
     return emitOpError("target smem_size must be non-negative, got ")
            << ts.getSmemSize();
 
+  // Validate unreliable_count
+  if (getUnreliableCount() < 0)
+    return emitOpError("unreliable_count must be non-negative, got ")
+           << getUnreliableCount();
+
   return success();
 }
 
@@ -556,6 +563,14 @@ ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) {
   if (!targetSpec) return failure();
   result.addAttribute(getTargetSpecAttrName(result.name), targetSpec);
 
+  // unreliable_count = N
+  if (parser.parseKeyword("unreliable_count") || parser.parseEqual())
+    return failure();
+  int64_t unrelVal = 0;
+  if (parser.parseInteger(unrelVal)) return failure();
+  result.addAttribute(getUnreliableCountAttrName(result.name),
+                       b.getI64IntegerAttr(unrelVal));
+
   // -> (result_type, ...) or -> result_type  [optional]
   SmallVector<Type> resultTypes;
   if (parser.parseOptionalArrow().succeeded()) {
@@ -635,6 +650,9 @@ void FuncOp::print(OpAsmPrinter &p) {
   p << " target ";
   getTargetSpec().print(p);
 
+  // unreliable_count = N
+  p << " unreliable_count = " << getUnreliableCount();
+
   // -> result types (non-void)
   auto resultTypes = ft.getResults();
   if (!resultTypes.empty()) {
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 359f7ce..0577059 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -149,6 +149,9 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
 
   // ------------------------------------------------------------------
   // 1. Extend function signature: ts return types → extra output params.
+  //    Runtime-scalar args (time_length, num_stocks, mask, chunk_size,
+  //    warmup) are added later by convert-kungpu-to-llvm's
+  //    convertFuncSignature, not here.
   // ------------------------------------------------------------------
   FunctionType oldFT = funcOp.getFunctionTypeTyped();
   SmallVector<Type> newArgTys(oldFT.getInputs());
@@ -191,7 +194,13 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   OpBuilder b(ctx);
   b.setInsertionPoint(origOps.front());
 
-  Value timeLen = b.create<TimeLengthOp>(loc, b.getIndexType());
+  // Per-chunk bounds.  Both ops are operandless — chunk_size / warmup /
+  // time_length all live as kernel scalar args added by
+  // convert-kungpu-to-llvm and are read at lowering time.  When the
+  // caller's launcher uses num_chunks = 1 it sets chunk_size =
+  // time_length so chunk 0 covers the full range.
+  Value lb = b.create<TimeLbOp>(loc, b.getIndexType());
+  Value ub = b.create<TimeUbOp>(loc, b.getIndexType());
   Value c0 = b.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = b.create<arith::ConstantIndexOp>(loc, 1);
   // Outer-loop ts.get/put always reference the current time step, i.e.
@@ -199,7 +208,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   // every use inside the loop body.
   Value zeroOffsetI32 = b.create<arith::ConstantOp>(
       loc, b.getI32Type(), b.getI32IntegerAttr(0));
-  auto outerFor = b.create<scf::ForOp>(loc, c0, timeLen, c1);
+  auto outerFor = b.create<scf::ForOp>(loc, lb, ub, c1);
 
   // Erase the implicit empty scf.yield (no iter_args → zero-operand yield).
   outerFor.getBody()->back().erase();
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 9dc62e1..d1970b4 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -72,6 +72,7 @@ class IRBuilder {
               std::vector<std::string> outputNames,
               int64_t occupancy, int64_t warpsPerCta,
               int64_t smemSize, int64_t vectorSize,
+              int64_t unreliableCount,
               std::vector<Type> resultTypes) {
     if (curFunc_)
       throw std::runtime_error(
@@ -85,6 +86,10 @@ class IRBuilder {
       throw std::runtime_error(
           "IRBuilder.begin_func: result_types and output_names must have "
           "the same length (non-void form: outputs become result types)");
+    if (unreliableCount < 0)
+      throw std::runtime_error(
+          "IRBuilder.begin_func: unreliable_count must be non-negative, got "
+          + std::to_string(unreliableCount));
 
     // Restore insertion point to the gpu.module body before starting a
     // new function (in case end_func left us at module scope already).
@@ -104,7 +109,8 @@ class IRBuilder {
                                                 smemSize, vectorSize);
 
     curFunc_ = b_.create<kunir::FuncOp>(loc, name, funcType, inNamesAttr,
-                                          outNamesAttr, target);
+                                          outNamesAttr, target,
+                                          unreliableCount);
     Block &entry = curFunc_.getBodyBlock();
     b_.setInsertionPointToStart(&entry);
 
@@ -298,8 +304,11 @@ void registerIRBuilder(nb::module_ &m) {
             nb::arg("output_names"),
             nb::arg("occupancy"), nb::arg("warps_per_cta"),
             nb::arg("smem_size"), nb::arg("vector_size"),
+            nb::arg("unreliable_count"),
             nb::arg("result_types"),
-            "Open a new kunir.func.  Returns its argument Values.")
+            "Open a new kunir.func.  Returns its argument Values.  "
+            "`unreliable_count` is the partition-local warmup depth "
+            "(max windowed-chain depth from any input to any output).")
       .def("end_func", &IRBuilder::endFunc, nb::arg("return_values"),
             "Close the current kunir.func with a kunir.return.")
 
diff --git a/mlir/test/kungpu/basic.mlir b/mlir/test/kungpu/basic.mlir
index 1099641..128fb3c 100644
--- a/mlir/test/kungpu/basic.mlir
+++ b/mlir/test/kungpu/basic.mlir
@@ -4,7 +4,7 @@
 // CHECK-LABEL: kunir.func @test_stock_id
 kunir.func @test_stock_id()
     inputs {} outputs {"id"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> index {
   // CHECK: kungpu.stock_id
   %id = kungpu.stock_id
@@ -14,7 +14,7 @@ kunir.func @test_stock_id()
 // CHECK-LABEL: kunir.func @test_block_stock_count
 kunir.func @test_block_stock_count()
     inputs {} outputs {"n"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> index {
   // CHECK: kungpu.block_stock_count
   %n = kungpu.block_stock_count
@@ -24,7 +24,7 @@ kunir.func @test_block_stock_count()
 // CHECK-LABEL: kunir.func @test_time_length
 kunir.func @test_time_length()
     inputs {} outputs {"len"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> index {
   // CHECK: kungpu.time_length
   %len = kungpu.time_length
@@ -35,7 +35,7 @@ kunir.func @test_time_length()
 kunir.func @test_ts_get_put(%ts_in: !kunir.ts<f32, inf>, %ts_out: !kunir.ts<f32, 1>)
     inputs {%ts_in = "ts_in"}
     outputs {%ts_out = "ts_out"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0 {
   %off = arith.constant 0 : i32
   // CHECK: kungpu.ts.get
   // CHECK-SAME: <f32, inf> -> f32
@@ -48,7 +48,7 @@ kunir.func @test_ts_get_put(%ts_in: !kunir.ts<f32, inf>, %ts_out: !kunir.ts<f32,
 // CHECK-LABEL: kunir.func @test_windowed_temp
 kunir.func @test_windowed_temp()
     inputs {} outputs {"v"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> f32 {
   %off = arith.constant 0 : i32
   // CHECK: %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
diff --git a/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir b/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir
index 9620fe3..6837f69 100644
--- a/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir
+++ b/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir
@@ -17,11 +17,14 @@
 
 // CHECK:       gpu.module @kungpu_kernels
 
-// llvm.func with the (i32 time_len, i32 num_stocks, ptr...) signature,
-// tagged as a kernel by convert-gpu-to-nvvm.
+// llvm.func with the (i32 time_len, i32 num_stocks, i32 mask, i32 chunk_size,
+// i32 warmup, ptr...) signature, tagged as a kernel by convert-gpu-to-nvvm.
 // CHECK-LABEL: llvm.func @test_addsum
 // CHECK-SAME:    i32
 // CHECK-SAME:    i32
+// CHECK-SAME:    i32
+// CHECK-SAME:    i32
+// CHECK-SAME:    i32
 // CHECK-SAME:    !llvm.ptr
 // CHECK-SAME:    !llvm.ptr
 // CHECK-SAME:    !llvm.ptr
@@ -52,7 +55,7 @@ gpu.module @kungpu_kernels {
   kunir.func @test_addsum(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
       inputs {%a = "a", %b = "b"}
       outputs {"sum"}
-      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
       -> !kunir.ts<f32, 1> {
     %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
     kunir.return %s : !kunir.ts<f32, 1>
diff --git a/mlir/test/kungpu/lower_to_llvm.mlir b/mlir/test/kungpu/lower_to_llvm.mlir
index 0107ec4..0736709 100644
--- a/mlir/test/kungpu/lower_to_llvm.mlir
+++ b/mlir/test/kungpu/lower_to_llvm.mlir
@@ -15,6 +15,77 @@ gpu.module @kungpu_kernels {
 // CHECK-SAME:    !llvm.array<{{[0-9]+}} x f32>
 
 
+// =====================================================================
+// Case 0 — time_lb / time_ub lowering in isolation.
+//
+// Both ops do their arithmetic in i32 (64-bit ops are slow on GPU);
+// only the final scf.for bound is cast back to index.
+//
+//   time_lb = (cy_i32 == 0) ? 0 : cy_i32 * chunk_size - warmup
+//   time_ub = min((cy_i32 + 1) * chunk_size, time_length)
+//
+// where cy = gpu.block_id y.  The function body is just the two bound
+// ops feeding an scf.for with an empty body, so the output shows the
+// two bound computations in isolation; kungpu.time_lb / time_ub are
+// illegal after conversion and must not appear in the output IR.
+// =====================================================================
+//
+// CHECK-LABEL: gpu.func @test_time_bounds(
+// CHECK-SAME:    %[[TL_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[NS:[^:]+]]: i32,
+// CHECK-SAME:    %[[MASK_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[CSZ_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[WUP_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[IN:[^:]+]]: !llvm.ptr,
+// CHECK-SAME:    %[[OUT:[^:]+]]: !llvm.ptr
+// CHECK-SAME:    kernel
+//
+// ── time_lb lowering ───────────────────────────────────────────────────
+// chunk_size / warmup are read from gpu.func args directly (op is
+// operandless at this level).
+//   lb_i32 = select(cy_i32 == 0, 0, cy_i32 * chunk_size - warmup)
+//   lb     = index_cast lb_i32
+// CHECK:         %[[CY_LB_IDX:.*]] = gpu.block_id y
+// CHECK:         %[[CY_LB:.*]] = arith.index_cast %[[CY_LB_IDX]] : index to i32
+// CHECK:         %[[LBC0:.*]] = arith.constant 0 : i32
+// CHECK:         %[[ISFST_LB:.*]] = arith.cmpi eq, %[[CY_LB]], %[[LBC0]] : i32
+// CHECK:         %[[OFF_LB:.*]] = arith.muli %[[CY_LB]], %[[CSZ_I32]] : i32
+// CHECK:         %[[OFFMW:.*]] = arith.subi %[[OFF_LB]], %[[WUP_I32]] : i32
+// CHECK:         %[[LB_I32:.*]] = arith.select %[[ISFST_LB]], %[[LBC0]], %[[OFFMW]] : i32
+// CHECK:         %[[LB:.*]] = arith.index_cast %[[LB_I32]] : i32 to index
+//
+// ── time_ub lowering ───────────────────────────────────────────────────
+// chunk_size / time_length are read from gpu.func args directly.
+//   ub_i32 = min((cy_i32 + 1) * chunk_size, time_length)
+//   ub     = index_cast ub_i32
+// CHECK:         %[[CY_UB_IDX:.*]] = gpu.block_id y
+// CHECK:         %[[CY_UB:.*]] = arith.index_cast %[[CY_UB_IDX]] : index to i32
+// CHECK:         %[[UBC1:.*]] = arith.constant 1 : i32
+// CHECK:         %[[CYP1:.*]] = arith.addi %[[CY_UB]], %[[UBC1]] : i32
+// CHECK:         %[[END:.*]] = arith.muli %[[CYP1]], %[[CSZ_I32]] : i32
+// CHECK:         %[[UB_I32:.*]] = arith.minui %[[END]], %[[TL_I32]] : i32
+// CHECK:         %[[UB:.*]] = arith.index_cast %[[UB_I32]] : i32 to index
+//
+// Resulting scf.for picks up the two bounds.
+// CHECK:         scf.for %{{.*}} = %[[LB]] to %[[UB]] step %{{.*}}
+// CHECK:         gpu.return
+//
+// The kungpu ops are illegal in the final IR.
+// CHECK-NOT:     kungpu.time_lb
+// CHECK-NOT:     kungpu.time_ub
+kunir.func @test_time_bounds(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  %lb = kungpu.time_lb
+  %ub = kungpu.time_ub
+  %c1 = arith.constant 1 : index
+  scf.for %t = %lb to %ub step %c1 {
+  }
+  kunir.return
+}
+
+
 // =====================================================================
 // Case 1 — gmem-only: signature change, time_length lowering, TxS GEPs.
 // =====================================================================
@@ -22,6 +93,9 @@ gpu.module @kungpu_kernels {
 // CHECK-LABEL: gpu.func @test_copy(
 // CHECK-SAME:    %[[TL:[^:]+]]: i32,
 // CHECK-SAME:    %[[NS:[^:]+]]: i32,
+// CHECK-SAME:    %[[MASK_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[CSZ_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[WUP_I32:[^:]+]]: i32,
 // CHECK-SAME:    %[[IN:[^:]+]]: !llvm.ptr,
 // CHECK-SAME:    %[[OUT:[^:]+]]: !llvm.ptr
 // kernel attribute is set, kunir-func metadata preserved as discardables:
@@ -30,6 +104,20 @@ gpu.module @kungpu_kernels {
 // CHECK-SAME:    kungpu.output_names = ["out"]
 // CHECK-SAME:    kungpu.target_spec = #kunir<target_spec{
 //
+// ── Per-function chunk write_start cache, lazily inserted at entry ────
+// All chunk arithmetic stays in i32 (64-bit ops are slow on GPU); only
+// the final write_start gets an index_cast for comparing against the
+// index-typed scf.for IV.  Mask cast (separate, used for t-mask subi
+// inside the loop) hoists to entry too.
+// CHECK:       %[[MASK:.*]] = arith.index_cast %[[MASK_I32]] : i32 to index
+// CHECK:       %[[CY_IDX:.*]] = gpu.block_id y
+// CHECK:       %[[CY:.*]] = arith.index_cast %[[CY_IDX]] : index to i32
+// CHECK:       %[[CYC0:.*]] = arith.constant 0 : i32
+// CHECK:       %[[ISFIRST:.*]] = arith.cmpi eq, %[[CY]], %[[CYC0]] : i32
+// CHECK:       %[[CYMUL:.*]] = arith.muli %[[CY]], %[[CSZ_I32]] : i32
+// CHECK:       %[[WSTART_I32:.*]] = arith.select %[[ISFIRST]], %[[MASK_I32]], %[[CYMUL]] : i32
+// CHECK:       %[[WSTART:.*]] = arith.index_cast %[[WSTART_I32]] : i32 to index
+//
 // ── Active-thread guard prologue ──────────────────────────────────────
 // Computes stock_id = bid*bdim + tid, compares with %num_stocks, then
 // wraps the original kernel body in scf.if so threads with
@@ -43,11 +131,15 @@ gpu.module @kungpu_kernels {
 // CHECK:       %[[ACTIVE:.*]] = arith.cmpi slt, %[[SIDI]], %[[NS]] : i32
 // CHECK:       scf.if %[[ACTIVE]] {
 //
-// time_length → arith.index_cast of arg0 (i32 → index)
-// CHECK:         %[[TLIDX:.*]] = arith.index_cast %[[TL]] : i32 to index
+// time_lb / time_ub lowering is verified in detail by @test_time_bounds
+// above.  Here we only assert that the scf.for picks up index-typed
+// bounds (which can only be the index_cast results of time_lb / time_ub
+// since chunk_size / warmup are i32).
+// CHECK:         %[[LB:.*]] = arith.index_cast %{{.*}} : i32 to index
+// CHECK:         %[[UB:.*]] = arith.index_cast %{{.*}} : i32 to index
 // CHECK:         %[[OFFCST:.*]] = arith.constant 0 : i32
 //
-// CHECK:         scf.for %[[T:.*]] = %{{.*}} to %[[TLIDX]] step %{{.*}}
+// CHECK:         scf.for %[[T:.*]] = %[[LB]] to %[[UB]] step %{{.*}}
 //
 // ── ts.get on global %in at offset 0 ───────────────────────────────────
 // effective time = t − 0; stock_id = bid*bdim + tid; lin = effT*ns + sid.
@@ -67,25 +159,28 @@ gpu.module @kungpu_kernels {
 // CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[IN]][%[[LIN]]] {{.*}} -> !llvm.ptr, f32
 // CHECK:         %[[V:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> f32
 //
-// ── ts.put on global %out (no offset; writes at current iv) ───────────
-// CHECK:         %[[NS64B:.*]] = arith.extsi %[[NS]] : i32 to i64
-// CHECK:         %[[T64:.*]] = arith.index_cast %[[T]] : index to i64
-// CHECK:         %[[ROW2:.*]] = arith.muli %[[T64]], %[[NS64B]] : i64
-// CHECK:         %[[LIN2:.*]] = arith.addi %[[ROW2]],
-// CHECK:         %[[GEP2:.*]] = llvm.getelementptr %[[OUT]][%[[LIN2]]]
-// CHECK:         llvm.store %[[V]], %[[GEP2]]
+// ── ts.put on global %out: gated by t ≥ write_start, output index t-mask ──
+// CHECK:         %[[DOW:.*]] = arith.cmpi sge, %[[T]], %[[WSTART]] : index
+// CHECK:         scf.if %[[DOW]] {
+// CHECK:           %[[TOUT:.*]] = arith.subi %[[T]], %[[MASK]] : index
+// CHECK:           %[[NS64B:.*]] = arith.extsi %[[NS]] : i32 to i64
+// CHECK:           %[[T64:.*]] = arith.index_cast %[[TOUT]] : index to i64
+// CHECK:           %[[ROW2:.*]] = arith.muli %[[T64]], %[[NS64B]] : i64
+// CHECK:           %[[LIN2:.*]] = arith.addi %[[ROW2]],
+// CHECK:           %[[GEP2:.*]] = llvm.getelementptr %[[OUT]][%[[LIN2]]]
+// CHECK:           llvm.store %[[V]], %[[GEP2]]
 // scf.if + gpu.return: inactive threads (sid ≥ ns) skip the body and
 // arrive at gpu.return directly.
 // CHECK:       gpu.return
 kunir.func @test_copy(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
-    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} {
-  %tl = kungpu.time_length
-  %c0 = arith.constant 0 : index
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  %lb = kungpu.time_lb
+  %ub = kungpu.time_ub
   %c1 = arith.constant 1 : index
   %off = arith.constant 0 : i32
-  scf.for %t = %c0 to %tl step %c1 {
+  scf.for %t = %lb to %ub step %c1 {
     %v = kungpu.ts.get %in[%off] : !kunir.ts<f32, inf> -> f32
     kungpu.ts.put %out, %v : !kunir.ts<f32, 1>, f32
   }
@@ -145,13 +240,14 @@ kunir.func @test_copy(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
 kunir.func @test_windowed_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
-    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
   %wt = kungpu.windowed_temp : !kunir.ts<f32, 5> {kungpu.smem = false}
-  %tl = kungpu.time_length
+  %lb = kungpu.time_lb
+  %ub = kungpu.time_ub
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %off0 = arith.constant 0 : i32
-  scf.for %t = %c0 to %tl step %c1 {
+  scf.for %t = %lb to %ub step %c1 {
     %v  = kungpu.ts.get %in[%off0] : !kunir.ts<f32, inf> -> f32
     kungpu.ts.put %wt, %v : !kunir.ts<f32, 5>, f32
     %off_idx = arith.subi %t, %c0 : index
@@ -199,13 +295,13 @@ kunir.func @test_windowed_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1
 kunir.func @test_windowed_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
   %wt = kungpu.windowed_temp : !kunir.ts<f32, 5> {kungpu.smem = true}
-  %tl = kungpu.time_length
-  %c0 = arith.constant 0 : index
+  %lb = kungpu.time_lb
+  %ub = kungpu.time_ub
   %c1 = arith.constant 1 : index
   %off0 = arith.constant 0 : i32
-  scf.for %t = %c0 to %tl step %c1 {
+  scf.for %t = %lb to %ub step %c1 {
     %v  = kungpu.ts.get %in[%off0] : !kunir.ts<f32, inf> -> f32
     kungpu.ts.put %wt, %v : !kunir.ts<f32, 5>, f32
     %w  = kungpu.ts.get %wt[%off0] : !kunir.ts<f32, 5> -> f32
@@ -229,7 +325,7 @@ kunir.func @test_windowed_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>
 kunir.func @test_indexing(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
-    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
   %sid = kungpu.stock_id
   %bsc = kungpu.block_stock_count
   %sum = arith.addi %sid, %bsc : index
diff --git a/mlir/test/kungpu/memory_planning.mlir b/mlir/test/kungpu/memory_planning.mlir
index aad77d9..be04dfe 100644
--- a/mlir/test/kungpu/memory_planning.mlir
+++ b/mlir/test/kungpu/memory_planning.mlir
@@ -28,7 +28,7 @@
 kunir.func @test_all_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
-    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
   // Declared in reverse order to verify sort-by-N behaviour.
   // CHECK-DAG: kungpu.windowed_temp : <f32, 10> {kungpu.smem = true}
   %c = kungpu.windowed_temp : !kunir.ts<f32, 10>
@@ -47,7 +47,7 @@ kunir.func @test_all_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
 kunir.func @test_mixed(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
-    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
   // N=400 (51200 bytes) is declared first but sorted after N=5 (640 bytes).
   // N=5 takes 640 bytes; N=400 would need 51200 more, exceeding 48512 remaining.
   // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
@@ -65,7 +65,7 @@ kunir.func @test_mixed(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
 kunir.func @test_all_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%in = "in"}
     outputs {%out = "out"}
-    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
   // N=400 → 51200 bytes > 49152, smem=false.
   // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
   %a = kungpu.windowed_temp : !kunir.ts<f32, 400>
diff --git a/mlir/test/kunir/basic.mlir b/mlir/test/kunir/basic.mlir
index 364854c..6519831 100644
--- a/mlir/test/kunir/basic.mlir
+++ b/mlir/test/kunir/basic.mlir
@@ -13,7 +13,7 @@ kunir.func @test_ts_lookback_type(
     %c: !kunir.ts<f64, 10>)
     inputs {%a = "a", %b = "b", %c = "c"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   kunir.return %b : !kunir.ts<f32, 1>
 }
@@ -22,7 +22,7 @@ kunir.func @test_ts_lookback_type(
 kunir.func @test_binary_mismatched_lookbacks(%a: !kunir.ts<f32, 5>, %b: !kunir.ts<f32, 10>)
     inputs {%a = "a", %b = "b"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   // CHECK: kunir.add
   // CHECK-SAME: <f32, 5>, <f32, 10>
@@ -38,7 +38,7 @@ kunir.func @test_binary_mismatched_lookbacks(%a: !kunir.ts<f32, 5>, %b: !kunir.t
 kunir.func @test_unary(%x: !kunir.ts<f32, inf>)
     inputs {%x = "x"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   // CHECK: kunir.abs
   %a = kunir.abs %x : !kunir.ts<f32, inf>
@@ -51,7 +51,7 @@ kunir.func @test_unary(%x: !kunir.ts<f32, inf>)
 kunir.func @test_windowed_output(%input: !kunir.ts<f32, inf>)
     inputs {%input = "input"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 10> {
   // CHECK: kunir.windowed_output
   // CHECK-SAME: length = 10
@@ -63,7 +63,7 @@ kunir.func @test_windowed_output(%input: !kunir.ts<f32, inf>)
 kunir.func @test_for_each_back_window_single(%close: !kunir.ts<f32, 10>)
     inputs {%close = "close"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   // CHECK: kunir.for_each_back_window
   // CHECK-SAME: [window = 5]
@@ -84,7 +84,7 @@ kunir.func @test_for_each_back_window_multi_input(
     %vol:   !kunir.ts<f32, 20>)
     inputs {%close = "close", %vol = "vol"}
     outputs {"sum_close", "sum_vol"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
   // CHECK: kunir.for_each_back_window
   %sum_c, %sum_v = kunir.for_each_back_window
@@ -104,7 +104,7 @@ kunir.func @test_for_each_back_window_multi_input(
 kunir.func @test_for_each_back_window_multi_reduce(%input: !kunir.ts<f32, 20>)
     inputs {%input = "input"}
     outputs {"sum", "max"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
   %sum_ts, %max_ts = kunir.for_each_back_window
       (%input : !kunir.ts<f32, 20>) [window = 10]
@@ -123,7 +123,7 @@ kunir.func @test_for_each_back_window_multi_reduce(%input: !kunir.ts<f32, 20>)
 kunir.func @test_for_each_back_window_inf(%input: !kunir.ts<f64, inf>)
     inputs {%input = "input"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f64, 1> {
   %result = kunir.for_each_back_window
       (%input : !kunir.ts<f64, inf>) [window = 100]
@@ -139,7 +139,7 @@ kunir.func @test_for_each_back_window_inf(%input: !kunir.ts<f64, inf>)
 kunir.func @test_f64_binary(%a: !kunir.ts<f64, inf>, %b: !kunir.ts<f64, inf>)
     inputs {%a = "a", %b = "b"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f64, 1> {
   // CHECK: !kunir.ts<f64
   %result = kunir.max %a, %b : !kunir.ts<f64, inf>, !kunir.ts<f64, inf>
diff --git a/mlir/test/kunir/func.mlir b/mlir/test/kunir/func.mlir
index d93a95d..62fbce3 100644
--- a/mlir/test/kunir/func.mlir
+++ b/mlir/test/kunir/func.mlir
@@ -5,12 +5,12 @@
 // CHECK-SAME: (%[[A:.*]]: !kunir.ts<f32, inf>, %[[B:.*]]: !kunir.ts<f32, inf>)
 // CHECK:      inputs {%[[A]] = "close", %[[B]] = "vol"}
 // CHECK:      outputs {"alpha"}
-// CHECK:      target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+// CHECK:      target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
 // CHECK:      -> !kunir.ts<f32, 1>
 kunir.func @test_non_void(%close: !kunir.ts<f32, inf>, %vol: !kunir.ts<f32, inf>)
     inputs {%close = "close", %vol = "vol"}
     outputs {"alpha"}
-    target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   %sum = kunir.add %close, %vol : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
   kunir.return %sum : !kunir.ts<f32, 1>
@@ -21,12 +21,12 @@ kunir.func @test_non_void(%close: !kunir.ts<f32, inf>, %vol: !kunir.ts<f32, inf>
 // CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f32, inf>, %[[OUT:.*]]: !kunir.ts<f32, 1>)
 // CHECK:      inputs {%[[IN]] = "close"}
 // CHECK:      outputs {%[[OUT]] = "alpha"}
-// CHECK:      target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1}
+// CHECK:      target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1} unreliable_count = 0
 // CHECK-NOT:  ->
 kunir.func @test_void(%close: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
     inputs {%close = "close"}
     outputs {%out = "alpha"}
-    target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1} unreliable_count = 0 {
   kunir.return
 }
 
@@ -35,14 +35,14 @@ kunir.func @test_void(%close: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
 // CHECK-SAME: (%[[I0:.*]]: !kunir.ts<f32, inf>, %[[I1:.*]]: !kunir.ts<f32, inf>, %[[O0:.*]]: !kunir.ts<f32, 1>, %[[O1:.*]]: !kunir.ts<f32, 1>)
 // CHECK:      inputs {%[[I0]] = "close", %[[I1]] = "vol"}
 // CHECK:      outputs {%[[O0]] = "alpha1", %[[O1]] = "alpha2"}
-// CHECK:      target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1}
+// CHECK:      target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
 // CHECK-NOT:  ->
 kunir.func @test_void_multi_output(
     %close: !kunir.ts<f32, inf>, %vol: !kunir.ts<f32, inf>,
     %out1: !kunir.ts<f32, 1>, %out2: !kunir.ts<f32, 1>)
     inputs {%close = "close", %vol = "vol"}
     outputs {%out1 = "alpha1", %out2 = "alpha2"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} {
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0 {
   kunir.return
 }
 
@@ -51,7 +51,7 @@ kunir.func @test_void_multi_output(
 kunir.func @test_multi_result(%input: !kunir.ts<f64, inf>)
     inputs {%input = "input"}
     outputs {"sum", "maxval"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 16384, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 16384, vector_size = 1} unreliable_count = 0
     -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
   %w = kunir.windowed_output %input [length = 10] : !kunir.ts<f64, inf> -> !kunir.ts<f64, 10>
   %s, %m = kunir.for_each_back_window
diff --git a/mlir/test/kunir/lower_to_kungpu.mlir b/mlir/test/kunir/lower_to_kungpu.mlir
index fe6a7e9..bd8a6ad 100644
--- a/mlir/test/kunir/lower_to_kungpu.mlir
+++ b/mlir/test/kunir/lower_to_kungpu.mlir
@@ -1,6 +1,8 @@
 // RUN: %kun-opt --kunir-to-kungpu %s | %FileCheck %s
 
 // CHECK-LABEL: kunir.func @test_binary_lower
+// Pure ts args at this stage; the runtime scalars (time_length / num_stocks /
+// mask / chunk_size / warmup) are prepended later by convert-kungpu-to-llvm.
 // CHECK-SAME: !kunir.ts<f32, inf>
 // CHECK-SAME: !kunir.ts<f32, inf>
 // CHECK-SAME: !kunir.ts<f32, 1>
@@ -8,19 +10,24 @@
 kunir.func @test_binary_lower(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
     inputs {%a = "a", %b = "b"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
-  // CHECK:      %[[TL:.*]] = kungpu.time_length
+  // Outer for bounds come from the per-chunk lb/ub ops, not [0, T).
+  // Both are operandless — they pull chunk_size / warmup / time_length
+  // from gpu.func args at the kungpu-to-llvm stage.
+  // CHECK:      %[[LB:.*]] = kungpu.time_lb
+  // CHECK:      %[[UB:.*]] = kungpu.time_ub
   // CHECK:      %[[C0:.*]] = arith.constant 0 : index
   // CHECK:      %[[C1:.*]] = arith.constant 1 : index
   // outer-loop offset = 0 (i32) used by every gmem ts.get/put
   // CHECK:      %[[OFF:.*]] = arith.constant 0 : i32
-  // CHECK:      scf.for %{{.*}} = %[[C0]] to %[[TL]] step %[[C1]]
+  // CHECK:      scf.for %{{.*}} = %[[LB]] to %[[UB]] step %[[C1]]
   // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF]]]
   // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF]]]
   // CHECK:        arith.addf
   // CHECK:        kungpu.ts.put
   // CHECK-NOT:    kungpu.ts.put %{{.*}}[
+  // CHECK-NOT:    kungpu.time_length
   %sum = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
   kunir.return %sum : !kunir.ts<f32, 1>
 }
@@ -29,7 +36,7 @@ kunir.func @test_binary_lower(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
 kunir.func @test_unary_lower(%x: !kunir.ts<f32, inf>)
     inputs {%x = "x"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   // CHECK: math.absf
   %a = kunir.abs %x : !kunir.ts<f32, inf>
@@ -40,7 +47,7 @@ kunir.func @test_unary_lower(%x: !kunir.ts<f32, inf>)
 kunir.func @test_windowed_sum(%close: !kunir.ts<f32, inf>)
     inputs {%close = "close"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   // CHECK:      %[[C0:.*]] = arith.constant 0 : index
   // CHECK:      %[[C1:.*]] = arith.constant 1 : index
@@ -73,7 +80,7 @@ kunir.func @test_windowed_sum(%close: !kunir.ts<f32, inf>)
 kunir.func @test_computed_reduce(%x: !kunir.ts<f32, inf>, %y: !kunir.ts<f32, inf>)
     inputs {%x = "x", %y = "y"}
     outputs {"result"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
     -> !kunir.ts<f32, 1> {
   // CHECK:      %[[WX:.*]] = kungpu.windowed_temp : <f32, 3>
   // CHECK:      %[[WY:.*]] = kungpu.windowed_temp : <f32, 3>
@@ -101,7 +108,7 @@ kunir.func @test_computed_reduce(%x: !kunir.ts<f32, inf>, %y: !kunir.ts<f32, inf
 kunir.func @test_multi_reduce(%input: !kunir.ts<f64, inf>)
     inputs {%input = "input"}
     outputs {"sum", "maxval"}
-    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
     -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
   // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f64, 10>
   // CHECK:      scf.for %[[T:.*]] =
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index 98c8a92..ebc64e5 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -19,7 +19,7 @@
   kunir.func @test_addsum(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
       inputs {%a = "a", %b = "b"}
       outputs {"sum"}
-      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
       -> !kunir.ts<f32, 1> {
     %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
     kunir.return %s : !kunir.ts<f32, 1>
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
index c1cc1a1..9b9ec60 100644
--- a/mlir/test/python/test_multi_kernel.py
+++ b/mlir/test/python/test_multi_kernel.py
@@ -27,7 +27,7 @@
   kunir.func @add_kernel(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
       inputs {%a = "a", %b = "b"}
       outputs {"tmp"}
-      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
       -> !kunir.ts<f32, 1> {
     %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
     kunir.return %s : !kunir.ts<f32, 1>
@@ -36,7 +36,7 @@
   kunir.func @scale_kernel(%t: !kunir.ts<f32, inf>, %c: !kunir.ts<f32, inf>)
       inputs {%t = "tmp", %c = "c"}
       outputs {"out"}
-      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
       -> !kunir.ts<f32, 1> {
     %s = kunir.mul %t, %c : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
     kunir.return %s : !kunir.ts<f32, 1>
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index 75afad8..f3a4f10 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -32,7 +32,7 @@ def build_ir(N: int, warps_per_cta: int = 4, smem_size: int = 49152) -> str:
   kunir.func @sum_window(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
       inputs {{%a = "a", %b = "b"}}
       outputs {{"out"}}
-      target {{occupancy = 1, warps_per_cta = {warps_per_cta}, smem_size = {smem_size}, vector_size = 1}}
+      target {{occupancy = 1, warps_per_cta = {warps_per_cta}, smem_size = {smem_size}, vector_size = 1}} unreliable_count = 0
       -> !kunir.ts<f32, 1> {{
     %c = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
     %w = kunir.windowed_output %c [length = {N}] : !kunir.ts<f32, 1> -> !kunir.ts<f32, {N}>

From 5cc38e32e7322543b6f999e76fc2131a2b31a063 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 13 May 2026 02:42:35 -0700
Subject: [PATCH 21/59] time slice (python side)

---
 mlir/include/KunCuda/Runtime.h         |  46 ++++++-
 mlir/lib/KunCuda/Runtime.cpp           | 175 ++++++++++++++++++++-----
 mlir/lib/KunGpu/KunGpuToLLVM.cpp       |  23 ++--
 mlir/lib/KunGpu/PtxBackend.cpp         |   1 +
 mlir/lib/Python/MlirBinding.cpp        |  55 ++++++--
 mlir/test/python/test_kun_to_cuda.py   | 171 ++++++++++++++++++++----
 mlir/test/python/test_windowed_temp.py |  11 +-
 7 files changed, 403 insertions(+), 79 deletions(-)

diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index 8ff5d47..6598822 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -78,6 +78,12 @@ struct KernelMeta {
   KernelKind kind = KernelKind::Jit;         ///< picked by the MLIR pass; default is the regular path
   std::vector<std::string> inputNames;       ///< kungpu.input_names, in argv order
   std::vector<std::string> outputNames;      ///< kungpu.output_names, in argv order
+  /// Per-partition warmup depth (kungpu.unreliable_count on the gpu.func).
+  /// Drives the time-chunk grid: chunks ≥ 1 need this many extra time
+  /// steps before they can start writing reliable outputs, and the chunk
+  /// plan gates the minimum chunk size at minChunkWarmupFactor × warmup.
+  /// Always 0 for external (cs_rank) kernels — they don't multi-chunk.
+  int64_t unreliableCount = 0;
 };
 
 /// What the compiler hands the runtime: a cubin + the kernels it
@@ -172,10 +178,31 @@ class Executable {
   ///
   /// Throws std::runtime_error on validation or driver errors.  This is
   /// a low-level entry point — most users go through `Executor::runGraph`.
+  /// Multi-chunk parameters (`mask`, `minChunkWarmupFactor`,
+  /// `smFillFactor`, `numSMs`) drive the time-axis chunk grid for JIT
+  /// kernels:
+  ///   - `mask` is the user-visible prefix-skip on graph outputs.  The
+  ///     output array's time dim is `timeLength - mask`; chunk 0 begins
+  ///     writes at `t == mask`.
+  ///   - `minChunkWarmupFactor` (≥ 1) gates the minimum chunk size at
+  ///     `factor * kernel.unreliableCount`, so the warmup-overlap
+  ///     region of a non-first chunk stays ≤ `1 / factor` of total
+  ///     compute.
+  ///   - `smFillFactor` is the target `num_chunks * num_stock_tiles /
+  ///     numSMs`.  1.0 just fills the GPU; > 1 leaves slack for scheduler
+  ///     latency hiding; ≤ 0 selects single-chunk mode.
+  ///   - `numSMs` is queried by `Executor` once at construction; pass 0
+  ///     to opt out of the smFillFactor heuristic (single-chunk mode).
+  /// External (cs_rank) kernels ignore these — they keep their original
+  /// `(time_length, num_stocks, ptrs...)` argv and time-major grid.
   void launchOnStream(int64_t timeLength, int64_t numStocks,
                        const std::vector<std::pair<std::string, uintptr_t>> &args,
                        CUstream stream,
-                       int devMaxSmemBytes);
+                       int devMaxSmemBytes,
+                       int64_t mask = 0,
+                       int minChunkWarmupFactor = 4,
+                       double smFillFactor = 1.5,
+                       int numSMs = 0);
 
 private:
   /// Allocate (or re-allocate, if shape changed) the intermediate slot
@@ -237,9 +264,19 @@ class Executor {
   /// Queue all kernels in `exe` on this executor's stream.  Async — does
   /// not synchronize.  Throws std::runtime_error on validation / driver
   /// errors.
+  ///
+  /// `mask` skips the first `mask` time rows of every output (output
+  /// time dim = `timeLength - mask`).  `minChunkWarmupFactor` and
+  /// `smFillFactor` shape the multi-chunk grid heuristic — see
+  /// `Executable::launchOnStream` for the meaning.  Defaults are tuned
+  /// to "fill the GPU with mild scheduler slack" while keeping warmup
+  /// overhead ≤ ~25%.
   void runGraph(Executable &exe,
                 int64_t timeLength, int64_t numStocks,
-                const std::vector<std::pair<std::string, uintptr_t>> &args);
+                const std::vector<std::pair<std::string, uintptr_t>> &args,
+                int64_t mask = 0,
+                int minChunkWarmupFactor = 4,
+                double smFillFactor = 1.5);
 
   /// Block until all queued work on this stream completes.
   void synchronize();
@@ -251,10 +288,15 @@ class Executor {
   /// Used to validate cs_rank dynamic-smem requests at launch time
   /// without a per-launch driver call.
   int devMaxSmemBytes() const noexcept { return devMaxSmemBytes_; }
+  /// Cached MULTIPROCESSOR_COUNT of the device this Executor's CUcontext
+  /// is bound to.  Used by `runGraph` for the chunk-grid heuristic
+  /// (target num_chunks × stock_tiles ≈ smFillFactor × numSMs).
+  int numSMs() const noexcept { return numSMs_; }
 
 private:
   CUstream stream_ = nullptr;
   int devMaxSmemBytes_ = 0;
+  int numSMs_ = 0;
 };
 
 } // namespace kun_cuda
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index fdffa05..e688e07 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -21,6 +21,8 @@
 
 #include <cuda.h>
 
+#include <algorithm>
+#include <cmath>
 #include <limits>
 #include <sstream>
 #include <stdexcept>
@@ -405,11 +407,13 @@ static std::vector<uintptr_t> resolveBufferPointers(
   return bufPtrs;
 }
 
-/// Stock-major launch: block_x = warps_per_cta*32, grid_x =
-/// ceil(numStocks / (block_x * vector_size)), no dynamic smem.
+/// Stock-major × time-chunk launch: block_x = warps_per_cta*32,
+/// grid_x = ceil(numStocks / (block_x * vector_size)),
+/// grid_y = numChunks, no dynamic smem.
 static void launchJitKernel(CUfunction fn,
                               int64_t numStocks,
                               int64_t warpsPerCta, int64_t vectorSize,
+                              unsigned numChunks,
                               void **args, CUstream stream) {
   unsigned blockX = static_cast<unsigned>(warpsPerCta * 32);
   uint64_t stocksPerBlock =
@@ -420,11 +424,89 @@ static void launchJitKernel(CUfunction fn,
   // sharedMemBytes = 0 — JIT'd kernels declare static smem via
   // llvm.mlir.global addr_space=3; the dynamic-smem launch parameter
   // does not apply.
-  checkCu(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1,
+  checkCu(cuLaunchKernel(fn, gridX, numChunks, 1, blockX, 1, 1,
                            /*sharedMemBytes=*/0, stream, args, nullptr),
            "cuLaunchKernel");
 }
 
+/// Chunk plan for a single JIT kernel.  `chunkSize` is the time-axis
+/// width of every chunk (last chunk gets clipped to `timeLength` by
+/// kungpu.time_ub at runtime, so we don't have to special-case that
+/// here).  `numChunks` is the y-dim of the launch grid.
+///
+/// Decision tree (per kernel, since per-partition `unreliableCount`
+/// varies):
+///
+///   1. target chunks   = ceil(smFillFactor * numSMs / stockTiles), ≥ 1
+///   2. cap by warmup   = floor(T / (factor * unreliableCount))
+///        — bounds the per-chunk overhead of chunks ≥ 1, which redo the
+///          trailing `unreliableCount` time steps to prime windowed
+///          rolling state.  mask is NOT included here: it's a one-time
+///          chunk-0 skip, not a per-chunk overhead.
+///   3. cap by mask     = floor((T - 1) / mask)
+///        — chunks ≥ 1 write output[t - mask] for t ∈ [cy*chunk_size, …);
+///          if chunk_size ≤ mask, chunk 1's first output index is
+///          negative (out-of-bounds gmem write).  Enforce chunk_size >
+///          mask by capping num_chunks here.
+///   4. numChunks       = clamp(target, 1, min(cap_grid, cap_warmup, cap_mask))
+///   5. chunkSize       = ceil(T / numChunks)
+///
+/// cap_grid = min(T, 65535): ≥ 1 time step per chunk, and CUDA's gridDim.y
+/// limit.  When numSMs ≤ 0 (Executor couldn't query the device) or
+/// smFillFactor is ≤ 0 / NaN / infinite, fall back to single-chunk.
+struct ChunkPlan {
+  int64_t chunkSize;
+  unsigned numChunks;
+};
+static ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
+                                     int64_t warpsPerCta, int64_t vectorSize,
+                                     int64_t unreliableCount, int64_t mask,
+                                     int minChunkWarmupFactor,
+                                     double smFillFactor, int numSMs) {
+  if (timeLength <= 0)
+    return {timeLength, 1u};
+  if (numSMs <= 0 || !std::isfinite(smFillFactor) || smFillFactor <= 0.0)
+    return {timeLength, 1u};
+
+  int64_t blockX = warpsPerCta * 32;
+  int64_t stocksPerBlock = blockX * vectorSize;
+  int64_t stockTiles =
+      (numStocks + stocksPerBlock - 1) / stocksPerBlock;
+  if (stockTiles <= 0) stockTiles = 1;
+
+  // CUDA caps gridDim.y at 65535; numChunks is the launch grid's y-dim.
+  constexpr int64_t kMaxGridDimY = 65535;
+  // Caps on numChunks.  Start at min(T, gridDim.y limit) — ≥ 1 step per
+  // chunk and a launchable grid — then tighten; clamp to ≥ 1 at the end.
+  int64_t cap = std::min<int64_t>(timeLength, kMaxGridDimY);
+
+  // Per-chunk warmup overhead bound (chunks ≥ 1 only).
+  if (unreliableCount > 0 && minChunkWarmupFactor > 0)
+    cap = std::min<int64_t>(
+        cap,
+        timeLength /
+            (static_cast<int64_t>(minChunkWarmupFactor) * unreliableCount));
+
+  // chunkSize > mask: chunks ≥ 1 compute output index t - mask, which
+  // must be ≥ 0 for their writes.  chunk_size = ceil(T / numChunks);
+  // we want ceil(T / numChunks) > mask, equivalently numChunks ≤
+  // (T - 1) / mask.
+  if (mask > 0)
+    cap = std::min<int64_t>(cap, (timeLength - 1) / mask);
+
+  if (cap < 1) cap = 1;
+
+  // Target chunks just to fill the GPU.  Round up so we don't under-fill;
+  // clamp in double first so a huge smFillFactor can't overflow the cast.
+  double wanted = std::ceil(smFillFactor * static_cast<double>(numSMs) /
+                            static_cast<double>(stockTiles));
+  int64_t numChunks = static_cast<int64_t>(
+      std::min(std::max(wanted, 1.0), static_cast<double>(cap)));
+
+  int64_t chunkSize = (timeLength + numChunks - 1) / numChunks;
+  return {chunkSize, static_cast<unsigned>(numChunks)};
+}
+
 /// External cs_rank launch: block_x = warps_per_cta*32, grid_x =
 /// time_length (one CTA per timestep), sharedMemBytes = numStocks *
 /// sizeof(T).  Checks the request against the cached device cap so
@@ -711,14 +793,23 @@ void Executable::launchOnStream(
     int64_t timeLength, int64_t numStocks,
     const std::vector<std::pair<std::string, uintptr_t>> &args,
     CUstream stream,
-    int devMaxSmemBytes) {
-  // ── Shape sanity (kernel signature is i32, i32) ──────────────────
+    int devMaxSmemBytes,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor,
+    int numSMs) {
+  // ── Shape sanity (kernel signature is i32 across the board) ─────
   if (timeLength > std::numeric_limits<int32_t>::max() ||
       numStocks  > std::numeric_limits<int32_t>::max() ||
       timeLength < 0 || numStocks < 0)
     throw std::runtime_error(
         "kun_cuda::launchOnStream: time_length / num_stocks out of i32 "
         "range (kernel signature uses i32, i32)");
+  if (mask < 0 || (timeLength > 0 && mask >= timeLength))
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: mask must be in [0, time_length), got "
+        + std::to_string(mask) + " for time_length="
+        + std::to_string(timeLength));
   if (data_.warpsPerCta <= 0)
     throw std::runtime_error(
         "kun_cuda::launchOnStream: warps_per_cta is " +
@@ -731,33 +822,55 @@ void Executable::launchOnStream(
   const std::vector<uintptr_t> bufPtrs =
       resolveBufferPointers(*plan_, data_, args, slotBufs_);
 
-  // ── Launch each kernel in topo order on `stream`.  Async — the
-  //    caller (Executor) waits via cuStreamSynchronize. ─────────────
+  // ── Per-launch i32 scalars.  time_length / num_stocks / mask are
+  //    shared across every kernel; chunk_size / warmup vary per kernel
+  //    (chunk_size is derived from per-kernel unreliableCount). ──────
   int32_t timeLenI32   = static_cast<int32_t>(timeLength);
   int32_t numStocksI32 = static_cast<int32_t>(numStocks);
+  int32_t maskI32      = static_cast<int32_t>(mask);
 
   for (int kIdx : plan_->launchOrder) {
     const auto &ins  = plan_->kernelInputBufs[kIdx];
     const auto &outs = plan_->kernelOutputBufs[kIdx];
+    const auto &meta = data_.kernels[kIdx];
 
-    // Build the argv: (i32 T, i32 S, ins..., outs...) — same shape
-    // for Jit and external kernels.
     std::vector<CUdeviceptr> ptrs;
     ptrs.reserve(ins.size() + outs.size());
     for (int b : ins)  ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
     for (int b : outs) ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
-    std::vector<void *> argPtrs;
-    argPtrs.reserve(2 + ptrs.size());
-    argPtrs.push_back(&timeLenI32);
-    argPtrs.push_back(&numStocksI32);
-    for (auto &p : ptrs) argPtrs.push_back(&p);
 
-    const auto &meta = data_.kernels[kIdx];
     if (meta.kind == KernelKind::Jit) {
+      // JIT argv: (i32 T, i32 S, i32 mask, i32 chunk_size, i32 warmup,
+      //            ptrs...).  Chunk plan is per-kernel because each
+      //            kernel has its own unreliableCount.
+      ChunkPlan plan = computeChunkPlan(
+          timeLength, numStocks, data_.warpsPerCta, data_.vectorSize,
+          meta.unreliableCount, mask, minChunkWarmupFactor,
+          smFillFactor, numSMs);
+      int32_t chunkSizeI32 = static_cast<int32_t>(plan.chunkSize);
+      int32_t warmupI32    = static_cast<int32_t>(meta.unreliableCount);
+
+      std::vector<void *> argPtrs;
+      argPtrs.reserve(5 + ptrs.size());
+      argPtrs.push_back(&timeLenI32);
+      argPtrs.push_back(&numStocksI32);
+      argPtrs.push_back(&maskI32);
+      argPtrs.push_back(&chunkSizeI32);
+      argPtrs.push_back(&warmupI32);
+      for (auto &p : ptrs) argPtrs.push_back(&p);
+
       launchJitKernel(cuFuncs_[kIdx], numStocks,
                        data_.warpsPerCta, data_.vectorSize,
-                       argPtrs.data(), stream);
+                       plan.numChunks, argPtrs.data(), stream);
     } else {
+      // External cs_rank argv unchanged: (i32 T, i32 S, ptrs...).  These
+      // kernels are cross-sectional, time-major, and don't multi-chunk
+      // along time — the mask / chunk_size / warmup scalars don't apply.
+      std::vector<void *> argPtrs;
+      argPtrs.reserve(2 + ptrs.size());
+      argPtrs.push_back(&timeLenI32);
+      argPtrs.push_back(&numStocksI32);
+      for (auto &p : ptrs) argPtrs.push_back(&p);
       launchExtCsRankKernel(cuFuncs_[kIdx], meta.kind, meta.kernelName,
                               timeLength, numStocks, data_.warpsPerCta,
                               devMaxSmemBytes, argPtrs.data(), stream);
@@ -770,35 +883,37 @@ void Executable::launchOnStream(
 //===----------------------------------------------------------------------===//
 
 namespace {
-/// Query the current CUcontext's device for
-/// MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.  Returns 0 if no context is
-/// current — the Executor accepts that and the launch path will only
-/// trip the check if the executable actually has external cs_rank
-/// kernels (in which case the user must have a context anyway).
-int queryDevMaxSmemBytes() {
+/// Query one integer attribute of the current CUcontext's device.  Returns 0
+/// when no context is current or a driver call fails (0 means "unknown").
+int queryDevAttr(CUdevice_attribute attr) {
   CUcontext cur = nullptr;
   if (cuCtxGetCurrent(&cur) != CUDA_SUCCESS || !cur) return 0;
   CUdevice dev = 0;
   if (cuCtxGetDevice(&dev) != CUDA_SUCCESS) return 0;
   int v = 0;
-  if (cuDeviceGetAttribute(
-          &v, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev)
-      != CUDA_SUCCESS)
-    return 0;
+  if (cuDeviceGetAttribute(&v, attr, dev) != CUDA_SUCCESS) return 0;
   return v;
 }
 } // namespace
 
 Executor::Executor()
-    : stream_(nullptr), devMaxSmemBytes_(queryDevMaxSmemBytes()) {}
+    : stream_(nullptr),
+      devMaxSmemBytes_(
+          queryDevAttr(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)),
+      numSMs_(queryDevAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) {}
 Executor::Executor(CUstream stream)
-    : stream_(stream), devMaxSmemBytes_(queryDevMaxSmemBytes()) {}
+    : stream_(stream),
+      devMaxSmemBytes_(
+          queryDevAttr(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)),
+      numSMs_(queryDevAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) {}
 Executor::~Executor() = default;
 
 void Executor::runGraph(
     Executable &exe, int64_t timeLength, int64_t numStocks,
-    const std::vector<std::pair<std::string, uintptr_t>> &args) {
-  exe.launchOnStream(timeLength, numStocks, args, stream_, devMaxSmemBytes_);
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    int64_t mask, int minChunkWarmupFactor, double smFillFactor) {
+  exe.launchOnStream(timeLength, numStocks, args, stream_, devMaxSmemBytes_,
+                      mask, minChunkWarmupFactor, smFillFactor, numSMs_);
 }
 
 void Executor::synchronize() {
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 24ed311..94f84bc 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -685,7 +685,7 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
 // Algorithm — direct port of cpp/Kun/Ops.hpp::FastWindowedSum::step:
 //
 //   cur = input[t]                                                 ts.get  off=0
-//   old = (t >= window) ? input[t - window] : NaN                  ts.get  off=window  (guarded)
+//   old = (t - loop_lb >= window) ? input[t - window] : NaN        ts.get  off=window  (guarded)
 //   old_is_nan = isnan(old)
 //   new_is_nan = isnan(cur)
 //   v = old_is_nan ? v : kahanAdd(v, -old, &compSub)               // subtract old
@@ -693,10 +693,10 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
 //   numNans += (new_is_nan ? 1 : 0) - (old_is_nan ? 1 : 0)
 //   out = (numNans == 0) ? v : NaN
 //
-// The `t >= window` guard on `old` matches CPU's
-// `windowedRef`/`getWindow` which return NaN for index < window.
-// Without it, a function-arg gmem load at offset > t can fall before the
-// allocation start and segfault on some drivers.
+// Guard uses `t - loop_lb`, not bare `t`: state is per-CTA alloca
+// (zero-init) so each chunk needs its own N-step warmup with old=NaN
+// to build v up.  Chunk 0 has loop_lb = 0 so the guard collapses to
+// CPU's `t >= window`.
 //===----------------------------------------------------------------------===//
 
 struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
@@ -757,13 +757,16 @@ struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
         loc, i32Ty, rewriter.getI32IntegerAttr(window));
     Value cur = rewriter.create<TsGetOp>(loc, floatTy, origInput, zeroOff);
 
-    Value timeIdx = getCurrentTimeIdx(op);
-    if (!timeIdx)
+    auto forOp = op->getParentOfType<scf::ForOp>();
+    if (!forOp)
       return rewriter.notifyMatchFailure(
           op, "fast_windowed_sum must be inside a scf.for time loop");
-    Value windowIdx  = rewriter.create<arith::ConstantIndexOp>(loc, window);
-    Value tGeWindow  = rewriter.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::sge, timeIdx, windowIdx);
+    Value timeIdx   = forOp.getInductionVar();
+    Value loopLb    = forOp.getLowerBound();
+    Value localT    = rewriter.create<arith::SubIOp>(loc, timeIdx, loopLb);
+    Value windowIdx = rewriter.create<arith::ConstantIndexOp>(loc, window);
+    Value tGeWindow = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::sge, localT, windowIdx);
 
     auto ifOp = rewriter.create<scf::IfOp>(
         loc, TypeRange{floatTy}, tGeWindow, /*withElseRegion=*/true);
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
index 91632b8..f8c4449 100644
--- a/mlir/lib/KunGpu/PtxBackend.cpp
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -167,6 +167,7 @@ LogicalResult compileKunIrToExecutable(ModuleOp module,
     if (auto outNames = getFuncOutputNames(f))
       for (auto a : outNames)
         km.outputNames.push_back(llvm::cast<StringAttr>(a).str());
+    km.unreliableCount = getFuncUnreliableCount(f);
 
     int64_t w = 1, v = 1;
     if (auto ts = getFuncTargetSpec(f)) {
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index edb9ca4..172fd50 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -194,11 +194,13 @@ struct CollectedArgs {
 
 static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
                                    nb::dict pyArgs,
-                                   const nb::object &streamArg) {
+                                   const nb::object &streamArg,
+                                   int64_t mask) {
   // Graph inputs come first, then outputs — same as the buffer-table
   // layout the runtime expects.
+  const size_t numInputs = exe.graphInputs().size();
   std::vector<std::string> ordered;
-  ordered.reserve(exe.graphInputs().size() + exe.graphOutputs().size());
+  ordered.reserve(numInputs + exe.graphOutputs().size());
   for (auto &n : exe.graphInputs())  ordered.push_back(n);
   for (auto &n : exe.graphOutputs()) ordered.push_back(n);
   if (ordered.empty())
@@ -229,9 +231,14 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
     }
   }
 
-  bool first = true;
+  // We need the input time length before validating any output (output
+  // time dim = input time dim − mask).  Walk inputs first to lock it
+  // in, then outputs.
+  out.timeLength = -1;
+  out.numStocks  = -1;
   for (size_t i = 0; i < ordered.size(); ++i) {
     const std::string &name = ordered[i];
+    bool isOutput = i >= numInputs;
 
     nb::object key = nb::str(name.c_str());
     if (!pyArgs.contains(key)) {
@@ -244,17 +251,24 @@ static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
                                 "' (kernel expects: " + expected + ")");
     }
     CudaArrayInfo info = readDLPack(pyArgs[key], name, streamArg);
-    if (first) {
+    int64_t expectT = isOutput ? (out.timeLength - mask) : out.timeLength;
+
+    if (out.timeLength < 0) {
+      // First arg is always an input (numInputs ≥ 1 since the kernel
+      // graph requires at least one input).  Lock in the launch shape.
       out.timeLength = info.timeLength;
       out.numStocks  = info.numStocks;
-      first = false;
-    } else if (info.timeLength != out.timeLength ||
+    } else if (info.timeLength != expectT ||
                  info.numStocks  != out.numStocks) {
       std::stringstream ss;
-      ss << "launch: shape mismatch on '" << name << "': expected ("
-         << out.timeLength << ", " << out.numStocks
-         << ") matching the first array, got ("
-         << info.timeLength << ", " << info.numStocks << ")";
+      ss << "launch: shape mismatch on '" << name
+         << "' (" << (isOutput ? "output" : "input") << "): expected ("
+         << expectT << ", " << out.numStocks
+         << "), got (" << info.timeLength << ", "
+         << info.numStocks << ")";
+      if (isOutput && mask > 0)
+        ss << " — output time dim must equal input time dim ("
+           << out.timeLength << ") minus mask (" << mask << ")";
       throw std::runtime_error(ss.str());
     }
     out.args.emplace_back(name, info.ptr);
@@ -449,16 +463,21 @@ NB_MODULE(KunMLIR, m) {
           "Raw stream handle as an int (0 ↔ CUDA default stream).")
       .def("runGraph",
           [](kun_cuda::Executor &e, kun_cuda::Executable &exe,
-              nb::dict pyArgs) {
+              nb::dict pyArgs, int64_t mask,
+              int minChunkWarmupFactor, double smFillFactor) {
             // Thread the executor's stream into __dlpack__(stream=…)
             // so producers (CuPy / PyTorch / JAX / TF) can insert the
             // cross-stream sync needed for data-readiness on our
             // launch stream.
             nb::object streamArg = dlpackStreamArg(e.stream());
-            auto c = collectArgs(exe, pyArgs, streamArg);
-            e.runGraph(exe, c.timeLength, c.numStocks, c.args);
+            auto c = collectArgs(exe, pyArgs, streamArg, mask);
+            e.runGraph(exe, c.timeLength, c.numStocks, c.args,
+                        mask, minChunkWarmupFactor, smFillFactor);
           },
           nb::arg("exe"), nb::arg("args"),
+          nb::arg("mask") = 0,
+          nb::arg("min_chunk_warmup_factor") = 4,
+          nb::arg("sm_fill_factor") = 1.5,
           "Queue every kernel in `exe` onto this executor's stream.\n"
           "**Asynchronous** — call `.synchronize()` (or otherwise wait\n"
           "on the stream) before reading results back to host.\n"
@@ -468,6 +487,16 @@ NB_MODULE(KunMLIR, m) {
           "float32, 2-D, shape `(time_length, num_stocks)` (TS layout), "
           "and reside on the GPU.\n"
           "\n"
+          "`mask` is the prefix-skip on graph outputs: chunk 0 starts "
+          "writing at time index `mask`, so the output array's time "
+          "dim is `time_length - mask`.  Default 0 (no skip).\n"
+          "`min_chunk_warmup_factor` is the lower bound on "
+          "`chunk_size / warmup` — keeps warmup-overlap overhead below "
+          "`1 / factor` of total compute.  Default 4 (≤ 25% overhead).\n"
+          "`sm_fill_factor` is the target `num_chunks * stock_tiles / "
+          "numSMs`.  1.0 just fills the GPU; > 1 leaves scheduler "
+          "slack.  Default 1.5.\n"
+          "\n"
           "Named to match the CPU executor API "
           "(`KunRunner.runGraph(executor, mod, ...)`).")
       .def("synchronize", &kun_cuda::Executor::synchronize,
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index cfef71c..c37f565 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -105,6 +105,31 @@ def build_func_multipartition() -> Function:
     return Function(builder.ops, name="multi")
 
 
+def _compare_post_warmup(out_h: np.ndarray, expected: np.ndarray,
+                            valid_start: int, atol: float) -> int:
+    """Validate kernel output against the reference on rows
+    `[valid_start:]`.  Fails loudly on **any** NaN in the kernel
+    output past the warmup region — the naive `np.abs(NaN-x).max() >
+    atol` form silently returns False because NaN comparisons are
+    False, which would let a multi-chunk regression slip through.
+    """
+    tail = out_h[valid_start:]
+    if np.isnan(tail).any():
+        nrows = int(np.unique(np.where(np.isnan(tail))[0]).size)
+        print(f"  FAIL — {nrows} of {tail.shape[0]} validated rows "
+               f"contain NaN past row {valid_start}", file=sys.stderr)
+        return 1
+    diff = np.abs(tail - expected[valid_start:])
+    max_abs = float(diff.max())
+    if max_abs > atol:
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e} at "
+               f"row {valid_start + idx[0]}, col {idx[1]}", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e})")
+    return 0
+
+
 def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
               atol: float = 1e-5) -> int:
     """Compile a Function, launch it, validate against numpy."""
@@ -226,16 +251,8 @@ def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
     if T > N:
         expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
 
-    diff = np.abs(out_h[N - 1:] - expected[N - 1:])
-    max_abs = float(diff.max())
-    atol = max(1e-3, 5e-7 * N)
-    if max_abs > atol:
-        idx = np.unravel_index(diff.argmax(), diff.shape)
-        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e} at {idx}",
-                file=sys.stderr)
-        return 1
-    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e})")
-    return 0
+    return _compare_post_warmup(out_h, expected, valid_start=N - 1,
+                                  atol=max(1e-3, 5e-7 * N))
 
 
 def run_multipartition(target: str, T: int, S: int) -> int:
@@ -327,23 +344,114 @@ def run_windowed(target: str, T: int, S: int, N: int) -> int:
     if T > N:
         expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
 
-    diff = np.abs(out_h[N - 1:] - expected[N - 1:])
-    max_abs = float(diff.max())
-    atol = max(1e-3, 5e-7 * N)
-    if max_abs > atol:
-        idx = np.unravel_index(diff.argmax(), diff.shape)
-        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e} at {idx}",
-                file=sys.stderr)
-        return 1
-    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e})")
-    return 0
+    return _compare_post_warmup(out_h, expected, valid_start=N - 1,
+                                  atol=max(1e-3, 5e-7 * N))
+
+
+def run_backref_with_mask(target: str, T: int, S: int, N: int,
+                              mask: int) -> int:
+    """Same BackRef(a+b, N) graph as `run_backref`, but driven with a
+    non-zero `mask`.  Picked over `WindowedSum` for the mask test because
+    BackRef is stateless along the time axis (each output is a gmem
+    load at offset -N), so this case isolates the mask/warmup
+    interaction from any rolling-state concerns.  The windowed sum
+    counterpart below covers the stateful path.
+    """
+    print(f"=== backref + mask: out = (a+b)[t - {N}], mask={mask} ===")
+    assert 0 < mask < T, "test requires 0 < mask < T"
+    f = build_func_backref(N)
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(4)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    # Output time dim shrinks by mask.
+    out = cp.zeros((T - mask, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h),
+                              "b": cp.asarray(b_h), "out": out},
+                       mask=mask)
+    out_h = cp.asnumpy(out)
+
+    # Reference: out_full[t] = (a+b)[t-N] for t ≥ N; undefined for t < N.
+    # With mask, out_full[mask + i] lands at out_h[i].  Reliable when
+    # mask + i ≥ N, i.e., i ≥ max(0, N - mask).
+    c = a_h + b_h
+    valid_start = max(0, N - mask)
+    # Build a full-(T-mask) expected so _compare_post_warmup can validate
+    # the post-warmup tail uniformly (matches the windowed test below).
+    expected = np.empty((T - mask, S), dtype=np.float32)
+    expected[:valid_start] = np.nan
+    if valid_start < T - mask:
+        in_time = np.arange(mask + valid_start, T)
+        expected[valid_start:] = c[in_time - N]
+    return _compare_post_warmup(out_h, expected,
+                                  valid_start=valid_start, atol=1e-5)
+
+
+def run_windowed_with_mask(target: str, T: int, S: int, N: int,
+                              mask: int) -> int:
+    """`WindowedSum(a + b, N)` driven with mask — same graph as
+    `run_windowed`, but exercises the stateful `fast_windowed_sum`
+    lowering across multi-chunk + mask.  After the chunk-local guard
+    fix (`t - loop_lb ≥ window`), each chunk's per-CTA state primes
+    correctly through its warmup overlap and the post-warmup tail
+    matches the CPU reference at float-precision noise.
+    """
+    print(f"=== windowed + mask: ws = WindowedSum(a + b, N={N}), "
+           f"mask={mask} ===")
+    assert 0 < mask < T, "test requires 0 < mask < T"
+    f = build_func_windowed(N)
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(5)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T - mask, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h),
+                              "b": cp.asarray(b_h), "ws": out},
+                       mask=mask)
+    out_h = cp.asnumpy(out)
+
+    # Full-T reference, then slice from `mask` onward to align with
+    # the output's input-time origin.  Output row i = input time i+mask;
+    # reliable when i + mask ≥ N - 1.
+    c = a_h + b_h
+    cumsum = np.cumsum(c, axis=0, dtype=np.float64)
+    expected_full = np.empty((T, S), dtype=np.float32)
+    expected_full[:N - 1] = np.nan
+    expected_full[N - 1] = cumsum[N - 1]
+    if T > N:
+        expected_full[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+    expected = expected_full[mask:]
+    valid_start = max(0, N - 1 - mask)
+    return _compare_post_warmup(out_h, expected,
+                                  valid_start=valid_start,
+                                  atol=max(1e-3, 5e-7 * N))
 
 
 def main() -> int:
     ap = argparse.ArgumentParser()
     ap.add_argument("--target", default="sm_120")
-    ap.add_argument("-T", "--time-length", type=int, default=64)
-    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    # Defaults sized to comfortably trigger multi-chunk: T=128 with
+    # warmup=5 (N) gives `cap_warmup = 128/(4*5) = 6` chunks; S=1024
+    # gives `stock_tiles = 1024/(4*32) = 8`, so even on a small GPU
+    # the sm-fill target ≥ 2 — well inside the multi-chunk regime.
+    ap.add_argument("-T", "--time-length", type=int, default=128)
+    ap.add_argument("-S", "--num-stocks", type=int, default=1024)
     ap.add_argument("-N", "--window", type=int, default=5)
     args = ap.parse_args()
 
@@ -360,9 +468,28 @@ def main() -> int:
     print()
     rc |= run_backref(args.target, args.time_length, args.num_stocks, args.window)
     print()
+    # Mask smaller than the window, so the post-mask output still
+    # contains unreliable rows — exercises both warmup overlap (chunks
+    # ≥ 1 prime by reading back `unreliable_count` steps) AND the
+    # mask-skip-vs-warmup-skip distinction on chunk 0.  Two graphs:
+    # stateless BackRef and stateful WindowedSum / fast_windowed_sum.
+    rc |= run_backref_with_mask(args.target, args.time_length, args.num_stocks,
+                                  args.window, mask=3)
+    print()
+    rc |= run_windowed_with_mask(args.target, args.time_length, args.num_stocks,
+                                    args.window, mask=3)
+    print()
     rc |= run_fastwindowedsum(args.target, args.time_length, args.num_stocks,
                                 args.window)
     print()
+    # Single-chunk fallback corner case: warmup so large relative to T
+    # that `cap_warmup = T/(K*N) = 64/(4*20) = 0` clamps num_chunks to 1.
+    # Exercises the multi-chunk kernel binary in its degenerate
+    # grid_y=1 launch configuration — guards against regressions in
+    # time_lb / time_ub / write-gating when `chunk_size = T`.
+    rc |= run_windowed_with_mask(args.target, T=64, S=args.num_stocks,
+                                    N=20, mask=1)
+    print()
     rc |= run_multipartition(args.target, args.time_length, args.num_stocks)
     return rc
 
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index f3a4f10..9ccaef0 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -26,13 +26,20 @@
 
 
 def build_ir(N: int, warps_per_cta: int = 4, smem_size: int = 49152) -> str:
-    """A minimal kunir program that computes a rolling sum of (a + b)."""
+    """A minimal kunir program that computes a rolling sum of (a + b).
+
+    `unreliable_count = N` mirrors KunQuant's `infer_window` policy of
+    summing op windows along the chain: a single window-N reduction
+    contributes N.  The runtime uses this to back up `warmup` time
+    steps when launching chunks ≥ 1 so their rolling state is fully
+    primed before they start writing reliable outputs.
+    """
     return textwrap.dedent(f"""
 gpu.module @kungpu_kernels {{
   kunir.func @sum_window(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
       inputs {{%a = "a", %b = "b"}}
       outputs {{"out"}}
-      target {{occupancy = 1, warps_per_cta = {warps_per_cta}, smem_size = {smem_size}, vector_size = 1}} unreliable_count = 0
+      target {{occupancy = 1, warps_per_cta = {warps_per_cta}, smem_size = {smem_size}, vector_size = 1}} unreliable_count = {N}
       -> !kunir.ts<f32, 1> {{
     %c = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
     %w = kunir.windowed_output %c [length = {N}] : !kunir.ts<f32, 1> -> !kunir.ts<f32, {N}>

From 3203b112a664830e98758823106029ed6300ae6f Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 13 May 2026 19:03:14 -0700
Subject: [PATCH 22/59] time slice on rank

---
 mlir/include/KunCuda/Runtime.h      |  38 ++++++----
 mlir/lib/KunCuda/Runtime.cpp        |  74 +++++++++++++++----
 mlir/lib/KunCuda/kernels/cs_rank.cu | 111 +++++++++++++++-------------
 3 files changed, 141 insertions(+), 82 deletions(-)

diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index 6598822..c38ae78 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -53,6 +53,10 @@ namespace kun_cuda {
 /// producer maps, etc.  Fully defined in Runtime.cpp.
 struct GraphPlan;
 
+/// Forward-declared so `Executable::launchOnStream` can take an
+/// `Executor *` argument; the full definition lives below.
+class Executor;
+
 //===----------------------------------------------------------------------===//
 // Compile-time output (all names — runtime resolves them to indices)
 //===----------------------------------------------------------------------===//
@@ -93,7 +97,12 @@ struct KernelMeta {
 /// as an intermediate.
 struct ExecutableData {
   std::vector<char> cubin;
-  int64_t warpsPerCta = 1;          ///< from kungpu.target_spec (graph-wide)
+  int64_t warpsPerCta = 1;          ///< from kungpu.target_spec (graph-wide).
+                                     ///<   Drives JIT kernels' block_x.
+                                     ///<   External cs_rank kernels IGNORE
+                                     ///<   this — they auto-tune block_x
+                                     ///<   from numStocks (see
+                                     ///<   launchExtCsRankKernel).
   int64_t vectorSize  = 1;          ///< from kungpu.target_spec (graph-wide)
   std::vector<KernelMeta> kernels;  ///< unordered set; runtime topo-sorts
   std::vector<std::string> graphInputs;
@@ -179,8 +188,7 @@ class Executable {
   /// Throws std::runtime_error on validation or driver errors.  This is
   /// a low-level entry point — most users go through `Executor::runGraph`.
   /// Multi-chunk parameters (`mask`, `minChunkWarmupFactor`,
-  /// `smFillFactor`, `numSMs`) drive the time-axis chunk grid for JIT
-  /// kernels:
+  /// `smFillFactor`) drive the time-axis chunk grid for JIT kernels:
   ///   - `mask` is the user-visible prefix-skip on graph outputs.  The
   ///     output array's time dim is `timeLength - mask`; chunk 0 begins
   ///     writes at `t == mask`.
@@ -188,21 +196,21 @@ class Executable {
   ///     `factor * kernel.unreliableCount`, so the warmup-overlap
   ///     region of a non-first chunk stays ≤ `1 / factor` of total
   ///     compute.
-  ///   - `smFillFactor` (≥ 0) is the target `num_chunks * num_stock_tiles
-  ///     / numSMs`.  1.0 just fills the GPU; > 1 leaves slack for
-  ///     scheduler latency hiding.
-  ///   - `numSMs` is queried by `Executor` once at construction; pass 0
-  ///     to opt out of the smFillFactor heuristic (single-chunk mode).
-  /// External (cs_rank) kernels ignore these — they keep their original
-  /// `(time_length, num_stocks, ptrs...)` argv and time-major grid.
-  void launchOnStream(int64_t timeLength, int64_t numStocks,
+  ///   - `smFillFactor` (≥ 0) is the target chunks-on-GPU multiplier:
+  ///     JIT uses `num_chunks * stock_tiles ≥ smFillFactor * numSMs`;
+  ///     cs_rank uses `num_time_chunks ≥ smFillFactor * numSMs`.  1.0
+  ///     just fills the GPU; > 1 leaves slack for scheduler latency
+  ///     hiding.
+  /// `exec` owns the CUDA stream + the cached device attributes
+  /// (`devMaxSmemBytes()`, `numSMs()`).  External (cs_rank) launches
+  /// use only `smFillFactor` of the multi-chunk params — they keep their
+  /// own auto-tune path using the same Executor accessors.
+  void launchOnStream(Executor *exec,
+                       int64_t timeLength, int64_t numStocks,
                        const std::vector<std::pair<std::string, uintptr_t>> &args,
-                       CUstream stream,
-                       int devMaxSmemBytes,
                        int64_t mask = 0,
                        int minChunkWarmupFactor = 4,
-                       double smFillFactor = 1.5,
-                       int numSMs = 0);
+                       double smFillFactor = 1.5);
 
 private:
   /// Allocate (or re-allocate, if shape changed) the intermediate slot
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index e688e07..9354d1e 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -507,16 +507,34 @@ static ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
   return {chunkSize, static_cast<unsigned>(numChunks)};
 }
 
-/// External cs_rank launch: block_x = warps_per_cta*32, grid_x =
-/// time_length (one CTA per timestep), sharedMemBytes = numStocks *
-/// sizeof(T).  Checks the request against the cached device cap so
-/// we fail with a clear, GPU-aware message instead of letting
-/// cuLaunchKernel emit its generic error.
+/// External cs_rank launch.
+///
+/// Block / grid both auto-tuned — cs_rank is cross-sectional, so the
+/// graph-wide `warps_per_cta` hint doesn't apply.
+///
+///   blockX = clamp(round_up(numStocks, 32), 32, 1024)
+///       Each thread owns roughly one stock; when numStocks > 1024 the
+///       kernel falls back to its built-in `for (i = tid; i < S; i +=
+///       blockDim.x)` stride loop.
+///
+///   gridX  = min(timeLength, ceil(smFillFactor * numSMs))
+///       The kernel gives each CTA one contiguous time-axis slice of
+///       ceil(T / gridX) steps (see kernels/cs_rank.cu).  For small T the
+///       min clamps to 1 CTA per timestep (matches the pre-tuning
+///       launch shape); for large T fewer CTAs each do more time
+///       steps, reducing launch / scheduling overhead.
+///
+///   smem   = numStocks * sizeof(T)  (one cross-section, reused across
+///                                     the CTA's time slice)
+///
+/// Falls back to gridX = timeLength (blockX is still derived from
+/// numStocks) when the executor couldn't query `numSMs` from the
+/// device — the degenerate "one CTA per timestep" shape still works.
 static void launchExtCsRankKernel(CUfunction fn, KernelKind kind,
                                     const std::string &kernelName,
                                     int64_t timeLength, int64_t numStocks,
-                                    int64_t warpsPerCta,
                                     int devMaxSmemBytes,
+                                    double smFillFactor, int numSMs,
                                     void **args, CUstream stream) {
   size_t elemSize = (kind == KernelKind::ExtCsRankF64) ? 8u : 4u;
   uint64_t smemBytes64 =
@@ -541,8 +559,27 @@ static void launchExtCsRankKernel(CUfunction fn, KernelKind kind,
   if (timeLength <= 0)
     return; // empty time chunk — nothing to launch
 
-  unsigned blockX    = static_cast<unsigned>(warpsPerCta * 32);
-  unsigned gridX     = static_cast<unsigned>(timeLength);
+  constexpr int kWarp = 32;
+  constexpr int kMaxBlock = 1024;
+  int64_t blockX64 =
+      ((std::max<int64_t>(numStocks, 1) + kWarp - 1) / kWarp) * kWarp;
+  if (blockX64 > kMaxBlock) blockX64 = kMaxBlock;
+  unsigned blockX = static_cast<unsigned>(blockX64);
+
+  // Target gridX = sm_fill_factor * numSMs, capped at timeLength (the
+  // kernel's ceil split may still idle a few trailing CTAs).  numSMs == 0
+  // (device query failed) → gridX = timeLength, one CTA per timestep.
+  unsigned gridX;
+  if (numSMs > 0 && smFillFactor > 0.0) {
+    int64_t target = static_cast<int64_t>(
+        std::ceil(smFillFactor * static_cast<double>(numSMs)));
+    if (target < 1) target = 1;
+    if (target > timeLength) target = timeLength;
+    gridX = static_cast<unsigned>(target);
+  } else {
+    gridX = static_cast<unsigned>(timeLength);
+  }
+
   unsigned smemBytes = static_cast<unsigned>(smemBytes64);
   checkCu(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1,
                            smemBytes, stream, args, nullptr),
@@ -790,14 +827,18 @@ int Executable::peakIntermediateSlots() const noexcept {
 }
 
 void Executable::launchOnStream(
+    Executor *exec,
     int64_t timeLength, int64_t numStocks,
     const std::vector<std::pair<std::string, uintptr_t>> &args,
-    CUstream stream,
-    int devMaxSmemBytes,
     int64_t mask,
     int minChunkWarmupFactor,
-    double smFillFactor,
-    int numSMs) {
+    double smFillFactor) {
+  if (!exec)
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: Executor pointer is null");
+  CUstream stream      = exec->stream();
+  int devMaxSmemBytes  = exec->devMaxSmemBytes();
+  int numSMs           = exec->numSMs();
   // ── Shape sanity (kernel signature is i32 across the board) ─────
   if (timeLength > std::numeric_limits<int32_t>::max() ||
       numStocks  > std::numeric_limits<int32_t>::max() ||
@@ -872,8 +913,9 @@ void Executable::launchOnStream(
       argPtrs.push_back(&numStocksI32);
       for (auto &p : ptrs) argPtrs.push_back(&p);
       launchExtCsRankKernel(cuFuncs_[kIdx], meta.kind, meta.kernelName,
-                              timeLength, numStocks, data_.warpsPerCta,
-                              devMaxSmemBytes, argPtrs.data(), stream);
+                              timeLength, numStocks,
+                              devMaxSmemBytes, smFillFactor, numSMs,
+                              argPtrs.data(), stream);
     }
   }
 }
@@ -912,8 +954,8 @@ void Executor::runGraph(
     Executable &exe, int64_t timeLength, int64_t numStocks,
     const std::vector<std::pair<std::string, uintptr_t>> &args,
     int64_t mask, int minChunkWarmupFactor, double smFillFactor) {
-  exe.launchOnStream(timeLength, numStocks, args, stream_, devMaxSmemBytes_,
-                      mask, minChunkWarmupFactor, smFillFactor, numSMs_);
+  exe.launchOnStream(this, timeLength, numStocks, args,
+                      mask, minChunkWarmupFactor, smFillFactor);
 }
 
 void Executor::synchronize() {
diff --git a/mlir/lib/KunCuda/kernels/cs_rank.cu b/mlir/lib/KunCuda/kernels/cs_rank.cu
index abdeca1..dea2203 100644
--- a/mlir/lib/KunCuda/kernels/cs_rank.cu
+++ b/mlir/lib/KunCuda/kernels/cs_rank.cu
@@ -3,11 +3,18 @@
 //
 // Signature matches the project's launch convention so the executor can
 // pass the same `(i32 time_length, i32 num_stocks, in_ptr, out_ptr)`
-// arg tuple it uses for JIT'd kernels.  Grid/block/smem are chosen by
-// the executor at launch (time-major grid, one CTA per timestep,
-// dynamic smem = num_stocks * sizeof(T)).
+// arg tuple it uses for JIT'd kernels.
 //
-// Algorithm — pairwise O(N^2):
+// Launch shape (chosen by the executor):
+//   gridDim.x  = min(T, ceil(sm_fill_factor * numSMs))   // time chunks
+//   blockDim.x = clamp(round_up(num_stocks, 32), 32, 1024)
+//   smem       = num_stocks * sizeof(T)
+//
+// Each CTA processes a contiguous slice of time (`ceil(T/gridDim.x)`
+// steps), reusing its smem across the slice — time-contiguous so gmem
+// reads / writes stream cleanly through L2.
+//
+// Algorithm — pairwise O(N^2) per timestep:
 //   For each stock i with non-NaN value v,
 //     less  = #{ j : !isnan(u[j]) && u[j]  < v }
 //     equal = #{ j : !isnan(u[j]) && u[j] == v }    (includes i itself)
@@ -23,7 +30,7 @@
 // any count.
 
 #include <cuda_runtime.h>
-#include <math_constants.h>
+#include <math_constants.h>      // CUDART_NAN, CUDART_NAN_F
 
 // Dynamic shared memory base.  Declared at file scope (no anonymous
 // namespace) so it gets a stable, internal symbol rather than nvcc's
@@ -37,15 +44,6 @@ extern __shared__ unsigned char kun_cs_rank_smem[];
 
 namespace {
 
-template <typename T>
-__device__ static inline bool kun_isnan(T x);
-
-template <>
-__device__ inline bool kun_isnan<float>(float x) { return isnan(x); }
-
-template <>
-__device__ inline bool kun_isnan<double>(double x) { return isnan(x); }
-
 template <typename T>
 __device__ static inline T kun_nan();
 
@@ -55,54 +53,65 @@ __device__ inline float kun_nan<float>() { return CUDART_NAN_F; }
 template <>
 __device__ inline double kun_nan<double>() { return CUDART_NAN; }
 
-// Templated body — one CTA per timestep, threads cooperate across the
-// cross-section.
+// Templated body — each CTA processes a contiguous time-axis slice;
+// threads cooperate across the cross-section for every timestep in
+// the slice.
 template <typename T>
 __device__ static void cs_rank_body(const T* __restrict__ in,
                                     T* __restrict__ out,
                                     int time_length,
                                     int num_stocks) {
-    int t = blockIdx.x;
-    if (t >= time_length) return;
+    // Even split of [0, time_length) across gridDim.x.  Trailing CTAs may
+    // have fewer (or zero) timesteps when gridDim.x doesn't divide T.
+    int time_per_cta = (time_length + gridDim.x - 1) / gridDim.x;
+    int t0 = blockIdx.x * time_per_cta;
+    int t1 = t0 + time_per_cta;
+    if (t1 > time_length) t1 = time_length;
+    if (t0 >= t1) return;
 
     T* smem = reinterpret_cast<T*>(kun_cs_rank_smem);
 
-    const T* row_in  = in  + static_cast<size_t>(t) * num_stocks;
-    T*       row_out = out + static_cast<size_t>(t) * num_stocks;
-
-    // 1) Cooperative load of the entire cross-section into smem.
-    for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
-        smem[i] = row_in[i];
-    }
-    __syncthreads();
-
-    // 2) Per-stock pairwise count.  Each thread owns a stride of stocks.
-    for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
-        T v = smem[i];
-        if (kun_isnan<T>(v)) {
-            row_out[i] = kun_nan<T>();
-            continue;
-        }
+    for (int t = t0; t < t1; ++t) {
+        const T* row_in  = in  + static_cast<size_t>(t) * num_stocks;
+        T*       row_out = out + static_cast<size_t>(t) * num_stocks;
 
-        int less  = 0;
-        int equal = 0;
-        int valid = 0;
-        for (int j = 0; j < num_stocks; ++j) {
-            T u = smem[j];
-            int is_valid = !kun_isnan<T>(u);
-            valid += is_valid;
-            less  += (is_valid & (u <  v));
-            equal += (is_valid & (u == v));
+        // 1) Cooperative load of this timestep's cross-section into smem.
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+            smem[i] = row_in[i];
         }
-
-        if (valid == 0) {
-            row_out[i] = kun_nan<T>();
-            continue;
+        __syncthreads();
+
+        // 2) Per-stock pairwise count.  Each thread owns a stride of
+        //    stocks (= 1 stock when blockDim.x ≥ num_stocks).
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+            T v = smem[i];
+            if (isnan(v)) {
+                row_out[i] = kun_nan<T>();
+                continue;
+            }
+
+            int less  = 0;
+            int equal = 0;
+            int valid = 0;
+            for (int j = 0; j < num_stocks; ++j) {
+                T u = smem[j];
+                int is_valid = !isnan(u);
+                valid += is_valid;
+                less  += (is_valid & (u <  v));
+                equal += (is_valid & (u == v));
+            }
+
+            if (valid == 0) {
+                row_out[i] = kun_nan<T>();
+                continue;
+            }
+            // Average-rank percentile, matching the CPU reference.
+            T num = static_cast<T>(2 * less + equal + 1);
+            T den = static_cast<T>(2 * valid);
+            row_out[i] = num / den;
         }
-        // Average-rank percentile, matching the CPU reference.
-        T num = static_cast<T>(2 * less + equal + 1);
-        T den = static_cast<T>(2 * valid);
-        row_out[i] = num / den;
+        // Re-sync before the next iteration overwrites smem.
+        __syncthreads();
     }
 }
 

From da52d622bce89ba03a2c10bebacabb0e44d575a9 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 13 May 2026 20:31:37 -0700
Subject: [PATCH 23/59] boolean ops

---
 KunQuant/passes/CodegenMLIR.py       |  11 +++
 mlir/include/KunIr/KunIrOps.h        |  23 +++++-
 mlir/include/KunIr/KunIrOps.td       |  79 +++++++++++++++++++++
 mlir/lib/KunGpu/Pipelines.cpp        |   6 ++
 mlir/lib/KunIr/KunIrOps.cpp          |  99 ++++++++++++++++++++++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp     |   6 ++
 mlir/lib/Python/IRBuilder.cpp        |  34 ++++++++-
 mlir/test/kungpu/cse_ts_get.mlir     |  31 +++++++++
 mlir/test/kunir/basic.mlir           |  35 ++++++++++
 mlir/test/python/test_kun_to_cuda.py | 100 +++++++++++++++++++++++++++
 10 files changed, 422 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/kungpu/cse_ts_get.mlir

diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 891fc8f..77f8a59 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -26,6 +26,8 @@
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Sign,
+    GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
+    And, Or, Not, Select,
 )
 from KunQuant.ops.ReduceOp import (
     ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
@@ -39,9 +41,14 @@
 _BINARY = {
     Add: "add", Sub: "sub", Mul: "mul", Div: "div",
     Max: "max", Min: "min",
+    GreaterThan:  "gt", GreaterEqual: "ge",
+    LessThan:     "lt", LessEqual:    "le",
+    Equals:       "eq",
+    And:          "and_", Or:         "or_",
 }
 _UNARY = {
     Abs: "abs", Log: "log", Sign: "sign",
+    Not: "not_",
     # NOTE: `Rank` is intentionally absent.  Cross-sectional rank
     # partitions are routed to a pre-compiled CUmodule by
     # `_maybe_external_partition` below; they never become kunir ops.
@@ -119,6 +126,10 @@ def _emit_simple(op: OpBase, ir, val_map: Dict[OpBase, object]):
     if isinstance(op, FastWindowedSum):
         return ir.fast_windowed_sum(val_map[op.inputs[0]],
                                       int(op.attrs["window"]))
+    if isinstance(op, Select):
+        return ir.select(val_map[op.inputs[0]],
+                          val_map[op.inputs[1]],
+                          val_map[op.inputs[2]])
     raise NotImplementedError(
         f"CodegenMLIR: op type {cls.__name__} is not supported by the "
         f"GPU backend yet (op = {op})")
diff --git a/mlir/include/KunIr/KunIrOps.h b/mlir/include/KunIr/KunIrOps.h
index a45fabd..4398973 100644
--- a/mlir/include/KunIr/KunIrOps.h
+++ b/mlir/include/KunIr/KunIrOps.h
@@ -40,15 +40,36 @@ class KunIrElemwiseTsResultType
   }
 };
 
+/// Trait for comparison kunir ts ops (gt/ge/lt/le/eq).
+///
+/// Result is always !kunir.ts<i1, 1> regardless of operand element type.
+template <typename ConcreteType>
+class KunIrCmpTsResultType
+    : public TraitBase<ConcreteType, KunIrCmpTsResultType> {
+public:
+  static mlir::LogicalResult inferReturnTypes(
+      mlir::MLIRContext *ctx, std::optional<mlir::Location>,
+      mlir::ValueRange, mlir::DictionaryAttr,
+      mlir::PropertyRef, mlir::RegionRange,
+      llvm::SmallVectorImpl<mlir::Type> &inferred) {
+    inferred.push_back(
+        ::kunir::TsType::get(ctx, mlir::IntegerType::get(ctx, 1), 1));
+    return mlir::success();
+  }
+};
+
 } // namespace OpTrait
 } // namespace mlir
 
-// Convenient alias in the kunir namespace.
+// Convenient aliases in the kunir namespace.
 namespace kunir {
 namespace OpTrait {
 template <typename ConcreteType>
 using ElemwiseTsResultType =
     ::mlir::OpTrait::KunIrElemwiseTsResultType<ConcreteType>;
+template <typename ConcreteType>
+using CmpTsResultType =
+    ::mlir::OpTrait::KunIrCmpTsResultType<ConcreteType>;
 } // namespace OpTrait
 } // namespace kunir
 
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 8829abe..b03d937 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -83,6 +83,85 @@ def KunIr_SignOp : KunIr_UnaryElemwiseOp<"sign"> {
   let summary = "Element-wise sign (-1, 0, 1)";
 }
 
+//===----------------------------------------------------------------------===//
+// Logical ops on i1 ts
+//
+// And/Or are binary, Not is unary.  Operand element types must be i1 and the
+// result is always ts<i1, 1>.  Verifier enforces the i1 constraint; the
+// elemwise traits above already give us the right result-type inference
+// (result = ts<input.elem, 1> = ts<i1, 1> when input is i1).
+//===----------------------------------------------------------------------===//
+
+def KunIr_AndOp : KunIr_BinaryElemwiseOp<"and"> {
+  let summary = "Element-wise logical AND on ts<i1>";
+}
+def KunIr_OrOp  : KunIr_BinaryElemwiseOp<"or"> {
+  let summary = "Element-wise logical OR on ts<i1>";
+}
+def KunIr_NotOp : KunIr_UnaryElemwiseOp<"not"> {
+  let summary = "Element-wise logical NOT on ts<i1>";
+}
+
+//===----------------------------------------------------------------------===//
+// Comparison ops
+//
+// Operands are two ts values with matching element types; result is always
+// !kunir.ts<i1, 1>.  Lowering dispatches arith.cmpf for float operands and
+// arith.cmpi for integer operands.
+//===----------------------------------------------------------------------===//
+
+class KunIr_BinaryCmpOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        InferTypeOpInterface,
+        NativeOpTrait<"KunIrCmpTsResultType">,
+        DeclareOpInterfaceMethods<KunIr_BinaryArithInterface>
+    ])> {
+  let arguments = (ins KunIr_AnyTs:$lhs, KunIr_AnyTs:$rhs);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$lhs `,` $rhs `:` type($lhs) `,` type($rhs) attr-dict";
+}
+
+def KunIr_GreaterOp      : KunIr_BinaryCmpOp<"gt"> {
+  let summary = "Element-wise greater-than (lhs > rhs)";
+}
+def KunIr_GreaterEqualOp : KunIr_BinaryCmpOp<"ge"> {
+  let summary = "Element-wise greater-or-equal (lhs >= rhs)";
+}
+def KunIr_LessOp         : KunIr_BinaryCmpOp<"lt"> {
+  let summary = "Element-wise less-than (lhs < rhs)";
+}
+def KunIr_LessEqualOp    : KunIr_BinaryCmpOp<"le"> {
+  let summary = "Element-wise less-or-equal (lhs <= rhs)";
+}
+def KunIr_EqualOp        : KunIr_BinaryCmpOp<"eq"> {
+  let summary = "Element-wise equality (lhs == rhs)";
+}
+
+//===----------------------------------------------------------------------===//
+// Select op — cond ? true_value : false_value
+//
+// cond is ts<i1, *>; true_value / false_value share the same element type T.
+// Result is ts<T, 1>.
+//===----------------------------------------------------------------------===//
+
+def KunIr_SelectOp : KunIr_Op<"select", [
+    Pure,
+    DeclareOpInterfaceMethods<InferTypeOpInterface>
+]> {
+  let summary = "Element-wise select on three ts values";
+  let arguments = (ins KunIr_AnyTs:$cond,
+                       KunIr_AnyTs:$true_value,
+                       KunIr_AnyTs:$false_value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$cond `,` $true_value `,` $false_value `:` type($cond) `,` "
+    "type($true_value) `,` type($false_value) attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // Cross-sectional ops
 //
diff --git a/mlir/lib/KunGpu/Pipelines.cpp b/mlir/lib/KunGpu/Pipelines.cpp
index 6a87db7..27249e8 100644
--- a/mlir/lib/KunGpu/Pipelines.cpp
+++ b/mlir/lib/KunGpu/Pipelines.cpp
@@ -38,6 +38,12 @@ void buildKunIrToLLVMPipeline(OpPassManager &pm) {
   {
     OpPassManager &gpuModPM = pm.nest<gpu::GPUModuleOp>();
     gpuModPM.addNestedPass<::kunir::FuncOp>(::kunir::createKunIrToKunGpuPass());
+    // CSE here — kungpu.ts.get is Pure, so any duplicates emitted by the
+    // lowering above (e.g. two back_refs reading the same input at the same
+    // offset, or any other path where distinct kunir SSA values produce
+    // identical ts.get) collapse to a single load before
+    // windowed-temp-memory-planning + convert-kungpu-to-llvm see them.
+    gpuModPM.addNestedPass<::kunir::FuncOp>(createCSEPass());
     gpuModPM.addNestedPass<::kunir::FuncOp>(
         ::kungpu::createWindowedTempMemoryPlanningPass());
   }
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 771184d..0c3719e 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -62,6 +62,24 @@ LogicalResult MulOp::verify() { return verifyBinaryElemwise(*this, getLhs(), get
 LogicalResult DivOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
 LogicalResult MaxOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
 LogicalResult MinOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult EqualOp::verify()        { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult GreaterOp::verify()      { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult GreaterEqualOp::verify() { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult LessOp::verify()         { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+LogicalResult LessEqualOp::verify()    { return verifyBinaryElemwise(*this, getLhs(), getRhs()); }
+
+// Logical ops also require both operands to be i1 ts.
+static LogicalResult verifyLogicalBinary(Operation *op, Value lhs, Value rhs) {
+  if (failed(verifyBinaryElemwise(op, lhs, rhs)))
+    return failure();
+  auto elemTy = llvm::cast<TsType>(lhs.getType()).getElementType();
+  if (!elemTy.isInteger(1))
+    return op->emitOpError("operand element type must be i1, got '")
+           << elemTy << "'";
+  return success();
+}
+LogicalResult AndOp::verify() { return verifyLogicalBinary(*this, getLhs(), getRhs()); }
+LogicalResult OrOp::verify()  { return verifyLogicalBinary(*this, getLhs(), getRhs()); }
 
 //===----------------------------------------------------------------------===//
 // Unary elemwise ops — verify only
@@ -71,6 +89,43 @@ LogicalResult AbsOp::verify()  { return success(); }
 LogicalResult LogOp::verify()  { return success(); }
 LogicalResult SignOp::verify() { return success(); }
 
+LogicalResult NotOp::verify() {
+  auto elemTy = llvm::cast<TsType>(getInput().getType()).getElementType();
+  if (!elemTy.isInteger(1))
+    return emitOpError("operand element type must be i1, got '")
+           << elemTy << "'";
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// SelectOp — cond must be ts<i1, *>; true/false must share elem type.
+//===----------------------------------------------------------------------===//
+
+LogicalResult SelectOp::verify() {
+  auto condTy  = llvm::cast<TsType>(getCond().getType());
+  auto trueTy  = llvm::cast<TsType>(getTrueValue().getType());
+  auto falseTy = llvm::cast<TsType>(getFalseValue().getType());
+  if (!condTy.getElementType().isInteger(1))
+    return emitOpError("cond element type must be i1, got '")
+           << condTy.getElementType() << "'";
+  if (trueTy.getElementType() != falseTy.getElementType())
+    return emitOpError("true_value element type '")
+           << trueTy.getElementType()
+           << "' must match false_value element type '"
+           << falseTy.getElementType() << "'";
+  return success();
+}
+
+// Result type: ts<true_value.elem, 1>.
+LogicalResult SelectOp::inferReturnTypes(
+    MLIRContext *ctx, std::optional<Location>, ValueRange operands,
+    DictionaryAttr, PropertyRef, RegionRange,
+    SmallVectorImpl<Type> &inferred) {
+  auto trueTy = llvm::cast<TsType>(operands[1].getType());
+  inferred.push_back(TsType::get(ctx, trueTy.getElementType(), 1));
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // WindowedOutputOp
 //===----------------------------------------------------------------------===//
@@ -373,6 +428,44 @@ Value MinOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
   return b.create<arith::MinimumFOp>(loc, lhs, rhs);
 }
 
+// Comparison ops: arith.cmpf for FloatType operands, arith.cmpi (signed predicates) otherwise.  Verifier guarantees
+// lhs.type == rhs.type.  NOTE(review): signed predicates order i1 true (-1) below false (0) — confirm intent.
+static Value buildCmpScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs,
+                              arith::CmpFPredicate fp,
+                              arith::CmpIPredicate ip) {
+  if (llvm::isa<FloatType>(lhs.getType()))
+    return b.create<arith::CmpFOp>(loc, fp, lhs, rhs);
+  return b.create<arith::CmpIOp>(loc, ip, lhs, rhs);
+}
+Value GreaterOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs,
+                          arith::CmpFPredicate::OGT, arith::CmpIPredicate::sgt);
+}
+Value GreaterEqualOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs,
+                          arith::CmpFPredicate::OGE, arith::CmpIPredicate::sge);
+}
+Value LessOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs,
+                          arith::CmpFPredicate::OLT, arith::CmpIPredicate::slt);
+}
+Value LessEqualOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs,
+                          arith::CmpFPredicate::OLE, arith::CmpIPredicate::sle);
+}
+Value EqualOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs,
+                          arith::CmpFPredicate::OEQ, arith::CmpIPredicate::eq);
+}
+
+// Logical binary ops on i1.
+Value AndOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::AndIOp>(loc, lhs, rhs);
+}
+Value OrOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return b.create<arith::OrIOp>(loc, lhs, rhs);
+}
+
 //===----------------------------------------------------------------------===//
 // UnaryArithInterface implementations
 //===----------------------------------------------------------------------===//
@@ -389,6 +482,12 @@ Value SignOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
       loc, operand.getType(), b.getFloatAttr(operand.getType(), 1.0));
   return b.create<math::CopySignOp>(loc, one, operand);
 }
+Value NotOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  // not(x) = x ^ 1 on i1
+  Value one = b.create<arith::ConstantOp>(loc, b.getI1Type(),
+                                            b.getIntegerAttr(b.getI1Type(), 1));
+  return b.create<arith::XOrIOp>(loc, operand, one);
+}
 
 //===----------------------------------------------------------------------===//
 // ReduceArithInterface implementations
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 0577059..5048b71 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -101,6 +101,12 @@ static LogicalResult lowerBlock(
              && "reduce result must be pre-seeded in tsMap with current acc");
       it->second = {TsKind::Scalar,
           ri.buildAccumOp(b, ol, it->second.value, elem)};
+    } else if (auto sel = dyn_cast<SelectOp>(op)) {
+      Value cond  = getScalar(sel.getCond(),       tsMap, offsetI32, b, ol);
+      Value tv    = getScalar(sel.getTrueValue(),  tsMap, offsetI32, b, ol);
+      Value fv    = getScalar(sel.getFalseValue(), tsMap, offsetI32, b, ol);
+      tsMap[sel.getResult()] = {TsKind::Scalar,
+          b.create<arith::SelectOp>(ol, cond, tv, fv).getResult()};
     } else if (handleUnknown) {
       if (failed(handleUnknown(*op))) return failure();
     } else {
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index d1970b4..570382e 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -56,9 +56,11 @@ class IRBuilder {
       elem = b_.getF32Type();
     else if (elemDtype == "f64" || elemDtype == "double")
       elem = b_.getF64Type();
+    else if (elemDtype == "i1" || elemDtype == "bool")
+      elem = b_.getI1Type();
     else
       throw std::runtime_error("IRBuilder.ts_type: unsupported elem dtype '" +
-                                 elemDtype + "' (expected f32/f64)");
+                                 elemDtype + "' (expected f32/f64/i1)");
     uint64_t lb = lookback == 0 ? std::numeric_limits<uint64_t>::max()
                                   : static_cast<uint64_t>(lookback);
     return kunir::TsType::get(pm_->ctx.get(), elem, lb);
@@ -148,6 +150,21 @@ class IRBuilder {
   Value logOp(Value x)  { return makeUn<kunir::LogOp>(x); }
   Value signOp(Value x) { return makeUn<kunir::SignOp>(x); }
 
+  // ── Comparison + logical (binary; `not` is unary) → ts<i1, 1> ───
+  Value gtOp(Value a, Value b) { return makeBin<kunir::GreaterOp>(a, b); }
+  Value geOp(Value a, Value b) { return makeBin<kunir::GreaterEqualOp>(a, b); }
+  Value ltOp(Value a, Value b) { return makeBin<kunir::LessOp>(a, b); }
+  Value leOp(Value a, Value b) { return makeBin<kunir::LessEqualOp>(a, b); }
+  Value eqOp(Value a, Value b) { return makeBin<kunir::EqualOp>(a, b); }
+  Value andOp(Value a, Value b) { return makeBin<kunir::AndOp>(a, b); }
+  Value orOp(Value a, Value b)  { return makeBin<kunir::OrOp>(a, b); }
+  Value notOp(Value x) { return makeUn<kunir::NotOp>(x); }
+
+  // ── Select (cond, true_value, false_value) ──────────────────────
+  Value selectOp(Value cond, Value tv, Value fv) {
+    return b_.create<kunir::SelectOp>(b_.getUnknownLoc(), cond, tv, fv);
+  }
+
   // ── Windowed buffer materialization ───────────────────────────────
   Value windowedOutputOp(Value x, int64_t length) {
     auto inTs = llvm::cast<kunir::TsType>(x.getType());
@@ -323,6 +340,21 @@ void registerIRBuilder(nb::module_ &m) {
       .def("log",    &IRBuilder::logOp,    nb::arg("x"))
       .def("sign",   &IRBuilder::signOp,   nb::arg("x"))
 
+      // Comparison + logical. Cmp ops (binary) return ts<i1, 1>; and/or
+      // (binary) and not (unary) take ts<i1, *> operands, return ts<i1, 1>.
+      .def("gt",     &IRBuilder::gtOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("ge",     &IRBuilder::geOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("lt",     &IRBuilder::ltOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("le",     &IRBuilder::leOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("eq",     &IRBuilder::eqOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("and_",   &IRBuilder::andOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("or_",    &IRBuilder::orOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("not_",   &IRBuilder::notOp,    nb::arg("x"))
+
+      // Select: cond ? true_value : false_value
+      .def("select", &IRBuilder::selectOp,
+            nb::arg("cond"), nb::arg("true_value"), nb::arg("false_value"))
+
       // Windowed materialization
       .def("windowed_output", &IRBuilder::windowedOutputOp,
             nb::arg("x"), nb::arg("length"))
diff --git a/mlir/test/kungpu/cse_ts_get.mlir b/mlir/test/kungpu/cse_ts_get.mlir
new file mode 100644
index 0000000..be4522a
--- /dev/null
+++ b/mlir/test/kungpu/cse_ts_get.mlir
@@ -0,0 +1,31 @@
+// RUN: %kun-opt --pass-pipeline='builtin.module(gpu.module(kunir.func(kunir-to-kungpu,cse)))' %s | %FileCheck %s
+//
+// Verify that the CSE pass placed between `kunir-to-kungpu` and
+// `windowed-temp-memory-planning` deduplicates identical kungpu.ts.get
+// loads.  kungpu.ts.get is marked Pure, so CSE collapses any pair of
+// reads with the same (handle, offset) operands.
+//
+// Two distinct kunir.back_ref ops on the same ts value (%a below) at the
+// same window are expected to lower to two identical kungpu.ts.get loads
+// inside the outer time loop; after CSE only one survives.
+
+gpu.module @kungpu_kernels {
+  // CHECK-LABEL: kunir.func @two_back_refs
+  kunir.func @two_back_refs(%a: !kunir.ts<f32, inf>)
+      inputs {%a = "a"}
+      outputs {"out"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 5
+      -> !kunir.ts<f32, 1> {
+    %r1 = kunir.back_ref %a [window = 5] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 1>
+    %r2 = kunir.back_ref %a [window = 5] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 1>
+    %sum = kunir.add %r1, %r2 : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+    kunir.return %sum : !kunir.ts<f32, 1>
+  }
+}
+
+// Both back_refs read the same handle at the same offset (window = 5),
+// so after CSE exactly ONE kungpu.ts.get must remain in the function:
+// the CHECK matches it and the CHECK-NOT rejects any further load.
+//
+// CHECK:       kungpu.ts.get
+// CHECK-NOT:   kungpu.ts.get
diff --git a/mlir/test/kunir/basic.mlir b/mlir/test/kunir/basic.mlir
index 6519831..d19ffac 100644
--- a/mlir/test/kunir/basic.mlir
+++ b/mlir/test/kunir/basic.mlir
@@ -145,3 +145,38 @@ kunir.func @test_f64_binary(%a: !kunir.ts<f64, inf>, %b: !kunir.ts<f64, inf>)
   %result = kunir.max %a, %b : !kunir.ts<f64, inf>, !kunir.ts<f64, inf>
   kunir.return %result : !kunir.ts<f64, 1>
 }
+
+// CHECK-LABEL: kunir.func @test_cmp_logical_select
+kunir.func @test_cmp_logical_select(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+    inputs {%a = "a", %b = "b"}
+    outputs {"gt_out", "lt_out", "eq_out", "and_out", "or_out", "not_out"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>,
+        !kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+  // CHECK: kunir.gt
+  %gt = kunir.gt %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.lt
+  %lt = kunir.lt %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.ge
+  %ge = kunir.ge %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.le
+  %le = kunir.le %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.eq
+  %eq = kunir.eq %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.and
+  %and = kunir.and %gt, %lt : !kunir.ts<i1, 1>, !kunir.ts<i1, 1>
+  // CHECK: kunir.or
+  %or  = kunir.or  %ge, %le : !kunir.ts<i1, 1>, !kunir.ts<i1, 1>
+  // CHECK: kunir.not
+  %nt  = kunir.not %lt : !kunir.ts<i1, 1>
+  // CHECK: kunir.select
+  %s_gt  = kunir.select %gt,  %a, %b : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %s_lt  = kunir.select %lt,  %a, %b : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %s_eq  = kunir.select %eq,  %a, %b : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %s_and = kunir.select %and, %a, %b : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %s_or  = kunir.select %or,  %a, %b : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %s_nt  = kunir.select %nt,  %a, %b : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  kunir.return %s_gt, %s_lt, %s_eq, %s_and, %s_or, %s_nt
+    : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>,
+      !kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+}
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index c37f565..64dc9f3 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -28,6 +28,10 @@
 
 from KunQuant.Op import Builder, Input, Output
 from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum
+from KunQuant.ops.ElewiseOp import (
+    GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
+    And, Or, Not, Select,
+)
 from KunQuant.ops.MiscOp import BackRef, FastWindowedSum
 from KunQuant.Stage import Function
 from KunQuant.jit import KunMLIR
@@ -90,6 +94,41 @@ def build_func_fastwindowedsum(N: int) -> Function:
     return Function(builder.ops, name="fastwindowedsum_kernel")
 
 
+def build_func_cmp_logical() -> Function:
+    """Build one multi-output graph covering every kunir cmp / logical /
+    select op.  Each output is `Select(cond, a, b)` for a different cond:
+
+      gt_out  : a > b              (element-wise max)
+      lt_out  : a < b              (element-wise min)
+      ge_out  : a >= b             (max, ties pick a)
+      le_out  : a <= b             (min, ties pick a)
+      eq_out  : a == b             (a where they match, else b)
+      and_out : (a > 0) & (b > 0)  (gt + and)
+      or_out  : (a > 0) | (b > 0)  (gt + or)
+      not_out : !(a > b)           (gt + not)
+
+    The zero operand is built as `Sub(a, a)` so the graph stays inside
+    the ops CodegenMLIR supports (no ConstantOp on the GPU path yet).
+    """
+    graph = Builder()
+    with graph:
+        a = Input("a")
+        b = Input("b")
+        zero = Sub(a, a)  # elementwise 0 without a ConstantOp
+        plain_cmps = (
+            (GreaterThan, "gt_out"), (LessThan, "lt_out"),
+            (GreaterEqual, "ge_out"), (LessEqual, "le_out"),
+            (Equals, "eq_out"),
+        )
+        for cmp_cls, out_name in plain_cmps:
+            Output(Select(cmp_cls(a, b), a, b), out_name)
+        for logic_cls, out_name in ((And, "and_out"), (Or, "or_out")):
+            both = logic_cls(GreaterThan(a, zero), GreaterThan(b, zero))
+            Output(Select(both, a, b), out_name)
+        Output(Select(Not(GreaterThan(a, b)), a, b), "not_out")
+    return Function(graph.ops, name="cmp_logical_kernel")
+
+
 def build_func_multipartition() -> Function:
     """A graph with three independent outputs.  Combined with
     `partition_factor=1` this drives `do_partition` to split into
@@ -312,6 +351,65 @@ def run_multipartition(target: str, T: int, S: int) -> int:
     return 0
 
 
+def run_cmp_logical(target: str, T: int, S: int) -> int:
+    """End-to-end test for kunir.gt/ge/lt/le/eq + and/or/not + select.
+
+    Compiles one graph with eight outputs, runs it on random float32
+    inputs and compares every output against a numpy `np.where`
+    reference.  Returns 0 when all outputs match, 1 otherwise; on a
+    mismatch the worst cell is reported (a NaN cell counts as worst).
+    """
+    print("=== cmp/logical/select: 8 outputs exercising kunir bool ops ===")
+    f = build_func_cmp_logical()
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(11)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+
+    out_names = ["gt_out", "lt_out", "ge_out", "le_out",
+                  "eq_out", "and_out", "or_out", "not_out"]
+    outs = {n: cp.zeros((T, S), dtype=cp.float32) for n in out_names}
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h), "b": cp.asarray(b_h), **outs})
+
+    def ref(cond: np.ndarray) -> np.ndarray:
+        return np.where(cond, a_h, b_h)
+
+    expected = {
+        "gt_out":  ref(a_h >  b_h),
+        "lt_out":  ref(a_h <  b_h),
+        "ge_out":  ref(a_h >= b_h),
+        "le_out":  ref(a_h <= b_h),
+        "eq_out":  ref(a_h == b_h),
+        "and_out": ref((a_h > 0) & (b_h > 0)),
+        "or_out":  ref((a_h > 0) | (b_h > 0)),
+        "not_out": ref(~(a_h > b_h)),
+    }
+
+    rc = 0
+    for n in out_names:
+        out_h = cp.asnumpy(outs[n])
+        if not np.allclose(out_h, expected[n], atol=1e-5):
+            diff = np.abs(out_h - expected[n])
+            diff[np.isnan(diff)] = np.inf  # NaN cells are mismatches: rank worst
+            idx  = np.unravel_index(int(np.argmax(diff)), diff.shape)
+            print(f"  FAIL {n} — max |Δ|={float(diff.max()):.3e} at {idx}",
+                    file=sys.stderr)
+            rc = 1
+        else:
+            print(f"  ok {n}")
+    if rc == 0:
+        print(f"  ok — all 8 outputs match across {T*S} cells")
+    return rc
+
+
 def run_windowed(target: str, T: int, S: int, N: int) -> int:
     print(f"=== windowed: ws = WindowedSum(a + b, N={N}) ===")
     f = build_func_windowed(N)
@@ -491,6 +589,8 @@ def main() -> int:
                                     N=20, mask=1)
     print()
     rc |= run_multipartition(args.target, args.time_length, args.num_stocks)
+    print()
+    rc |= run_cmp_logical(args.target, args.time_length, args.num_stocks)
     return rc
 
 

From 40be63f2f5318e010a8b7f02187173e4bb1c79d0 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 13 May 2026 23:12:32 -0700
Subject: [PATCH 24/59] refactor kunir-to-kungpu + test backwindow outer-ts
 read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LowerHelper struct splits tsMap (handles) from scalarMap (scalars).
getScalarUncached(v, offset) is the shared util for BackRefOp,
for_each_back_window block-arg pre-load and getScalar; getScalar
caches offset-0 reads in scalarMap. Both return FailureOr<Value>
threaded via KUN_ASSIGN_OR_FAIL.

test_kun_to_cuda.py adds ws_maxabs = max_k |c[t-k] - c[t]| to the
windowed test — body reads both the IterValue block-arg and outer ts
c, covering the inner-scope tsMap/scalarMap inheritance. Split
run_windowed into build_windowed + test_windowed so the N=window
build is shared across mask=0 / mask=3 runs; folded the old
run_windowed_with_mask into test_windowed(mask=...).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mlir/lib/KunIr/KunIrToKunGpu.cpp     | 304 +++++++++++++++------------
 mlir/test/python/test_kun_to_cuda.py | 198 ++++++++---------
 2 files changed, 276 insertions(+), 226 deletions(-)

diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 5048b71..0c5740f 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -35,97 +35,135 @@ using namespace mlir;
 using namespace kunir;
 using namespace kungpu;
 
+// In a function returning LogicalResult / FailureOr<U>:
+//   KUN_ASSIGN_OR_FAIL(T x, callReturningFailureOrT(...));
+// On failure → `return failure();`.  On success → declare `x = *result`.
+// Multi-statement expansion; do not use without braces in if/while/for bodies.
+#define KUN_DETAIL_CAT_(a, b) a##b
+#define KUN_DETAIL_CAT(a, b)  KUN_DETAIL_CAT_(a, b)
+#define KUN_ASSIGN_OR_FAIL_IMPL(decl, expr, tmp)        \
+    auto tmp = (expr);                                  \
+    if (::mlir::failed(tmp)) return ::mlir::failure();  \
+    decl = *std::move(tmp)
+#define KUN_ASSIGN_OR_FAIL(decl, expr)                  \
+    KUN_ASSIGN_OR_FAIL_IMPL(decl, expr,                 \
+        KUN_DETAIL_CAT(_kunOrFail_, __COUNTER__))
+
 namespace {
 
 //===----------------------------------------------------------------------===//
-// Value tracking: is a kunir ts value a ts handle or a scalar?
+// Value tracking
+//
+// Two disjoint maps, both keyed by kunir SSA Values:
+//   tsMap     : ts SSA value  -> the ts handle SSA value to load from.
+//               Populated by: function-arg seeding, WindowedOutputOp
+//               (handle = windowed_temp), inner LowerHelper copy of outer.tsMap.
+//   scalarMap : SSA value     -> the scalar SSA value already materialised
+//               for it.  Populated by: arith op results, reduce accumulator
+//               pre-seed + update, BackRefOp result, for_each_back_window
+//               result, FastWindowedSumOp result, getScalar's offset-0 cache.
 //
-// HANDLE — the mapped Value is a ts memory object (!kunir.ts<*>); loading it
-//          via ts.get at a given time index yields the element scalar.
-// SCALAR — the mapped Value is an already-computed float scalar.
+// A given SSA value lives in at most one map at construction time, but a ts
+// handle can become "additionally" represented in scalarMap once it has been
+// read at offset 0 — that's the getScalar cache.
 //===----------------------------------------------------------------------===//
 
-enum class TsKind { Handle, Scalar };
-struct TsEntry { TsKind kind; Value value; };
-using TsMap = llvm::DenseMap<Value, TsEntry>;
+using HandleMap = llvm::DenseMap<Value, Value>;
+
+// One LowerHelper per scope (outer function body / for_each_back_window body).
+// `zeroOffsetI32` is the function-scope i32 zero constant created once before
+// the outer scf.for; all LowerHelper instances share it.
+struct LowerHelper {
+  HandleMap tsMap;
+  HandleMap scalarMap;
+  Value zeroOffsetI32;
+
+  // Shared util: look up `v` (a ts SSA value) in tsMap, emit
+  // ts.get(handle, offsetI32), return the loaded scalar.  Does NOT touch
+  // scalarMap — callers decide whether/where to cache the result.  Returns
+  // failure (with an in-flight diagnostic at `loc`) if `v` is not a
+  // registered ts handle.
+  //
+  // Used by:
+  //   - getScalar (offset = zeroOffsetI32)
+  //   - BackRefOp branch (offset = constant(window))
+  //   - for_each_back_window block-arg pre-load (offset = window-1-w)
+  FailureOr<Value> getScalarUncached(Value v, Value offsetI32,
+                                      OpBuilder &b, Location loc) {
+    auto it = tsMap.find(v);
+    if (it == tsMap.end())
+      return emitError(loc,
+          "kunir-to-kungpu: value is not a registered ts handle in tsMap");
+    auto tsTy = llvm::cast<TsType>(v.getType());
+    return b.create<TsGetOp>(loc, tsTy.getElementType(),
+                              it->second, offsetI32).getResult();
+  }
 
-// If `v` is mapped as a Handle in tsMap, emit ts.get(handle, offsetI32) and
-// promote the entry to Scalar.  Returns the scalar value.
-//
-// `offsetI32` is the tail-relative offset (i32):
-//   0 = latest (just put / current time step)
-//   k = k steps earlier
-static Value getScalar(Value v, TsMap &tsMap, Value offsetI32,
-                       OpBuilder &b, Location loc) {
-  auto it = tsMap.find(v);
-  assert(it != tsMap.end() && "value not found in tsMap");
-  if (it->second.kind == TsKind::Scalar)
-    return it->second.value;
-  auto tsTy = llvm::cast<TsType>(v.getType());
-  Value scalar = b.create<TsGetOp>(loc, tsTy.getElementType(),
-                                    it->second.value, offsetI32);
-  it->second = {TsKind::Scalar, scalar};
-  return scalar;
-}
+  // Offset-0 read with scalarMap caching.  Looks up scalarMap first; on miss
+  // loads at offset 0 via getScalarUncached and caches the result.  This is
+  // the standard "current time step" read used by all in-body operand
+  // lookups inside lowerBlock.
+  FailureOr<Value> getScalar(Value v, OpBuilder &b, Location loc) {
+    auto sit = scalarMap.find(v);
+    if (sit != scalarMap.end()) return sit->second;
+    KUN_ASSIGN_OR_FAIL(Value scalar,
+                       getScalarUncached(v, zeroOffsetI32, b, loc));
+    scalarMap[v] = scalar;
+    return scalar;
+  }
 
-// Lower non-terminator ops in `ops` in sequential (definition) order.
-//
-// For each op:
-//   - BinaryArithInterface: emit scalar binary op, record result as Scalar.
-//   - UnaryArithInterface:  emit scalar unary op,  record result as Scalar.
-//   - ReduceArithInterface: caller must pre-seed the op's result in tsMap with
-//     the current accumulator (iterArg).  This function emits the accumulation
-//     step and updates the tsMap entry to the new accumulator.
-//   - Anything else: call handleUnknown if provided, else return failure.
-//
-// Handle-typed operands are loaded via ts.get (getScalar) on first use.
-static LogicalResult lowerBlock(
-    llvm::ArrayRef<Operation *> ops,
-    TsMap &tsMap, Value offsetI32, OpBuilder &b, Location loc,
-    llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
-  for (Operation *op : ops) {
-    Location ol = op->getLoc();
-    if (auto iface = dyn_cast<BinaryArithInterface>(op)) {
-      Value lhs = getScalar(op->getOperand(0), tsMap, offsetI32, b, ol);
-      Value rhs = getScalar(op->getOperand(1), tsMap, offsetI32, b, ol);
-      tsMap[op->getResult(0)] = {TsKind::Scalar,
-          iface.buildScalarOp(b, ol, lhs, rhs)};
-    } else if (auto iface = dyn_cast<UnaryArithInterface>(op)) {
-      Value operand = getScalar(op->getOperand(0), tsMap, offsetI32, b, ol);
-      tsMap[op->getResult(0)] = {TsKind::Scalar,
-          iface.buildScalarOp(b, ol, operand)};
-    } else if (auto ri = dyn_cast<ReduceArithInterface>(op)) {
-      Value elem = getScalar(op->getOperand(0), tsMap, offsetI32, b, ol);
-      auto it = tsMap.find(op->getResult(0));
-      assert(it != tsMap.end() && it->second.kind == TsKind::Scalar
-             && "reduce result must be pre-seeded in tsMap with current acc");
-      it->second = {TsKind::Scalar,
-          ri.buildAccumOp(b, ol, it->second.value, elem)};
-    } else if (auto sel = dyn_cast<SelectOp>(op)) {
-      Value cond  = getScalar(sel.getCond(),       tsMap, offsetI32, b, ol);
-      Value tv    = getScalar(sel.getTrueValue(),  tsMap, offsetI32, b, ol);
-      Value fv    = getScalar(sel.getFalseValue(), tsMap, offsetI32, b, ol);
-      tsMap[sel.getResult()] = {TsKind::Scalar,
-          b.create<arith::SelectOp>(ol, cond, tv, fv).getResult()};
-    } else if (handleUnknown) {
-      if (failed(handleUnknown(*op))) return failure();
-    } else {
-      return op->emitError("kunir-to-kungpu: cannot lower op in block");
+  // Lower non-terminator ops in `ops` in definition order.
+  //
+  // Binary/unary/reduce/select/back_ref ops are lowered inline into scalarMap.
+  // Anything else: call handleUnknown if provided, else return failure.
+  LogicalResult lowerBlock(
+      llvm::ArrayRef<Operation *> ops, OpBuilder &b,
+      llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
+    for (Operation *op : ops) {
+      Location ol = op->getLoc();
+      if (auto iface = dyn_cast<BinaryArithInterface>(op)) {
+        KUN_ASSIGN_OR_FAIL(Value lhs, getScalar(op->getOperand(0), b, ol));
+        KUN_ASSIGN_OR_FAIL(Value rhs, getScalar(op->getOperand(1), b, ol));
+        scalarMap[op->getResult(0)] = iface.buildScalarOp(b, ol, lhs, rhs);
+      } else if (auto iface = dyn_cast<UnaryArithInterface>(op)) {
+        KUN_ASSIGN_OR_FAIL(Value operand, getScalar(op->getOperand(0), b, ol));
+        scalarMap[op->getResult(0)] = iface.buildScalarOp(b, ol, operand);
+      } else if (auto ri = dyn_cast<ReduceArithInterface>(op)) {
+        KUN_ASSIGN_OR_FAIL(Value elem, getScalar(op->getOperand(0), b, ol));
+        auto it = scalarMap.find(op->getResult(0));
+        assert(it != scalarMap.end() &&
+               "reduce result must be pre-seeded in scalarMap with current acc");
+        it->second = ri.buildAccumOp(b, ol, it->second, elem);
+      } else if (auto sel = dyn_cast<SelectOp>(op)) {
+        KUN_ASSIGN_OR_FAIL(Value cond, getScalar(sel.getCond(),      b, ol));
+        KUN_ASSIGN_OR_FAIL(Value tv,   getScalar(sel.getTrueValue(), b, ol));
+        KUN_ASSIGN_OR_FAIL(Value fv,   getScalar(sel.getFalseValue(),b, ol));
+        scalarMap[sel.getResult()] =
+            b.create<arith::SelectOp>(ol, cond, tv, fv).getResult();
+      } else if (auto br = dyn_cast<BackRefOp>(op)) {
+        Value offset = b.create<arith::ConstantOp>(
+            ol, b.getI32Type(), b.getI32IntegerAttr(br.getWindow()));
+        KUN_ASSIGN_OR_FAIL(Value scalar,
+            getScalarUncached(br.getInput(), offset, b, ol));
+        scalarMap[br.getResult()] = scalar;
+      } else if (handleUnknown) {
+        if (failed(handleUnknown(*op))) return failure();
+      } else {
+        return op->emitError("kunir-to-kungpu: cannot lower op in block");
+      }
     }
+    return success();
   }
-  return success();
-}
 
-// Overload that collects non-terminator ops from `block` and delegates.
-static LogicalResult lowerBlock(
-    Block &block,
-    TsMap &tsMap, Value offsetI32, OpBuilder &b, Location loc,
-    llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
-  SmallVector<Operation *> ops;
-  for (Operation &op : block.without_terminator())
-    ops.push_back(&op);
-  return lowerBlock(ops, tsMap, offsetI32, b, loc, handleUnknown);
-}
+  LogicalResult lowerBlock(
+      Block &block, OpBuilder &b,
+      llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
+    SmallVector<Operation *> ops;
+    for (Operation &op : block.without_terminator())
+      ops.push_back(&op);
+    return lowerBlock(ops, b, handleUnknown);
+  }
+};
 
 //===----------------------------------------------------------------------===//
 // Pass definition
@@ -224,21 +262,24 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   b.setInsertionPoint(outerFor);
 
   // ------------------------------------------------------------------
-  // 4. Seed tsMap: each ts-typed function argument is a Handle.
+  // 4. Build outer LowerHelper; seed each ts-typed function argument into tsMap.
   // ------------------------------------------------------------------
-  TsMap tsMap;
+  LowerHelper outer;
+  outer.zeroOffsetI32 = zeroOffsetI32;
   unsigned numOrigArgs = oldFT.getNumInputs();
   for (unsigned i = 0; i < numOrigArgs; ++i) {
     Value arg = entry.getArgument(i);
     if (isa<TsType>(arg.getType()))
-      tsMap[arg] = {TsKind::Handle, arg};
+      outer.tsMap[arg] = arg;
   }
 
   // ------------------------------------------------------------------
   // 5. Lower original ops in definition order.
   //
-  //    lowerBlock handles binary/unary/reduce ops.  windowed_output,
-  //    for_each_back_window, and func.return are handled by the callback.
+  //    LowerHelper::lowerBlock handles binary/unary/reduce/select/back_ref ops.
+  //    windowed_output, for_each_back_window, fast_windowed_sum, and
+  //    func.return are outer-scope only and are handled by the callback
+  //    below.
   // ------------------------------------------------------------------
   auto outerHandler = [&](Operation &op) -> LogicalResult {
     if (isa<kunir::ReturnOp>(op)) return success(); // handled in step 7
@@ -249,9 +290,9 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     //                   fill circular buffer at each time step inside.
     if (auto woOp = dyn_cast<WindowedOutputOp>(op)) {
       auto wt = b.create<WindowedTempOp>(ol, woOp.getResult().getType());
-      tsMap[woOp.getResult()] = {TsKind::Handle, wt.getResult()};
-      Value inputScalar =
-          getScalar(woOp.getInput(), tsMap, zeroOffsetI32, fb, ol);
+      outer.tsMap[woOp.getResult()] = wt.getResult();
+      KUN_ASSIGN_OR_FAIL(Value inputScalar,
+                         outer.getScalar(woOp.getInput(), fb, ol));
       fb.create<TsPutOp>(ol, wt.getResult(), inputScalar);
       return success();
     }
@@ -262,19 +303,14 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       Block &body = fwOp.getBody().front();
       auto yieldOp = llvm::cast<YieldOp>(body.getTerminator());
 
-      // Resolve inputs to ts handles.
-      SmallVector<Value> inputHandles(fwOp.getInputs().size());
-      for (auto [i, inp] : llvm::enumerate(fwOp.getInputs())) {
-        auto it = tsMap.find(inp);
-        if (it == tsMap.end() || it->second.kind != TsKind::Handle) {
+      // Verify all inputs are ts handles (already in outer.tsMap).
+      for (Value inp : fwOp.getInputs()) {
+        if (!outer.tsMap.count(inp))
           return op.emitError("kunir-to-kungpu: for_each_back_window input "
                               "must be a ts handle");
-        }
-        inputHandles[i] = it->second.value;
       }
 
       // Each yield operand must come from a reduce_* op — collect init values.
-      SmallVector<ReduceArithInterface> reduces;
       SmallVector<Value> initVals;
       for (Value yv : yieldOp.getValues()) {
         auto *defOp = yv.getDefiningOp();
@@ -287,7 +323,6 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
         auto elemTy = llvm::cast<FloatType>(
             llvm::cast<TsType>(defOp->getOperand(0).getType()).getElementType());
         initVals.push_back(fb.create<arith::ConstantOp>(ol, ri.getInitValue(elemTy)));
-        reduces.push_back(ri);
       }
 
       // Create inner scf.for %w = 0 to window step 1 iter_args(acc_i = init_i).
@@ -306,18 +341,37 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
             // to window-1 reads oldest-to-newest, i.e. offset = window-1-w.
             Value w_i32 =
                 ib.create<arith::IndexCastOp>(il, ib.getI32Type(), w);
-            Value offsetI32 =
+            Value windowedOffset =
                 ib.create<arith::SubIOp>(il, wM1_i32, w_i32);
 
-            // Seed innerTsMap: block args as handles; reduce results as acc.
-            TsMap innerTsMap;
-            for (auto [i, arg] : llvm::enumerate(body.getArguments()))
-              innerTsMap[arg] = {TsKind::Handle, inputHandles[i]};
+            // Inner LowerHelper inherits the outer tsMap/scalarMap so reads
+            // inside the body can still reach outer-scope handles (e.g. a
+            // back_ref placed in the body) and outer-scope scalars, and
+            // shares the function-scope zero-offset constant.
+            //
+            // Pre-loads, written directly into inner.scalarMap (bypassing
+            // the offset-0 cache since these reads are at non-zero offsets):
+            //   - Each block arg = its corresponding ts input loaded at the
+            //     windowed offset (via getScalarUncached).
+            //   - Each reduce result = the matching iter_arg accumulator.
+            //
+            // After this setup no windowed-offset reads remain; lowerBlock uses
+            // offset 0 + scalarMap (a back_ref emits its own constant offset).
+            LowerHelper inner{outer.tsMap, outer.scalarMap, outer.zeroOffsetI32};
+            for (auto [i, arg] : llvm::enumerate(body.getArguments())) {
+              auto r = inner.getScalarUncached(fwOp.getInputs()[i],
+                                                windowedOffset, ib, il);
+              if (failed(r)) {
+                innerOk = false;
+                ib.create<scf::YieldOp>(il, initVals);
+                return;
+              }
+              inner.scalarMap[arg] = *r;
+            }
             for (auto [i, yv] : llvm::enumerate(yieldOp.getValues()))
-              innerTsMap[yv.getDefiningOp()->getResult(0)] = {TsKind::Scalar,
-                                                              iterArgs[i]};
+              inner.scalarMap[yv] = iterArgs[i];
 
-            if (failed(lowerBlock(body, innerTsMap, offsetI32, ib, il))) {
+            if (failed(inner.lowerBlock(body, ib))) {
               innerOk = false;
               ib.create<scf::YieldOp>(il, initVals); // keep IR structurally valid
               return;
@@ -325,30 +379,15 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
 
             SmallVector<Value> newAccs;
             for (Value yv : yieldOp.getValues())
-              newAccs.push_back(innerTsMap.find(yv)->second.value);
+              newAccs.push_back(inner.scalarMap.find(yv)->second);
             ib.create<scf::YieldOp>(il, newAccs);
           });
       if (!innerOk) return failure();
 
-      // Map for_each_back_window results to the inner for's results.
+      // Map for_each_back_window results (scalar reduce accs) to the inner
+      // for's results.
       for (auto [i, res] : llvm::enumerate(fwOp.getResults()))
-        tsMap[res] = {TsKind::Scalar, innerFor.getResult(i)};
-      return success();
-    }
-
-    // back_ref → ts.get(handle, offset = window).  Stateless, so we can
-    // fully lower here (the op does not survive into the kungpu IR).
-    if (auto br = dyn_cast<BackRefOp>(op)) {
-      auto inputTs = llvm::cast<TsType>(br.getInput().getType());
-      auto inputIt = tsMap.find(br.getInput());
-      if (inputIt == tsMap.end() || inputIt->second.kind != TsKind::Handle)
-        return op.emitError(
-            "kunir-to-kungpu: back_ref input must be a ts handle");
-      Value offset = fb.create<arith::ConstantOp>(
-          ol, fb.getI32Type(), fb.getI32IntegerAttr(br.getWindow()));
-      Value scalar = fb.create<TsGetOp>(ol, inputTs.getElementType(),
-                                          inputIt->second.value, offset);
-      tsMap[br.getResult()] = {TsKind::Scalar, scalar};
+        outer.scalarMap[res] = innerFor.getResult(i);
       return success();
     }
 
@@ -357,30 +396,31 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     // (per-thread state allocas + the Kahan-corrected step).
     if (auto fws = dyn_cast<FastWindowedSumOp>(op)) {
       auto inputTs = llvm::cast<TsType>(fws.getInput().getType());
-      auto inputIt = tsMap.find(fws.getInput());
-      if (inputIt == tsMap.end() || inputIt->second.kind != TsKind::Handle)
+      auto inputIt = outer.tsMap.find(fws.getInput());
+      if (inputIt == outer.tsMap.end())
         return op.emitError(
             "kunir-to-kungpu: fast_windowed_sum input must be a ts handle");
       auto newOp = fb.create<FastWindowedSumOp>(
           ol, /*resultType=*/inputTs.getElementType(),
-          /*input=*/inputIt->second.value, fws.getWindowAttr());
-      tsMap[fws.getResult()] = {TsKind::Scalar, newOp.getResult()};
+          /*input=*/inputIt->second, fws.getWindowAttr());
+      outer.scalarMap[fws.getResult()] = newOp.getResult();
       return success();
     }
 
     return op.emitError("kunir-to-kungpu: unhandled op in outer block");
   };
 
-  if (failed(lowerBlock(origOps, tsMap, zeroOffsetI32, fb, loc, outerHandler)))
+  if (failed(outer.lowerBlock(origOps, fb, outerHandler)))
     return signalPassFailure();
 
   // ------------------------------------------------------------------
   // 6. Emit ts.put for each ts return value, then close the outer for.
   // ------------------------------------------------------------------
   for (auto [outParam, rv] : llvm::zip(outParams, tsRetVals)) {
-    auto it = tsMap.find(rv);
-    assert(it != tsMap.end() && it->second.kind == TsKind::Scalar);
-    fb.create<TsPutOp>(loc, outParam, it->second.value);
+    auto it = outer.scalarMap.find(rv);
+    assert(it != outer.scalarMap.end() &&
+           "ts return value not materialised as a scalar");
+    fb.create<TsPutOp>(loc, outParam, it->second);
   }
   fb.create<scf::YieldOp>(loc);
 
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index 64dc9f3..b27bdf2 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -26,8 +26,11 @@
 
 import numpy as np
 
-from KunQuant.Op import Builder, Input, Output
-from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum
+from KunQuant.Op import (
+    Builder, Input, Output,
+    WindowedTempOutput, ForeachBackWindow, IterValue,
+)
+from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum, ReduceMax
 from KunQuant.ops.ElewiseOp import (
     GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
     And, Or, Not, Select,
@@ -35,7 +38,7 @@
 from KunQuant.ops.MiscOp import BackRef, FastWindowedSum
 from KunQuant.Stage import Function
 from KunQuant.jit import KunMLIR
-from KunQuant.jit.cuda import compileit, CudaCompilerConfig, to_mlir
+from KunQuant.jit.cuda import compileit, CudaCompilerConfig
 
 
 def build_func_elemwise() -> Function:
@@ -62,13 +65,31 @@ def build_func_libdevice() -> Function:
 
 
 def build_func_windowed(N: int) -> Function:
-    """ws = WindowedSum(a + b, N)"""
+    """Two outputs over c = a + b:
+       ws        = WindowedSum(c, N)
+       ws_maxabs = max_{k in [0..N-1]} |c[t-k] - c[t]|
+
+    `ws_maxabs` is a hand-built ForeachBackWindow whose body reads BOTH:
+      - the block-arg  (= c[t-k], the iter value)
+      - the outer ts c (= c[t], current time step)
+    and reduces |·| via ReduceMax.  This exercises the kunir-to-kungpu
+    inner-scope inheritance of the outer scalarMap/tsMap: `c` is computed
+    outside the loop but used inside.
+    """
     builder = Builder()
     with builder:
         a = Input("a")
         bin_ = Input("b")
-        s = WindowedSum(Add(a, bin_), N)
-        Output(s, "ws")
+        c = Add(a, bin_)
+        Output(WindowedSum(c, N), "ws")
+
+        wtemp = WindowedTempOutput(c, N)
+        loop  = ForeachBackWindow(wtemp, N)
+        builder.set_loop(loop)
+        diff = Sub(IterValue(loop, wtemp), c)
+        a_diff = Abs(diff)
+        builder.set_loop(None)
+        Output(ReduceMax(a_diff), "ws_maxabs")
     return Function(builder.ops, name="windowed_kernel")
 
 
@@ -176,10 +197,6 @@ def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
     f = build_fn()
     cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    mod = to_mlir(build_fn(), cfg)
-    print("--- mlir ---")
-    print(mod.to_string())
-
     exe = compileit(f, cfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
@@ -226,10 +243,6 @@ def run_backref(target: str, T: int, S: int, N: int) -> int:
     f = build_func_backref(N)
     cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    mod = to_mlir(build_func_backref(N), cfg)
-    print("--- mlir ---")
-    print(mod.to_string())
-
     exe = compileit(f, cfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
@@ -262,10 +275,6 @@ def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
     f = build_func_fastwindowedsum(N)
     cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    mod = to_mlir(build_func_fastwindowedsum(N), cfg)
-    print("--- mlir ---")
-    print(mod.to_string())
-
     exe = compileit(f, cfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
@@ -305,10 +314,6 @@ def run_multipartition(target: str, T: int, S: int) -> int:
     cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4,
                               partition_factor=1)
 
-    mod = to_mlir(build_func_multipartition(), cfg)
-    print("--- mlir ---")
-    print(mod.to_string())
-
     exe = compileit(f, cfg)
     print(f"  kernel_names           = {exe.kernel_names}")
     print(f"  num_kernels            = {exe.num_kernels}")
@@ -410,40 +415,86 @@ def ref(cond: np.ndarray) -> np.ndarray:
     return rc
 
 
-def run_windowed(target: str, T: int, S: int, N: int) -> int:
-    print(f"=== windowed: ws = WindowedSum(a + b, N={N}) ===")
+def build_windowed(target: str, N: int):
+    """Compile `build_func_windowed(N)` once.  The returned executable
+    can be reused across multiple `test_windowed` invocations with
+    different T / S / mask (anything that doesn't change the graph
+    topology or window size N)."""
     f = build_func_windowed(N)
     cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
-
-    mod = to_mlir(build_func_windowed(N), cfg)
-    print("--- mlir ---")
-    print(mod.to_string())
-
     exe = compileit(f, cfg)
-    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+    print(f"  [build windowed N={N}] kernels={exe.kernel_names}  "
+           f"num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+    return exe
+
+
+def test_windowed(exe, T: int, S: int, N: int, mask: int = 0) -> int:
+    """Correctness check against numpy for the two outputs of
+    `build_func_windowed`:
+       ws        = WindowedSum(c, N)             — stateful fast_windowed_sum
+       ws_maxabs = max_k |c[t-k] - c[t]|         — hand-built ForeachBackWindow
+                                                    body that reads BOTH the
+                                                    block-arg (c[t-k]) AND the
+                                                    outer ts c (c[t]).
+       (c = a + b, k in [0..N-1])
+
+    With `mask > 0` the output time dim shrinks by `mask` and the kernel
+    runs with that mask — exercises the multi-chunk + mask path
+    (chunk-local `t - loop_lb >= window` guard) for both outputs.
+
+    `exe` must have been compiled with the matching `N`.
+    """
+    assert 0 <= mask < T
+    mask_tag = f", mask={mask}" if mask else ""
+    print(f"=== windowed: ws = WindowedSum(a + b, N={N}){mask_tag}; "
+           f"ws_maxabs = max_k |c[t-k] - c[t]|  (c = a+b) ===")
 
     import cupy as cp
     rng = np.random.default_rng(1)
     a_h = rng.standard_normal((T, S), dtype=np.float32)
     b_h = rng.standard_normal((T, S), dtype=np.float32)
-    out = cp.zeros((T, S), dtype=cp.float32)
+    out_T      = T - mask
+    ws_out     = cp.zeros((out_T, S), dtype=cp.float32)
+    maxabs_out = cp.zeros((out_T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h),
-                              "b": cp.asarray(b_h), "ws": out})
-    out_h = cp.asnumpy(out)
-
+    inputs = {"a": cp.asarray(a_h), "b": cp.asarray(b_h),
+              "ws": ws_out, "ws_maxabs": maxabs_out}
+    if mask:
+        executor.runGraph(exe, inputs, mask=mask)
+    else:
+        executor.runGraph(exe, inputs)
+    ws_h     = cp.asnumpy(ws_out)
+    maxabs_h = cp.asnumpy(maxabs_out)
+
+    # Build full-T references, then slice from `mask` onward (no-op when
+    # mask == 0).  Output row i ↔ input time i+mask; reliable when
+    # i + mask >= N - 1.
     c = a_h + b_h
     cumsum = np.cumsum(c, axis=0, dtype=np.float64)
-    expected = np.empty((T, S), dtype=np.float32)
-    expected[:N - 1] = np.nan
-    expected[N - 1] = cumsum[N - 1]
+    ws_full = np.empty((T, S), dtype=np.float32)
+    ws_full[:N - 1] = np.nan
+    ws_full[N - 1] = cumsum[N - 1]
     if T > N:
-        expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+        ws_full[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+    ws_expected = ws_full[mask:]
 
-    return _compare_post_warmup(out_h, expected, valid_start=N - 1,
+    maxabs_full = np.empty((T, S), dtype=np.float32)
+    maxabs_full[:N - 1] = np.nan
+    for t in range(N - 1, T):
+        window = c[t - N + 1 : t + 1]                     # (N, S)
+        maxabs_full[t] = np.max(np.abs(window - c[t]), axis=0)
+    maxabs_expected = maxabs_full[mask:]
+
+    valid_start = max(0, N - 1 - mask)
+    rc = 0
+    rc |= _compare_post_warmup(ws_h, ws_expected,
+                                  valid_start=valid_start,
                                   atol=max(1e-3, 5e-7 * N))
+    rc |= _compare_post_warmup(maxabs_h, maxabs_expected,
+                                  valid_start=valid_start, atol=1e-5)
+    return rc
 
 
 def run_backref_with_mask(target: str, T: int, S: int, N: int,
@@ -493,54 +544,6 @@ def run_backref_with_mask(target: str, T: int, S: int, N: int,
                                   valid_start=valid_start, atol=1e-5)
 
 
-def run_windowed_with_mask(target: str, T: int, S: int, N: int,
-                              mask: int) -> int:
-    """`WindowedSum(a + b, N)` driven with mask — same graph as
-    `run_windowed`, but exercises the stateful `fast_windowed_sum`
-    lowering across multi-chunk + mask.  After the chunk-local guard
-    fix (`t - loop_lb ≥ window`), each chunk's per-CTA state primes
-    correctly through its warmup overlap and the post-warmup tail
-    matches the CPU reference at float-precision noise.
-    """
-    print(f"=== windowed + mask: ws = WindowedSum(a + b, N={N}), "
-           f"mask={mask} ===")
-    assert 0 < mask < T, "test requires 0 < mask < T"
-    f = build_func_windowed(N)
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
-
-    exe = compileit(f, cfg)
-    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
-           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
-
-    import cupy as cp
-    rng = np.random.default_rng(5)
-    a_h = rng.standard_normal((T, S), dtype=np.float32)
-    b_h = rng.standard_normal((T, S), dtype=np.float32)
-    out = cp.zeros((T - mask, S), dtype=cp.float32)
-
-    executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h),
-                              "b": cp.asarray(b_h), "ws": out},
-                       mask=mask)
-    out_h = cp.asnumpy(out)
-
-    # Full-T reference, then slice from `mask` onward to align with
-    # the output's input-time origin.  Output row i = input time i+mask;
-    # reliable when i + mask ≥ N - 1.
-    c = a_h + b_h
-    cumsum = np.cumsum(c, axis=0, dtype=np.float64)
-    expected_full = np.empty((T, S), dtype=np.float32)
-    expected_full[:N - 1] = np.nan
-    expected_full[N - 1] = cumsum[N - 1]
-    if T > N:
-        expected_full[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
-    expected = expected_full[mask:]
-    valid_start = max(0, N - 1 - mask)
-    return _compare_post_warmup(out_h, expected,
-                                  valid_start=valid_start,
-                                  atol=max(1e-3, 5e-7 * N))
-
-
 def main() -> int:
     ap = argparse.ArgumentParser()
     ap.add_argument("--target", default="sm_120")
@@ -562,7 +565,12 @@ def main() -> int:
     print()
     rc |= run_libdevice(args.target, args.time_length, args.num_stocks)
     print()
-    rc |= run_windowed(args.target, args.time_length, args.num_stocks, args.window)
+    # Build once for N=args.window, reuse across the mask=0 and mask=3
+    # validations (graph topology + window size are the same; only T/S/mask
+    # differ at run time).
+    windowed_exe = build_windowed(args.target, args.window)
+    rc |= test_windowed(windowed_exe, args.time_length, args.num_stocks,
+                          args.window)
     print()
     rc |= run_backref(args.target, args.time_length, args.num_stocks, args.window)
     print()
@@ -574,8 +582,8 @@ def main() -> int:
     rc |= run_backref_with_mask(args.target, args.time_length, args.num_stocks,
                                   args.window, mask=3)
     print()
-    rc |= run_windowed_with_mask(args.target, args.time_length, args.num_stocks,
-                                    args.window, mask=3)
+    rc |= test_windowed(windowed_exe, args.time_length, args.num_stocks,
+                          args.window, mask=3)
     print()
     rc |= run_fastwindowedsum(args.target, args.time_length, args.num_stocks,
                                 args.window)
@@ -584,9 +592,11 @@ def main() -> int:
     # that `cap_warmup = T/(K*N) = 64/(4*20) = 0` clamps num_chunks to 1.
     # Exercises the multi-chunk kernel binary in its degenerate
     # grid_y=1 launch configuration — guards against regressions in
-    # time_lb / time_ub / write-gating when `chunk_size = T`.
-    rc |= run_windowed_with_mask(args.target, T=64, S=args.num_stocks,
-                                    N=20, mask=1)
+    # time_lb / time_ub / write-gating when `chunk_size = T`.  Different
+    # N → fresh build.
+    windowed_exe_n20 = build_windowed(args.target, N=20)
+    rc |= test_windowed(windowed_exe_n20, T=64, S=args.num_stocks,
+                          N=20, mask=1)
     print()
     rc |= run_multipartition(args.target, args.time_length, args.num_stocks)
     print()

From 228b0624a0cfad9588d5a04e765a88140048feb0 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Wed, 13 May 2026 23:44:23 -0700
Subject: [PATCH 25/59] Add constant op

---
 KunQuant/passes/CodegenMLIR.py       | 51 ++++++++++++++++++++--------
 mlir/include/KunIr/KunIrOps.td       | 17 ++++++++++
 mlir/lib/KunIr/KunIrOps.cpp          | 13 +++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp     | 13 +++++++
 mlir/lib/Python/IRBuilder.cpp        | 11 ++++++
 mlir/test/python/test_kun_to_cuda.py |  8 ++---
 6 files changed, 93 insertions(+), 20 deletions(-)

diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 77f8a59..384da50 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -18,11 +18,17 @@
 """
 
 from __future__ import annotations
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    # KunMLIR is a compiled extension built alongside the MLIR support,
+    # only imported here for type checking — no runtime dependency added
+    # to the codegen path itself.
+    from KunQuant.jit import KunMLIR
 
 from KunQuant.Op import (
     OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
-    ReductionOp, SimpleCrossSectionalOp,
+    ReductionOp, SimpleCrossSectionalOp, ConstantOp,
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Sign,
@@ -109,8 +115,13 @@ def _index_loop_members(f: Function) -> Tuple[
     return body_ops, reductions
 
 
-def _emit_simple(op: OpBase, ir, val_map: Dict[OpBase, object]):
-    """Emit a non-control-flow op via IRBuilder dispatch."""
+def _emit_simple(op: OpBase,
+                  ir: KunMLIR.IRBuilder,
+                  val_map: Dict[OpBase, KunMLIR.Value],
+                  ts_1: KunMLIR.Type) -> KunMLIR.Value:
+    """Emit a non-control-flow op via IRBuilder dispatch.  `ts_1` is the
+    kunir ts type with maxLookback=1, used by ops whose result has no
+    input to infer the element type from (currently only ConstantOp)."""
     cls = type(op)
     if cls in _BINARY:
         getattr(ir, _BINARY[cls])
@@ -130,12 +141,18 @@ def _emit_simple(op: OpBase, ir, val_map: Dict[OpBase, object]):
         return ir.select(val_map[op.inputs[0]],
                           val_map[op.inputs[1]],
                           val_map[op.inputs[2]])
+    if isinstance(op, ConstantOp):
+        v = op.attrs["value"]
+        fv = float("nan") if v == "nan" else float(v)
+        return ir.constant(fv, ts_1)
     raise NotImplementedError(
         f"CodegenMLIR: op type {cls.__name__} is not supported by the "
         f"GPU backend yet (op = {op})")
 
 
-def _emit_reduction(op: ReductionOp, ir, val_map: Dict[OpBase, object]):
+def _emit_reduction(op: ReductionOp,
+                     ir: KunMLIR.IRBuilder,
+                     val_map: Dict[OpBase, KunMLIR.Value]) -> KunMLIR.Value:
     cls = type(op)
     if cls not in _REDUCE:
         raise NotImplementedError(
@@ -150,7 +167,7 @@ def _emit_reduction(op: ReductionOp, ir, val_map: Dict[OpBase, object]):
 
 # ── Main entry point ────────────────────────────────────────────────
 
-def _maybe_external_partition(f: Function, dtype: str):
+def _maybe_external_partition(f: Function, dtype: str) -> Optional[dict]:
     """If `f` is a partition the GPU runtime handles as a pre-compiled
     external kernel (bundled PTX loaded as a separate CUmodule), return
     a descriptor dict that KunMLIR.compile() should append to the
@@ -192,8 +209,10 @@ def _maybe_external_partition(f: Function, dtype: str):
     }
 
 
-def translate_function(f: Function, target: TargetSpec, ir,
-                        dtype: str = "f32", unreliable_count: int = 0):
+def translate_function(f: Function, target: TargetSpec,
+                        ir: KunMLIR.IRBuilder,
+                        dtype: str = "f32",
+                        unreliable_count: int = 0) -> Optional[dict]:
     """Emit `f` as a single kunir.func into the open `ir` (KunMLIR.IRBuilder).
 
     If `f` is an externally-dispatched partition (e.g. a single cs_rank
@@ -242,7 +261,7 @@ def translate_function(f: Function, target: TargetSpec, ir,
         result_types=[ts_1] * len(outputs),
     )
 
-    val_map: Dict[OpBase, object] = {}
+    val_map: Dict[OpBase, KunMLIR.Value] = {}
     emitted = set()
     for inp, val in zip(inputs, func_args):
         val_map[inp] = val
@@ -270,7 +289,7 @@ def translate_function(f: Function, target: TargetSpec, ir,
             raise RuntimeError(
                 f"CodegenMLIR: reduction/body op visited before its "
                 f"enclosing loop ({op})")
-        val_map[op] = _emit_simple(op, ir, val_map)
+        val_map[op] = _emit_simple(op, ir, val_map, ts_1)
         emitted.add(op)
 
     # 5.  Close the function with Outputs in declared order.
@@ -279,9 +298,13 @@ def translate_function(f: Function, target: TargetSpec, ir,
     return None
 
 
-def _emit_loop(loop: ForeachBackWindow, ir, val_map, ts_1,
-                body_ops: List[OpBase], reductions: List[ReductionOp],
-                emitted: set):
+def _emit_loop(loop: ForeachBackWindow,
+                ir: KunMLIR.IRBuilder,
+                val_map: Dict[OpBase, KunMLIR.Value],
+                ts_1: KunMLIR.Type,
+                body_ops: List[OpBase],
+                reductions: List[ReductionOp],
+                emitted: set) -> None:
     loop_input_vals = [val_map[i] for i in loop.inputs]
     n_results = len(reductions)
     if n_results == 0:
@@ -304,7 +327,7 @@ def _emit_loop(loop: ForeachBackWindow, ir, val_map, ts_1,
         if isinstance(body_op, IterValue):
             val_map[body_op] = block_arg_by_src[body_op.inputs[1]]
         else:
-            val_map[body_op] = _emit_simple(body_op, ir, val_map)
+            val_map[body_op] = _emit_simple(body_op, ir, val_map, ts_1)
         emitted.add(body_op)
 
     # Reductions accumulate yield values, in topo order.
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index b03d937..2f6cd30 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -162,6 +162,23 @@ def KunIr_SelectOp : KunIr_Op<"select", [
     "type($true_value) `,` type($false_value) attr-dict";
 }
 
+//===----------------------------------------------------------------------===//
+// Constant op
+//
+// A scalar value lifted into a ts<T, 1> at every time step.  The value
+// attribute is f64; kunir-to-kungpu converts it to an arith.constant of the
+// result's float element type (f32 / f64; integer / i1 is not lowered yet).
+// Pass NaN by storing `0x7FF8000000000000` (quiet-NaN bits) in the f64 attr.
+//===----------------------------------------------------------------------===//
+
+def KunIr_ConstantOp : KunIr_Op<"constant", [Pure]> {
+  let summary = "Scalar constant broadcast to every time step";
+  let arguments = (ins F64Attr:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$value `:` type($result) attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // Cross-sectional ops
 //
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 0c3719e..63a0522 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -219,6 +219,19 @@ LogicalResult FastWindowedSumOp::verify() {
                                             getResult().getType());
 }
 
+//===----------------------------------------------------------------------===//
+// ConstantOp — result must be ts<T, 1>.  The value attr is f64; we don't
+// pre-check finiteness so that quiet-NaN (0x7FF8...) can flow through.
+//===----------------------------------------------------------------------===//
+
+LogicalResult ConstantOp::verify() {
+  auto resultTy = llvm::cast<TsType>(getResult().getType());
+  if (resultTy.getMaxLookback() != 1)
+    return emitOpError("result maxLookback must be 1, got ")
+           << resultTy.getMaxLookback();
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // ForEachBackWindowOp — verifier + custom assembly format
 //
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 0c5740f..42517eb 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -146,6 +146,19 @@ struct LowerHelper {
         KUN_ASSIGN_OR_FAIL(Value scalar,
             getScalarUncached(br.getInput(), offset, b, ol));
         scalarMap[br.getResult()] = scalar;
+      } else if (auto co = dyn_cast<ConstantOp>(op)) {
+        auto resTs = llvm::cast<TsType>(co.getResult().getType());
+        Type elemTy = resTs.getElementType();
+        // The op carries an f64 attribute; convert to the element type
+        // (f32 / f64) so arith.constant gets a type-matching attribute.
+        llvm::APFloat apv(co.getValue());
+        if (auto ft = llvm::dyn_cast<FloatType>(elemTy)) {
+          bool losesInfo = false;
+          apv.convert(ft.getFloatSemantics(),
+                      llvm::APFloat::rmNearestTiesToEven, &losesInfo);
+        }
+        scalarMap[co.getResult()] = b.create<arith::ConstantOp>(
+            ol, elemTy, b.getFloatAttr(elemTy, apv));
       } else if (handleUnknown) {
         if (failed(handleUnknown(*op))) return failure();
       } else {
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 570382e..08efd51 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -165,6 +165,12 @@ class IRBuilder {
     return b_.create<kunir::SelectOp>(b_.getUnknownLoc(), cond, tv, fv);
   }
 
+  // ── Scalar constant lifted to ts<T, 1> ─────────────────────────
+  Value constantOp(double value, Type tsTy) {
+    auto attr = b_.getF64FloatAttr(value);
+    return b_.create<kunir::ConstantOp>(b_.getUnknownLoc(), tsTy, attr);
+  }
+
   // ── Windowed buffer materialization ───────────────────────────────
   Value windowedOutputOp(Value x, int64_t length) {
     auto inTs = llvm::cast<kunir::TsType>(x.getType());
@@ -352,6 +358,11 @@ void registerIRBuilder(nb::module_ &m) {
       .def("not_",   &IRBuilder::notOp,    nb::arg("x"))
 
       // Select: cond ? true_value : false_value
+      .def("constant", &IRBuilder::constantOp,
+            nb::arg("value"), nb::arg("type"),
+            "Build a kunir.constant of element-type matching `type` (a "
+            "ts<T, 1>).  Pass float('nan') for NaN.")
+
       .def("select", &IRBuilder::selectOp,
             nb::arg("cond"), nb::arg("true_value"), nb::arg("false_value"))
 
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index b27bdf2..7aa2930 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -27,7 +27,7 @@
 import numpy as np
 
 from KunQuant.Op import (
-    Builder, Input, Output,
+    Builder, Input, Output, ConstantOp,
     WindowedTempOutput, ForeachBackWindow, IterValue,
 )
 from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum, ReduceMax
@@ -127,16 +127,12 @@ def build_func_cmp_logical() -> Function:
       and_out = (a > 0)  & (b > 0)  ? a : b # gt + and
       or_out  = (a > 0)  | (b > 0)  ? a : b # gt + or
       not_out = !(a > b) ? a : b            # = (a <= b) ? a : b
-
-    Constants 0 are produced via `Sub(a, a)`-style identities to stay
-    inside the ops supported by CodegenMLIR (no ConstantOp on the GPU
-    path yet — and we don't need one for this test).
     """
     builder = Builder()
     with builder:
         a = Input("a")
         bin_ = Input("b")
-        zero = Sub(a, a)  # = 0 elementwise (avoids ConstantOp dependency)
+        zero = ConstantOp(0)
         Output(Select(GreaterThan(a, bin_), a, bin_), "gt_out")
         Output(Select(LessThan(a, bin_),    a, bin_), "lt_out")
         Output(Select(GreaterEqual(a, bin_), a, bin_), "ge_out")

From aea24d39ce1fbffa8d142e98badad839cb0bd73a Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 14 May 2026 01:24:01 -0700
Subject: [PATCH 26/59] accumulator

---
 KunQuant/ops/MiscOp.py               |  8 ++-
 KunQuant/passes/CodegenMLIR.py       | 26 +++++++-
 mlir/include/KunGpu/KunGpuOps.h      |  1 +
 mlir/include/KunGpu/KunGpuOps.td     | 17 ++++-
 mlir/include/KunIr/KunIrOps.td       | 41 ++++++++++++
 mlir/lib/KunGpu/KunGpuOps.cpp        | 11 ++--
 mlir/lib/KunGpu/KunGpuToLLVM.cpp     | 85 ++++++++++++++++++++++--
 mlir/lib/KunIr/KunIrOps.cpp          | 34 ++++++++++
 mlir/lib/KunIr/KunIrToKunGpu.cpp     | 99 ++++++++++++++++++++++++++--
 mlir/lib/Python/IRBuilder.cpp        | 18 +++++
 mlir/test/python/test_kun_to_cuda.py | 65 +++++++++++++++++-
 11 files changed, 384 insertions(+), 21 deletions(-)

diff --git a/KunQuant/ops/MiscOp.py b/KunQuant/ops/MiscOp.py
index 218d487..cff7bbe 100644
--- a/KunQuant/ops/MiscOp.py
+++ b/KunQuant/ops/MiscOp.py
@@ -72,7 +72,13 @@ def verify(self, func: 'KunQuant.Stage.Function') -> None:
     
 class ReturnFirstValue(OpBase):
     '''
-    Return the first value of the input. It is used keep the dependency of the input op, like SetAccumulator.
+    Return inputs[0] as this op's value; the remaining inputs are kept
+    only as dependencies (graph-level keep-alives).
+
+    KunQuant's Python IR is a graph IR — an op with no users is dropped
+    during topo sort / GC.  SetAccumulator is side-effecting but produces
+    no consumer-visible value, so attaching it as inputs[1:] of
+    ReturnFirstValue is how we keep it reachable from a graph output.
     '''
     def __init__(self, v: List[OpBase]) -> None:
         super().__init__(v, [])
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 384da50..39e049c 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -38,7 +38,10 @@
 from KunQuant.ops.ReduceOp import (
     ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
 )
-from KunQuant.ops.MiscOp import BackRef, FastWindowedSum
+from KunQuant.ops.MiscOp import (
+    BackRef, FastWindowedSum,
+    Accumulator, SetAccumulator, ReturnFirstValue,
+)
 from KunQuant.Stage import Function
 
 
@@ -145,6 +148,27 @@ def _emit_simple(op: OpBase,
         v = op.attrs["value"]
         fv = float("nan") if v == "nan" else float(v)
         return ir.constant(fv, ts_1)
+    if isinstance(op, Accumulator):
+        # The Python op's `inputs[0]` is a keep-alive in the graph IR;
+        # it does NOT feed the slot.  Only the `name` attr matters at
+        # the MLIR level — same-name accumulators CSE to one slot.
+        return ir.accumulator(op.attrs["name"], ts_1)
+    if isinstance(op, SetAccumulator):
+        # Side-effect: returns no SSA value.  ReturnFirstValue is what
+        # keeps this op alive in the Python graph (see MiscOp.py).
+        ir.set_accumulator(val_map[op.inputs[0]],
+                            val_map[op.inputs[1]],
+                            val_map[op.inputs[2]])
+        return None
+    if isinstance(op, ReturnFirstValue):
+        # In the Python graph IR, ReturnFirstValue's only job is to keep
+        # side-effecting siblings (SetAccumulator etc.) reachable from a
+        # graph output so the GC does not drop them.  In SSA-MLIR the
+        # side-effect ops are preserved because they are not marked Pure;
+        # ReturnFirstValue carries no new MLIR-level meaning, so we just
+        # forward the first input's Value.  Other inputs were already
+        # emitted in topo order before we got here.
+        return val_map[op.inputs[0]]
     raise NotImplementedError(
         f"CodegenMLIR: op type {cls.__name__} is not supported by the "
         f"GPU backend yet (op = {op})")
diff --git a/mlir/include/KunGpu/KunGpuOps.h b/mlir/include/KunGpu/KunGpuOps.h
index 9d4893d..27eafe5 100644
--- a/mlir/include/KunGpu/KunGpuOps.h
+++ b/mlir/include/KunGpu/KunGpuOps.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
diff --git a/mlir/include/KunGpu/KunGpuOps.td b/mlir/include/KunGpu/KunGpuOps.td
index f81ddab..0e522fc 100644
--- a/mlir/include/KunGpu/KunGpuOps.td
+++ b/mlir/include/KunGpu/KunGpuOps.td
@@ -92,7 +92,22 @@ def KunGpu_TimeUbOp : KunGpu_Op<"time_ub", [Pure]> {
 // (or `value` for put) must match the ts element type.
 //===----------------------------------------------------------------------===//
 
-def KunGpu_WindowedTempOp : KunGpu_Op<"windowed_temp", [Pure]> {
+def KunGpu_AccumulatorOp : KunGpu_Op<"accumulator", [Pure]> {
+  let summary = "Allocate a single-slot per-thread accumulator (alloca)";
+  let description = [{
+    Allocates a per-thread single-slot register backing a `kunir.accumulator`.
+    The result is a `ts<T, 1>` handle that ts.put / ts.get treat at offset 0
+    only — there is no time dimension and no circular indexing.  The slot is
+    zero-initialised at allocation time.
+
+    Pure with a `name` StrAttr — same-name accumulators CSE to one slot.
+  }];
+  let arguments = (ins StrAttr:$name);
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat = "$name `:` type($result) attr-dict";
+}
+
+def KunGpu_WindowedTempOp : KunGpu_Op<"windowed_temp", []> {
   let summary = "Allocate a per-thread windowed (circular) temporary buffer";
   let description = [{
     Allocates a thread-local circular buffer whose element type and window
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 2f6cd30..5fd73df 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -179,6 +179,47 @@ def KunIr_ConstantOp : KunIr_Op<"constant", [Pure]> {
   let assemblyFormat = "$value `:` type($result) attr-dict";
 }
 
+//===----------------------------------------------------------------------===//
+// Accumulator / SetAccumulator
+//
+// A scalar "register" that survives across time steps within one kernel
+// invocation.  Modeled as a ts<T, 1> handle so existing ts.get / ts.put
+// load and store its current value.  The handle storage is owned by a
+// single LLVM alloca after kungpu-to-llvm lowering.
+//
+// `kunir.accumulator` is Pure with a `name` StrAttr — two accumulators
+// sharing the same name CSE to a single handle (single backing slot).
+//
+// `kunir.set_accumulator` is NOT Pure: writing to the slot is a side
+// effect and must not be CSE'd or hoisted past dependent reads.  When the
+// scalar `mask` is true at the current time step the accumulator is
+// overwritten with `value`; otherwise it retains the previous value.
+//===----------------------------------------------------------------------===//
+
+def KunIr_AccumulatorOp : KunIr_Op<"accumulator", [Pure]> {
+  let summary = "Stateful single-slot scalar register (read via ts.get @0)";
+  let arguments = (ins StrAttr:$name);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$name `:` type($result) attr-dict";
+}
+
+def KunIr_SetAccumulatorOp : KunIr_Op<"set_accumulator"> {
+  let summary = "Conditionally overwrite an accumulator slot";
+  let description = [{
+    `acc` must be the result of a `kunir.accumulator`.  When the scalar
+    `mask` is true at the current time step, stores `value` into the
+    accumulator slot; otherwise the slot is unchanged.  Side-effecting
+    (NOT Pure): never CSE / dedup.
+  }];
+  let arguments = (ins KunIr_AnyTs:$acc,
+                       KunIr_AnyTs:$mask,
+                       KunIr_AnyTs:$value);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$acc `,` $mask `,` $value `:` type($acc) `,` type($mask) `,` type($value) attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // Cross-sectional ops
 //
diff --git a/mlir/lib/KunGpu/KunGpuOps.cpp b/mlir/lib/KunGpu/KunGpuOps.cpp
index 11b2ac0..ba08734 100644
--- a/mlir/lib/KunGpu/KunGpuOps.cpp
+++ b/mlir/lib/KunGpu/KunGpuOps.cpp
@@ -12,12 +12,13 @@ using namespace kungpu;
 #include "KunGpu/KunGpuOps.cpp.inc"
 
 // The `ts` operand of ts.get and ts.put must be a function argument (block
-// argument of an entry block) or the result of a windowed_temp op.
+// argument of an entry block), the result of a windowed_temp op, or the
+// result of an accumulator op.
 static bool isValidTsSource(Value v) {
   if (isa<BlockArgument>(v))
     return true;
   if (auto *def = v.getDefiningOp())
-    return isa<WindowedTempOp>(def);
+    return isa<WindowedTempOp, AccumulatorOp>(def);
   return false;
 }
 
@@ -33,7 +34,8 @@ LogicalResult TsGetOp::verify() {
            << "' must match ts element type '" << tsTy.getElementType() << "'";
   if (!isValidTsSource(getTs()))
     return emitOpError("ts operand must be a function argument or "
-                       "the result of 'kungpu.windowed_temp'");
+                       "the result of 'kungpu.windowed_temp' / "
+                       "'kungpu.accumulator'");
   return success();
 }
 
@@ -49,6 +51,7 @@ LogicalResult TsPutOp::verify() {
            << "' must match ts element type '" << tsTy.getElementType() << "'";
   if (!isValidTsSource(getTs()))
     return emitOpError("ts operand must be a function argument or "
-                       "the result of 'kungpu.windowed_temp'");
+                       "the result of 'kungpu.windowed_temp' / "
+                       "'kungpu.accumulator'");
   return success();
 }
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 94f84bc..1bb0cc8 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -73,15 +73,17 @@ namespace {
 
 // Per-windowed_temp side state.
 //   posPtr — i32 alloca holding the next-writable circular position.
+//            NULL means the entry is an accumulator (single slot, no
+//            circular wrap; ts.get / ts.put always touch slot 0).
 //   stride — slot stride in bytes-of-T units:
 //              1 for local (alloca buffer is per-thread)
 //              K for shared (slot-major across the K threads in a block);
 //                K = warps_per_cta * 32, captured as an i32 SSA value.
-// Keyed on the original windowed_temp result Value so the ts.get / ts.put
-// patterns can find it.
+// Keyed on the original windowed_temp / accumulator result Value so the
+// ts.get / ts.put patterns can find it.
 struct WTDesc {
-  Value posPtr;
-  int64_t stride; // 1 → no multiply at access time
+  Value posPtr;     // null → accumulator (no position counter)
+  int64_t stride;   // 1 → no multiply at access time
 };
 using WTDescMap = llvm::DenseMap<Value, WTDesc>;
 
@@ -513,6 +515,54 @@ struct WindowedTempPattern : OpConversionPattern<WindowedTempOp> {
   }
 };
 
+// kungpu.accumulator → single-slot alloca, zero-initialised.  Modeled in
+// descMap with a null posPtr so the ts.get / ts.put dispatch can recognise
+// it and emit a plain load/store at slot 0 (no circular wrap, no position
+// counter).  Reads MUST use offset = 0 — TsGetPattern rejects any other
+// (or non-constant) offset; TsPutPattern stores to slot 0 unconditionally.
+struct AccumulatorPattern : OpConversionPattern<kungpu::AccumulatorOp> {
+  WTDescMap &descMap;
+  AccumulatorPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m)
+      : OpConversionPattern(tc, ctx), descMap(m) {}
+
+  LogicalResult
+  matchAndRewrite(kungpu::AccumulatorOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx    = op.getContext();
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
+
+    auto tsTy   = llvm::cast<TsType>(op.getType());
+    Type elemTy = tsTy.getElementType();
+
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    if (!fn)
+      return rewriter.notifyMatchFailure(
+          op, "kungpu.accumulator must be inside a gpu.func");
+
+    // Alloca + zero-init at function entry so the slot is well-defined
+    // before the time loop begins.
+    Value bufPtr;
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      Block &entry = fn.getBody().front();
+      rewriter.setInsertionPointToStart(&entry);
+      Value c1_i32 = rewriter.create<LLVM::ConstantOp>(
+          loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      bufPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, elemTy, c1_i32);
+      Value zero = rewriter.create<LLVM::ConstantOp>(
+          loc, elemTy, rewriter.getZeroAttr(elemTy));
+      rewriter.create<LLVM::StoreOp>(loc, zero, bufPtr);
+    }
+
+    // posPtr = null → ts.get / ts.put treat as accumulator (slot 0 only).
+    descMap[op.getResult()] = {Value(), 1};
+    rewriter.replaceOp(op, bufPtr);
+    return success();
+  }
+};
+
 // Multiply an i32 index by a compile-time stride.  stride==1 is a no-op.
 static Value applyStride(OpBuilder &b, Location loc, Value idx, int64_t stride,
                           Type i32Ty) {
@@ -545,11 +595,24 @@ struct TsGetPattern : OpConversionPattern<TsGetOp> {
 
     auto it = descMap.find(op.getTs());
     if (it != descMap.end()) {
+      const WTDesc &desc = it->second;
+      // ── accumulator: single-slot load.  offset must be 0. ─────────
+      if (!desc.posPtr) {
+        int64_t offsetVal = -1;
+        if (auto a = offsetI32.getDefiningOp<arith::ConstantOp>())
+          offsetVal = llvm::cast<IntegerAttr>(a.getValue()).getInt();
+        else if (auto l = offsetI32.getDefiningOp<LLVM::ConstantOp>())
+          offsetVal = llvm::cast<IntegerAttr>(l.getValue()).getInt();
+        if (offsetVal != 0)
+          return rewriter.notifyMatchFailure(
+              op, "ts.get on accumulator must use offset = 0");
+        rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, tsPtr);
+        return success();
+      }
       // ── windowed_temp: circular get without modulo ────────────────
       //   adj = offset + 1                  (offset=0 → most-recent put)
       //   idx = pos >= adj ? pos - adj : pos + N - adj
       //   return buf[idx * stride]
-      const WTDesc &desc = it->second;
       int64_t N = static_cast<int64_t>(
           llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
       Value pos    = rewriter.create<LLVM::LoadOp>(loc, i32Ty, desc.posPtr);
@@ -612,10 +675,16 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
 
     auto it = descMap.find(op.getTs());
     if (it != descMap.end()) {
+      const WTDesc &desc = it->second;
+      // ── accumulator: single-slot store, no pos counter to advance. ─
+      if (!desc.posPtr) {
+        rewriter.create<LLVM::StoreOp>(loc, v, tsPtr);
+        rewriter.eraseOp(op);
+        return success();
+      }
       // ── windowed_temp: store at buf[pos*stride], then advance pos ─
       //   buf[pos * stride] = v
       //   pos = (pos + 1 >= N) ? 0 : pos + 1
-      const WTDesc &desc = it->second;
       int64_t N = static_cast<int64_t>(
           llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
       Value pos = rewriter.create<LLVM::LoadOp>(loc, i32Ty, desc.posPtr);
@@ -897,7 +966,8 @@ struct ConvertKunGpuToLLVMPass
     target.addLegalDialect<arith::ArithDialect, scf::SCFDialect,
                            LLVM::LLVMDialect, gpu::GPUDialect>();
     target.addLegalOp<ModuleOp, UnrealizedConversionCastOp>();
-    target.addIllegalOp<WindowedTempOp, TsGetOp, TsPutOp,
+    target.addIllegalOp<WindowedTempOp, kungpu::AccumulatorOp,
+                        TsGetOp, TsPutOp,
                         TimeLengthOp, TimeLbOp, TimeUbOp,
                         StockIdOp, BlockStockCountOp>();
     target.addIllegalOp<kunir::FastWindowedSumOp>();
@@ -920,6 +990,7 @@ struct ConvertKunGpuToLLVMPass
     patterns.add<TimeLengthPattern, TimeLbPattern, TimeUbPattern,
                   StockIdPattern, BlockStockCountPattern>(typeConv, ctx);
     patterns.add<WindowedTempPattern>(typeConv, ctx, descMap, smemCounter);
+    patterns.add<AccumulatorPattern>(typeConv, ctx, descMap);
     patterns.add<TsGetPattern>(typeConv, ctx, descMap);
     patterns.add<TsPutPattern>(typeConv, ctx, descMap, chunkCtx);
     patterns.add<FastWindowedSumPattern>(typeConv, ctx);
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 63a0522..62f955a 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -232,6 +232,40 @@ LogicalResult ConstantOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// AccumulatorOp / SetAccumulatorOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult AccumulatorOp::verify() {
+  auto resultTy = llvm::cast<TsType>(getResult().getType());
+  if (resultTy.getMaxLookback() != 1)
+    return emitOpError("accumulator result maxLookback must be 1, got ")
+           << resultTy.getMaxLookback();
+  if (getName().empty())
+    return emitOpError("accumulator name must be non-empty");
+  return success();
+}
+
+LogicalResult SetAccumulatorOp::verify() {
+  auto *accOp = getAcc().getDefiningOp();
+  if (!accOp || !llvm::isa<AccumulatorOp>(accOp))
+    return emitOpError(
+        "first operand must be the result of a 'kunir.accumulator'");
+  auto accTy   = llvm::cast<TsType>(getAcc().getType());
+  auto maskTy  = llvm::cast<TsType>(getMask().getType());
+  auto valueTy = llvm::cast<TsType>(getValue().getType());
+  if (accTy.getElementType() != valueTy.getElementType())
+    return emitOpError("value element type '")
+           << valueTy.getElementType()
+           << "' must match accumulator element type '"
+           << accTy.getElementType() << "'";
+  if (!llvm::isa<IntegerType>(maskTy.getElementType()) ||
+      llvm::cast<IntegerType>(maskTy.getElementType()).getWidth() != 1)
+    return emitOpError("mask element type must be i1, got '")
+           << maskTy.getElementType() << "'";
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // ForEachBackWindowOp — verifier + custom assembly format
 //
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 42517eb..b37ef90 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -73,10 +73,18 @@ using HandleMap = llvm::DenseMap<Value, Value>;
 // One LowerHelper per scope (outer function body / for_each_back_window body).
 // `zeroOffsetI32` is the function-scope i32 zero constant created once before
 // the outer scf.for; all LowerHelper instances share it.
+//
+// `outerTimeIdx` / `outerLoopLb` are the outer scf.for time loop's induction
+// variable and lower bound (index type).  BackRef's warmup guard
+// (`t - loop_lb < window` → NaN) needs both; threading them through
+// LowerHelper keeps the guard available inside for_each_back_window bodies
+// as well, since `t` there is still the OUTER time index.
 struct LowerHelper {
   HandleMap tsMap;
   HandleMap scalarMap;
   Value zeroOffsetI32;
+  Value outerTimeIdx;   // outer scf.for induction var (index)
+  Value outerLoopLb;    // outer scf.for lower bound (index)
 
   // Shared util: look up `v` (a ts SSA value) in tsMap, emit
   // ts.get(handle, offsetI32), return the loaded scalar.  Does NOT touch
@@ -141,11 +149,57 @@ struct LowerHelper {
         scalarMap[sel.getResult()] =
             b.create<arith::SelectOp>(ol, cond, tv, fv).getResult();
       } else if (auto br = dyn_cast<BackRefOp>(op)) {
-        Value offset = b.create<arith::ConstantOp>(
-            ol, b.getI32Type(), b.getI32IntegerAttr(br.getWindow()));
-        KUN_ASSIGN_OR_FAIL(Value scalar,
-            getScalarUncached(br.getInput(), offset, b, ol));
-        scalarMap[br.getResult()] = scalar;
+        // Warmup guard:  if   t - outer_loop_lb < window  →  NaN
+        //                else                            →  ts.get(window)
+        //
+        // Chunk 0 has loop_lb = 0 so the guard collapses to CPU's
+        // "first `window` outputs are NaN".  Chunk k>=1's per-CTA state
+        // (e.g. fast-stat accumulators) is zero-initialised at function
+        // entry, so each chunk needs `window` add-only steps to rebuild
+        // the trailing-window state — gating the "remove" value with
+        // NaN here propagates through NaN-aware remove patterns
+        // (Equals(oldx, oldx) === false on NaN) and auto-suppresses the
+        // subtract step during warmup.
+        //
+        // The scf.if uses the manual-OpBuilder (not body-builder-lambda)
+        // form, so the enclosing function context is still `lowerBlock`
+        // — KUN_ASSIGN_OR_FAIL can return failure from here without
+        // tripping any lambda return-type mismatch.
+        int64_t window = br.getWindow();
+        auto inputTs = llvm::cast<TsType>(br.getInput().getType());
+        auto floatTy = llvm::dyn_cast<FloatType>(inputTs.getElementType());
+        if (!floatTy)
+          return br.emitError("kunir-to-kungpu: back_ref input must have a "
+                              "float element type (NaN required for the "
+                              "warmup guard)");
+
+        Value delta =
+            b.create<arith::SubIOp>(ol, outerTimeIdx, outerLoopLb);
+        Value windowIdx =
+            b.create<arith::ConstantIndexOp>(ol, window);
+        Value inSteady = b.create<arith::CmpIOp>(
+            ol, arith::CmpIPredicate::sge, delta, windowIdx);
+        auto ifOp = b.create<scf::IfOp>(ol, TypeRange{floatTy}, inSteady,
+                                          /*withElseRegion=*/true);
+        {
+          OpBuilder ib =
+              OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
+          Value offset = ib.create<arith::ConstantOp>(
+              ol, ib.getI32Type(), ib.getI32IntegerAttr(window));
+          KUN_ASSIGN_OR_FAIL(Value loaded,
+              getScalarUncached(br.getInput(), offset, ib, ol));
+          ib.create<scf::YieldOp>(ol, loaded);
+        }
+        {
+          OpBuilder ib =
+              OpBuilder::atBlockBegin(&ifOp.getElseRegion().front());
+          llvm::APFloat qnan =
+              llvm::APFloat::getQNaN(floatTy.getFloatSemantics());
+          Value nanV = ib.create<arith::ConstantOp>(
+              ol, floatTy, FloatAttr::get(floatTy, qnan));
+          ib.create<scf::YieldOp>(ol, nanV);
+        }
+        scalarMap[br.getResult()] = ifOp.getResult(0);
       } else if (auto co = dyn_cast<ConstantOp>(op)) {
         auto resTs = llvm::cast<TsType>(co.getResult().getType());
         Type elemTy = resTs.getElementType();
@@ -279,6 +333,8 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   // ------------------------------------------------------------------
   LowerHelper outer;
   outer.zeroOffsetI32 = zeroOffsetI32;
+  outer.outerTimeIdx  = outerFor.getInductionVar();
+  outer.outerLoopLb   = outerFor.getLowerBound();
   unsigned numOrigArgs = oldFT.getNumInputs();
   for (unsigned i = 0; i < numOrigArgs; ++i) {
     Value arg = entry.getArgument(i);
@@ -370,7 +426,9 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
             //
             // After this setup the body has no non-zero-offset reads left;
             // lowerBlock just uses offset 0 + scalarMap for everything.
-            LowerHelper inner{outer.tsMap, outer.scalarMap, outer.zeroOffsetI32};
+            LowerHelper inner{outer.tsMap, outer.scalarMap,
+                                outer.zeroOffsetI32,
+                                outer.outerTimeIdx, outer.outerLoopLb};
             for (auto [i, arg] : llvm::enumerate(body.getArguments())) {
               auto r = inner.getScalarUncached(fwOp.getInputs()[i],
                                                 windowedOffset, ib, il);
@@ -404,6 +462,35 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       return success();
     }
 
+    // kunir.accumulator → kungpu.accumulator (allocated outside the time
+    // loop, like windowed_temp).  Stored in tsMap so that downstream reads
+    // (via getScalar → kungpu.ts.get @ offset 0) resolve to the slot.
+    if (auto acc = dyn_cast<kunir::AccumulatorOp>(op)) {
+      auto ka = b.create<kungpu::AccumulatorOp>(
+          ol, acc.getResult().getType(), acc.getNameAttr());
+      outer.tsMap[acc.getResult()] = ka.getResult();
+      return success();
+    }
+
+    // kunir.set_accumulator → scf.if (mask) { kungpu.ts.put %acc, %value }
+    // inside the outer time loop.  mask and value are loaded at offset 0
+    // (current time step) via the standard scalarMap-cached getScalar.
+    if (auto sa = dyn_cast<kunir::SetAccumulatorOp>(op)) {
+      auto accIt = outer.tsMap.find(sa.getAcc());
+      if (accIt == outer.tsMap.end())
+        return op.emitError("kunir-to-kungpu: set_accumulator acc must come "
+                            "from a kunir.accumulator");
+      KUN_ASSIGN_OR_FAIL(Value maskScalar,
+                         outer.getScalar(sa.getMask(),  fb, ol));
+      KUN_ASSIGN_OR_FAIL(Value valueScalar,
+                         outer.getScalar(sa.getValue(), fb, ol));
+      auto ifOp = fb.create<scf::IfOp>(ol, /*resultTypes=*/TypeRange{},
+                                         maskScalar, /*withElseRegion=*/false);
+      OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
+      ib.create<TsPutOp>(ol, accIt->second, valueScalar);
+      return success();
+    }
+
     // fast_windowed_sum → preserved as a kunir op with scalar result and
     // ts-handle input.  The kungpu-to-llvm pass owns the actual lowering
     // (per-thread state allocas + the Kahan-corrected step).
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 08efd51..65be763 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -171,6 +171,15 @@ class IRBuilder {
     return b_.create<kunir::ConstantOp>(b_.getUnknownLoc(), tsTy, attr);
   }
 
+  // ── Accumulator / SetAccumulator ───────────────────────────────
+  Value accumulatorOp(std::string name, Type tsTy) {
+    return b_.create<kunir::AccumulatorOp>(b_.getUnknownLoc(), tsTy,
+                                            b_.getStringAttr(name));
+  }
+  void setAccumulatorOp(Value acc, Value mask, Value value) {
+    b_.create<kunir::SetAccumulatorOp>(b_.getUnknownLoc(), acc, mask, value);
+  }
+
   // ── Windowed buffer materialization ───────────────────────────────
   Value windowedOutputOp(Value x, int64_t length) {
     auto inTs = llvm::cast<kunir::TsType>(x.getType());
@@ -363,6 +372,15 @@ void registerIRBuilder(nb::module_ &m) {
             "Build a kunir.constant of element-type matching `type` (a "
             "ts<T, 1>).  Pass float('nan') for NaN.")
 
+      .def("accumulator", &IRBuilder::accumulatorOp,
+            nb::arg("name"), nb::arg("type"),
+            "Build a kunir.accumulator with the given name and ts<T, 1> "
+            "result type.  Same-name accumulators CSE to a single slot.")
+      .def("set_accumulator", &IRBuilder::setAccumulatorOp,
+            nb::arg("acc"), nb::arg("mask"), nb::arg("value"),
+            "Conditionally store `value` into `acc` when `mask` is true. "
+            "Side-effecting; returns no SSA value.")
+
       .def("select", &IRBuilder::selectOp,
             nb::arg("cond"), nb::arg("true_value"), nb::arg("false_value"))
 
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index 7aa2930..0bffe5a 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -35,7 +35,10 @@
     GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
     And, Or, Not, Select,
 )
-from KunQuant.ops.MiscOp import BackRef, FastWindowedSum
+from KunQuant.ops.MiscOp import (
+    BackRef, FastWindowedSum,
+    Accumulator, SetAccumulator, ReturnFirstValue,
+)
 from KunQuant.Stage import Function
 from KunQuant.jit import KunMLIR
 from KunQuant.jit.cuda import compileit, CudaCompilerConfig
@@ -115,6 +118,34 @@ def build_func_fastwindowedsum(N: int) -> Function:
     return Function(builder.ops, name="fastwindowedsum_kernel")
 
 
+def build_func_accumulator() -> Function:
+    """Running count of timesteps where a > 0:
+
+       cnt[t] = cnt[t-1] + (a[t] > 0 ? 1 : 0)            (cnt[-1] = 0)
+
+    Built directly with Accumulator + SetAccumulator + ReturnFirstValue:
+       cnt    = Accumulator(a, "cnt")             # reads slot (init 0)
+       mask   = a > 0
+       new    = Select(mask, cnt + 1, cnt)
+       sa     = SetAccumulator(cnt, mask, new)
+       Output(ReturnFirstValue([new, sa]), "cnt_out")
+
+    Exercises the Accumulator end-to-end: kunir.accumulator (CSE'd to one
+    slot), kunir.set_accumulator (non-Pure, scf.if-wrapped store at
+    offset 0) and ReturnFirstValue's keep-alive role for the side-effect
+    op when lowered to MLIR.
+    """
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        cnt = Accumulator(a, "cnt")
+        mask = GreaterThan(a, ConstantOp(0))
+        new_cnt = Select(mask, Add(cnt, ConstantOp(1)), cnt)
+        sa = SetAccumulator(cnt, mask, new_cnt)
+        Output(ReturnFirstValue([new_cnt, sa]), "cnt_out")
+    return Function(builder.ops, name="accumulator_kernel")
+
+
 def build_func_cmp_logical() -> Function:
     """Single-graph multi-output factor that exercises every kunir cmp,
     logical, and select op in one shot:
@@ -352,6 +383,36 @@ def run_multipartition(target: str, T: int, S: int) -> int:
     return 0
 
 
+def run_accumulator(target: str, T: int, S: int) -> int:
+    """End-to-end correctness of Accumulator + SetAccumulator +
+    ReturnFirstValue: cnt[t] = cnt[t-1] + (a[t] > 0 ? 1 : 0).
+
+    Forced single-chunk (sm_fill_factor=0.0): a general-purpose
+    Accumulator has no warmup-replay mechanism, so its per-CTA alloca
+    cannot be re-primed at chunk boundaries.  unreliable_count=0 leaves
+    the runtime free to split the time axis into many chunks; we disable
+    that here to keep the slot's value continuous across t."""
+    print(f"=== accumulator: cnt[t] = cnt[t-1] + (a[t] > 0) ===")
+    f = build_func_accumulator()
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = compileit(f, cfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(13)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe, {"a": cp.asarray(a_h), "cnt_out": out},
+                       sm_fill_factor=0.0)
+    out_h = cp.asnumpy(out)
+
+    expected = np.cumsum((a_h > 0).astype(np.float32), axis=0)
+    return _compare_post_warmup(out_h, expected, valid_start=0, atol=1e-5)
+
+
 def run_cmp_logical(target: str, T: int, S: int) -> int:
     """End-to-end test for kunir.gt/ge/lt/le/eq + and/or/not + select.
 
@@ -596,6 +657,8 @@ def main() -> int:
     print()
     rc |= run_multipartition(args.target, args.time_length, args.num_stocks)
     print()
+    rc |= run_accumulator(args.target, args.time_length, args.num_stocks)
+    print()
     rc |= run_cmp_logical(args.target, args.time_length, args.num_stocks)
     return rc
 

From 0a610dbef0f9e49a0a5316aec01ac3a0777de355 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 14 May 2026 02:10:46 -0700
Subject: [PATCH 27/59] fix require whole time

---
 KunQuant/Op.py                       |  8 +++++++
 KunQuant/jit/cuda.py                 | 19 ++++++++++++++--
 KunQuant/ops/MiscOp.py               | 19 +++++++++++-----
 doc/Operators.md                     | 22 +++++++++++++++----
 mlir/include/KunIr/KunIrOps.td       |  2 +-
 mlir/lib/KunCuda/Runtime.cpp         | 12 +++++++++-
 mlir/lib/KunIr/KunIrOps.cpp          | 15 ++++++++-----
 mlir/lib/Python/IRBuilder.cpp        |  6 +++--
 mlir/test/python/test_kun_to_cuda.py | 33 +++++++++++-----------------
 9 files changed, 96 insertions(+), 40 deletions(-)

diff --git a/KunQuant/Op.py b/KunQuant/Op.py
index bd96014..e2696f9 100644
--- a/KunQuant/Op.py
+++ b/KunQuant/Op.py
@@ -509,6 +509,14 @@ class StateConsumerTrait:
     '''
     pass
 
+class MayRequireWholeTime:
+    '''
+    Ops whose state may depend on the full time history (cannot be rebuilt
+    from a bounded warmup window).  Defaults to False; override to return True.
+    '''
+    def is_whole_time_required(self) -> bool:
+        return False
+
 class ReductionOp(OpBase, StatefulOpTrait):
     '''
     Base class of all reduction ops. A reduction op takes inputs that is originated from a IterValue. The input must be in a loop (v.get_parent() is a loop). The data produced
diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 958d706..52aebd5 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -31,13 +31,20 @@
 from KunQuant.jit import KunMLIR
 
 from KunQuant.Driver import optimize, post_optimize
-from KunQuant.Op import Input, Output
+from KunQuant.Op import Input, Output, MayRequireWholeTime
 from KunQuant.passes import do_partition
 from KunQuant.passes.InferWindow import infer_window
 from KunQuant.Stage import Function
 from KunQuant.passes.CodegenMLIR import TargetSpec, translate_function
 
 
+# Sentinel passed via kunir.func's `unreliable_count` attribute to mean
+# "this partition needs the full time history; the runtime must launch
+# it as a single chunk".  Kept in sync with the kunir verifier (which
+# only allows -1 or non-negative) and the CUDA runtime's `computeChunkPlan`.
+_WHOLE_TIME_UNRELIABLE = -1
+
+
 # Standard locations searched when CudaCompilerConfig.toolkit_path is left
 # empty.  A toolkit dir must contain `nvvm/libdevice/libdevice.10.bc` (the
 # upstream `gpu-module-to-binary` pass links libdevice into the LLVM
@@ -212,7 +219,15 @@ def _translate_partitions(impl, cfg: CudaCompilerConfig):
         # by the time this kernel runs, so we don't accumulate their
         # unreliable counts here.  infer_window walks back to Input ops
         # of the partition; cross-partition deps stop at those Inputs.
-        per_kernel_unreliable = max(infer_window(sub).values(), default=0)
+        # If any op in this partition requires the whole time history,
+        # override the inferred warmup with the sentinel so the runtime
+        # collapses this kernel to a single chunk.
+        if any(isinstance(op, MayRequireWholeTime)
+                and op.is_whole_time_required()
+                for op in sub.ops):
+            per_kernel_unreliable = _WHOLE_TIME_UNRELIABLE
+        else:
+            per_kernel_unreliable = max(infer_window(sub).values(), default=0)
         ext = translate_function(sub, target, ir, dtype=dtype,
                                    unreliable_count=per_kernel_unreliable)
         if ext is not None:
diff --git a/KunQuant/ops/MiscOp.py b/KunQuant/ops/MiscOp.py
index cff7bbe..01ea1af 100644
--- a/KunQuant/ops/MiscOp.py
+++ b/KunQuant/ops/MiscOp.py
@@ -1,5 +1,5 @@
 import KunQuant
-from KunQuant.Op import AcceptSingleValueInputTrait, Input, OpBase, WindowedTrait, SinkOpTrait, CrossSectionalOp, GlobalStatefulProducerTrait, GloablStatefulOpTrait, StateConsumerTrait, UnaryElementwiseOp, BinaryElementwiseOp
+from KunQuant.Op import AcceptSingleValueInputTrait, Input, OpBase, WindowedTrait, SinkOpTrait, CrossSectionalOp, GlobalStatefulProducerTrait, GloablStatefulOpTrait, StateConsumerTrait, MayRequireWholeTime, UnaryElementwiseOp, BinaryElementwiseOp
 from typing import List, Tuple, Union
 
 class BackRef(OpBase, WindowedTrait):
@@ -29,12 +29,17 @@ def get_state_variable_name_prefix(self) -> str:
     def generate_step_code(self, idx: str, time_idx: str, inputs: List[str], buf_name: str) -> str:
         return f"auto v{idx} = sum_{idx}.step({buf_name}, {inputs[0]}, {time_idx});"
 
-class Accumulator(OpBase, GlobalStatefulProducerTrait):
+class Accumulator(OpBase, GlobalStatefulProducerTrait, MayRequireWholeTime):
     '''
     Accumulator is a stateful op that accumulates the input value over time.
     It can be used to compute running totals, moving averages, etc.'''
-    def __init__(self, v: OpBase, name: str) -> None:
-        super().__init__([v], [("name", name)])
+    def __init__(self, v: OpBase, name: str,
+                  is_whole_time_required: bool = False) -> None:
+        super().__init__([v],
+                          [("name", name),
+                           ("whole_time", is_whole_time_required)])
+    def is_whole_time_required(self) -> bool:
+        return self.attrs["whole_time"]
     def get_state_variable_name_prefix(self) -> str:
         return "accu_"
     
@@ -84,7 +89,8 @@ def __init__(self, v: List[OpBase]) -> None:
         super().__init__(v, [])
     
 
-class ExpMovingAvg(OpBase, GloablStatefulOpTrait, AcceptSingleValueInputTrait):
+class ExpMovingAvg(OpBase, GloablStatefulOpTrait, AcceptSingleValueInputTrait,
+                    MayRequireWholeTime):
     '''
     Exponential Moving Average (EMA)
     Similar to pd.DataFrame.ewm(span=window, adjust=False, ignore_na=True).mean()
@@ -121,6 +127,9 @@ def generate_init_code(self, idx: str, elem_type: str, simd_lanes: int, inputs:
     def generate_step_code(self, idx: str, time_idx: str, inputs: List[str]) -> str:
         return f"auto v{idx} = ema_{idx}.step({inputs[0]}, {time_idx});"
 
+    def is_whole_time_required(self) -> bool:
+        return True
+
 class WindowedLinearRegression(OpBase, WindowedTrait, GlobalStatefulProducerTrait):
     '''
     Compute states of Windowed Linear Regression
diff --git a/doc/Operators.md b/doc/Operators.md
index 23fc27c..70e69d2 100644
--- a/doc/Operators.md
+++ b/doc/Operators.md
@@ -424,13 +424,14 @@ class WindowedQuantile(OpBase, WindowedTrait):
     def __init__(self, v: OpBase, window: int, q: float) -> None:
         pass
 
-class ExpMovingAvg(OpBase, GloablStatefulOpTrait):
+class ExpMovingAvg(OpBase, GloablStatefulOpTrait, MayRequireWholeTime):
     '''
     Exponential Moving Average (EMA)
     Similar to pd.DataFrame.ewm(span=window, adjust=False, ignore_na=True).mean()
     optional parameter: init_val, the initial values for EMA. It must be an Input op with attr
     {"single_value":True}. The name of the Input op should starts with "__init".
     It should be an input of shape (num_stocks,)
+    Always requires the whole time history.
     '''
     def __init__(self, v: OpBase, window: int, init_val: Union[Input, None] = None) -> None:
         pass
@@ -451,11 +452,15 @@ class ReturnFirstValue(OpBase):
     def __init__(self, v: List[OpBase]) -> None:
         pass
 
-class Accumulator(OpBase, GloablStatefulOpTrait):
+class Accumulator(OpBase, GlobalStatefulProducerTrait, MayRequireWholeTime):
     '''
     Accumulator is a stateful op that accumulates the input value over time.
-    It can be used to compute running totals, moving averages, etc.'''
-    def __init__(self, v: OpBase, name: str) -> None:
+    It can be used to compute running totals, moving averages, etc.
+    Set `is_whole_time_required=True` if the accumulator's state can only
+    be reconstructed from the full time history.
+    '''
+    def __init__(self, v: OpBase, name: str,
+                  is_whole_time_required: bool = False) -> None:
         pass
 
 class SetAccumulator(OpBase):
@@ -513,6 +518,15 @@ class StatefulOpTrait:
     pass
 
 
+class MayRequireWholeTime:
+    '''
+    Ops whose state may depend on the full time history (cannot be rebuilt
+    from a bounded warmup window).  Defaults to False; override to return True.
+    '''
+    def is_whole_time_required(self) -> bool:
+        return False
+
+
 class CrossSectionalOp(OpBase):
     def __init__(self, v: OpBase) -> None:
         pass
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 5fd73df..6f45248 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -443,7 +443,7 @@ def KunIr_FuncOp : KunIr_Op<"func", [
     ArrayAttr:$input_names,
     ArrayAttr:$output_names,
     KunIr_TargetSpecAttr:$target_spec,
-    I64Attr:$unreliable_count
+    SI64Attr:$unreliable_count
   );
   let regions = (region SizedRegion<1>:$body);
   let hasCustomAssemblyFormat = 1;
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index 9354d1e..f59c19a 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -467,6 +467,11 @@ static ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
     return {timeLength, 1u};
   if (numSMs <= 0 || smFillFactor <= 0.0)
     return {timeLength, 1u};
+  // unreliableCount = -1 sentinel → whole time history required, single
+  // chunk only.  Any other negative value is rejected by the IR verifier;
+  // we don't try to interpret it.
+  if (unreliableCount < 0)
+    return {timeLength, 1u};
 
   int64_t blockX = warpsPerCta * 32;
   int64_t stocksPerBlock = blockX * vectorSize;
@@ -889,7 +894,12 @@ void Executable::launchOnStream(
           meta.unreliableCount, mask, minChunkWarmupFactor,
           smFillFactor, numSMs);
       int32_t chunkSizeI32 = static_cast<int32_t>(plan.chunkSize);
-      int32_t warmupI32    = static_cast<int32_t>(meta.unreliableCount);
+      // -1 sentinel (whole-time) means single chunk; the kernel's
+      // chunk-0 branch never reads the warmup arg in that case, but we
+      // still clamp to 0 so a stray load never observes a negative
+      // value in any future path.
+      int32_t warmupI32    = static_cast<int32_t>(
+          std::max<int64_t>(meta.unreliableCount, 0));
 
       std::vector<void *> argPtrs;
       argPtrs.reserve(5 + ptrs.size());
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 62f955a..3f5d6d6 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -582,7 +582,8 @@ void FuncOp::build(OpBuilder &b, OperationState &result,
   result.addAttribute(getOutputNamesAttrName(result.name), outputNames);
   result.addAttribute(getTargetSpecAttrName(result.name), targetSpec);
   result.addAttribute(getUnreliableCountAttrName(result.name),
-                        b.getI64IntegerAttr(unreliableCount));
+                        b.getIntegerAttr(b.getIntegerType(64, /*isSigned=*/true),
+                                          unreliableCount));
   Region *body = result.addRegion();
   Block *block = new Block;
   for (Type inputType : type.getInputs())
@@ -645,9 +646,12 @@ LogicalResult FuncOp::verify() {
     return emitOpError("target smem_size must be non-negative, got ")
            << ts.getSmemSize();
 
-  // Validate unreliable_count
-  if (getUnreliableCount() < 0)
-    return emitOpError("unreliable_count must be non-negative, got ")
+  // Validate unreliable_count.  `-1` is a sentinel meaning "whole time
+  // history required" — the runtime collapses such functions to a
+  // single chunk.  Any other negative value is rejected.
+  if (getUnreliableCount() < -1)
+    return emitOpError("unreliable_count must be -1 (whole-time) or "
+                       "non-negative, got ")
            << getUnreliableCount();
 
   return success();
@@ -715,7 +719,8 @@ ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) {
   int64_t unrelVal = 0;
   if (parser.parseInteger(unrelVal)) return failure();
   result.addAttribute(getUnreliableCountAttrName(result.name),
-                       b.getI64IntegerAttr(unrelVal));
+                       b.getIntegerAttr(b.getIntegerType(64, /*isSigned=*/true),
+                                         unrelVal));
 
   // -> (result_type, ...) or -> result_type  [optional]
   SmallVector<Type> resultTypes;
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 65be763..5ac3a67 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -88,9 +88,11 @@ class IRBuilder {
       throw std::runtime_error(
           "IRBuilder.begin_func: result_types and output_names must have "
           "the same length (non-void form: outputs become result types)");
-    if (unreliableCount < 0)
+    // `-1` is the whole-time sentinel.  Anything more negative is bogus.
+    if (unreliableCount < -1)
       throw std::runtime_error(
-          "IRBuilder.begin_func: unreliable_count must be non-negative, got "
+          "IRBuilder.begin_func: unreliable_count must be -1 (whole-time) "
+          "or non-negative, got "
           + std::to_string(unreliableCount));
 
     // Restore insertion point to the gpu.module body before starting a
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index 0bffe5a..fc384c0 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -123,22 +123,14 @@ def build_func_accumulator() -> Function:
 
        cnt[t] = cnt[t-1] + (a[t] > 0 ? 1 : 0)            (cnt[-1] = 0)
 
-    Built directly with Accumulator + SetAccumulator + ReturnFirstValue:
-       cnt    = Accumulator(a, "cnt")             # reads slot (init 0)
-       mask   = a > 0
-       new    = Select(mask, cnt + 1, cnt)
-       sa     = SetAccumulator(cnt, mask, new)
-       Output(ReturnFirstValue([new, sa]), "cnt_out")
-
-    Exercises the Accumulator end-to-end: kunir.accumulator (CSE'd to one
-    slot), kunir.set_accumulator (non-Pure, scf.if-wrapped store at
-    offset 0) and ReturnFirstValue's keep-alive role for the side-effect
-    op when lowered to MLIR.
+    Built directly with Accumulator + SetAccumulator + ReturnFirstValue.
+    `is_whole_time_required=True` propagates `unreliable_count = -1` into
+    kunir.func; the runtime treats that as a hard "single chunk" signal.
     """
     builder = Builder()
     with builder:
         a = Input("a")
-        cnt = Accumulator(a, "cnt")
+        cnt = Accumulator(a, "cnt", is_whole_time_required=True)
         mask = GreaterThan(a, ConstantOp(0))
         new_cnt = Select(mask, Add(cnt, ConstantOp(1)), cnt)
         sa = SetAccumulator(cnt, mask, new_cnt)
@@ -387,12 +379,14 @@ def run_accumulator(target: str, T: int, S: int) -> int:
     """End-to-end correctness of Accumulator + SetAccumulator +
     ReturnFirstValue: cnt[t] = cnt[t-1] + (a[t] > 0 ? 1 : 0).
 
-    Forced single-chunk (sm_fill_factor=0.0): a general-purpose
-    Accumulator has no warmup-replay mechanism, so its per-CTA alloca
-    cannot be re-primed at chunk boundaries.  unreliable_count=0 leaves
-    the runtime free to split the time axis into many chunks; we disable
-    that here to keep the slot's value continuous across t."""
-    print(f"=== accumulator: cnt[t] = cnt[t-1] + (a[t] > 0) ===")
+    With default `sm_fill_factor` the runtime would normally split this
+    T-sized job into many chunks; the `is_whole_time_required=True` flag
+    on the Accumulator propagates `unreliable_count = -1` through the
+    kunir.func attr, and computeChunkPlan collapses to a single chunk.
+    A failure here means the sentinel path is broken — multi-chunk
+    accumulators silently reset across chunk boundaries."""
+    print(f"=== accumulator: cnt[t] = cnt[t-1] + (a[t] > 0)  "
+           f"(whole-time sentinel) ===")
     f = build_func_accumulator()
     cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
     exe = compileit(f, cfg)
@@ -405,8 +399,7 @@ def run_accumulator(target: str, T: int, S: int) -> int:
     out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h), "cnt_out": out},
-                       sm_fill_factor=0.0)
+    executor.runGraph(exe, {"a": cp.asarray(a_h), "cnt_out": out})
     out_h = cp.asnumpy(out)
 
     expected = np.cumsum((a_h > 0).astype(np.float32), axis=0)

From 5a304a45f132333a9536010468739955d1e30be4 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 14 May 2026 23:37:39 -0700
Subject: [PATCH 28/59] fix warnings

---
 cpp/Kun/CApi.cpp                 |   2 +-
 cpp/Kun/CorrWith.hpp             |   2 -
 cpp/Kun/MathUtil.hpp             |   8 +-
 cpp/Kun/Scale.hpp                |   1 -
 cpp/Kun/SkipList.cpp             |   2 -
 cpp/Kun/StreamBuffer.hpp         |   7 +-
 mlir/lib/KunGpu/KunGpuToLLVM.cpp | 349 +++++++++++++++----------------
 mlir/lib/KunIr/KunIrOps.cpp      |  42 ++--
 mlir/lib/KunIr/KunIrToKunGpu.cpp |  88 ++++----
 mlir/lib/Python/IRBuilder.cpp    |  34 +--
 10 files changed, 267 insertions(+), 268 deletions(-)

diff --git a/cpp/Kun/CApi.cpp b/cpp/Kun/CApi.cpp
index 653d04c..52f8faf 100644
--- a/cpp/Kun/CApi.cpp
+++ b/cpp/Kun/CApi.cpp
@@ -36,7 +36,7 @@ static std::shared_ptr<Library> *unwrapLibrary(KunLibraryHandle ptr) {
 KUN_API KunModuleHandle kunGetModuleFromLibrary(KunLibraryHandle lib,
                                                 const char *name) {
     auto &plib = *unwrapLibrary(lib);
-    return (KunModuleHandle)plib->getModule(name);
+    return (KunModuleHandle)const_cast<Module *>(plib->getModule(name));
 }
 
 KUN_API void kunUnloadLibrary(KunLibraryHandle ptr) {
diff --git a/cpp/Kun/CorrWith.hpp b/cpp/Kun/CorrWith.hpp
index 4ef7cea..425d75b 100644
--- a/cpp/Kun/CorrWith.hpp
+++ b/cpp/Kun/CorrWith.hpp
@@ -26,7 +26,6 @@ void CorrWith(RuntimeStage *stage, size_t time_idx,
         INPUT::getInput(&inbuf1, stage->stage->in_buffers[1], num_stocks);
     using T = typename std::decay<decltype(*input0)>::type;
     auto outinfo = stage->stage->out_buffers[0];
-    auto simd_len = stage->ctx->simd_len;
     T *output = stage->ctx->buffers[outinfo->id].getPtr<T>();
     auto time_end =
         std::min(__start + (time_idx + 1) * time_stride, __start + __length);
@@ -71,7 +70,6 @@ void RankCorrWith(RuntimeStage *stage, size_t time_idx,
         INPUT::getInput(&inbuf1, stage->stage->in_buffers[1], num_stocks);
     using T = typename std::decay<decltype(*input0)>::type;
     auto outinfo = stage->stage->out_buffers[0];
-    auto simd_len = stage->ctx->simd_len;
     T *output = stage->ctx->buffers[outinfo->id].getPtr<T>();
     auto time_end =
         std::min(__start + (time_idx + 1) * time_stride, __start + __length);
diff --git a/cpp/Kun/MathUtil.hpp b/cpp/Kun/MathUtil.hpp
index 71211a5..b8623fe 100644
--- a/cpp/Kun/MathUtil.hpp
+++ b/cpp/Kun/MathUtil.hpp
@@ -3,8 +3,12 @@
 
 namespace kun {
 namespace {
-size_t divideAndCeil(size_t x, size_t y) { return (x + y - 1) / y; }
-size_t roundUp(size_t x, size_t y) { return divideAndCeil(x, y) * y; }
+[[maybe_unused]] size_t divideAndCeil(size_t x, size_t y) {
+    return (x + y - 1) / y;
+}
+[[maybe_unused]] size_t roundUp(size_t x, size_t y) {
+    return divideAndCeil(x, y) * y;
+}
 
 } // namespace
 } // namespace kun
\ No newline at end of file
diff --git a/cpp/Kun/Scale.hpp b/cpp/Kun/Scale.hpp
index 4c79005..9861451 100644
--- a/cpp/Kun/Scale.hpp
+++ b/cpp/Kun/Scale.hpp
@@ -22,7 +22,6 @@ KUN_TEMPLATE_EXPORT void ScaleStocks(RuntimeStage *stage, size_t time_idx,
         INPUT::getInput(&inbuf, stage->stage->in_buffers[0], num_stocks);
     using T = typename std::decay<decltype(*input)>::type;
     auto outinfo = stage->stage->out_buffers[0];
-    auto simd_len = stage->ctx->simd_len;
     T *output = OUTPUT::getOutput(&stage->ctx->buffers[outinfo->id], outinfo,
                                   num_stocks);
     auto time_end =
diff --git a/cpp/Kun/SkipList.cpp b/cpp/Kun/SkipList.cpp
index eaf6cb3..90a585a 100644
--- a/cpp/Kun/SkipList.cpp
+++ b/cpp/Kun/SkipList.cpp
@@ -148,8 +148,6 @@ struct SkipListImpl {
     }
 
     double get(int i, size_t &index, bool &ret) const {
-        int level;
-
         if (i < 0 || i >= size) {
             ret = false;
             return 0;
diff --git a/cpp/Kun/StreamBuffer.hpp b/cpp/Kun/StreamBuffer.hpp
index 0c67529..1540aa2 100644
--- a/cpp/Kun/StreamBuffer.hpp
+++ b/cpp/Kun/StreamBuffer.hpp
@@ -16,10 +16,11 @@ struct StreamBuffer {
     // fix-me: we can store the pre-aligned stock_count to avoid re-computation
     // of roundUp
     alignas(64) char buf[0];
-    T *getBuffer() const { return (T *)(buf); }
+    T *getBuffer() const { return (T *)const_cast<char *>(buf); }
     size_t *getPos(size_t idx, size_t stock_count, size_t window_size) const {
         assert(stock_count % 4 == 0);
-        return (size_t *)(buf + sizeof(T) * stock_count * window_size +
+        return (size_t *)(const_cast<char *>(buf) +
+                          sizeof(T) * stock_count * window_size +
                           idx * sizeof(size_t));
     }
     static size_t getBufferSize(size_t stock_count, size_t window_size,
@@ -44,7 +45,7 @@ struct StreamBuffer {
         pos += 1;
         pos = (pos >= window_size) ? 0 : pos;
         size_t *posbase = getPos(0, stock_count, window_size);
-        for (int i = 0; i < divideAndCeil(stock_count, simd_len); i++) {
+        for (size_t i = 0; i < divideAndCeil(stock_count, simd_len); i++) {
             posbase[i] = pos;
         }
         return ret;
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 1bb0cc8..f9a3b6f 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -120,11 +120,11 @@ using ChunkCtxMap = llvm::DenseMap<Operation *, ChunkContext>;
 //===----------------------------------------------------------------------===//
 
 static Value emitStockId(OpBuilder &b, Location loc, Type idxTy) {
-  Value tid  = b.create<gpu::ThreadIdOp>(loc, idxTy, gpu::Dimension::x);
-  Value bid  = b.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::x);
-  Value bdim = b.create<gpu::BlockDimOp>(loc, idxTy, gpu::Dimension::x);
-  return b.create<arith::AddIOp>(
-      loc, b.create<arith::MulIOp>(loc, bid, bdim), tid);
+  Value tid  = gpu::ThreadIdOp::create(b, loc, idxTy, gpu::Dimension::x);
+  Value bid  = gpu::BlockIdOp::create(b, loc, idxTy, gpu::Dimension::x);
+  Value bdim = gpu::BlockDimOp::create(b, loc, idxTy, gpu::Dimension::x);
+  return arith::AddIOp::create(
+      b, loc, arith::MulIOp::create(b, loc, bid, bdim), tid);
 }
 
 //===----------------------------------------------------------------------===//
@@ -170,8 +170,8 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
   // Build gpu.func right before the kunir.func — both live inside the
   // enclosing gpu.module.
   OpBuilder b(fn);
-  auto newFunc = b.create<gpu::GPUFuncOp>(
-      loc, fn.getSymName(), FunctionType::get(ctx, newArgTypes, {}));
+  auto newFunc = gpu::GPUFuncOp::create(
+      b, loc, fn.getSymName(), FunctionType::get(ctx, newArgTypes, {}));
   // Mark as a kernel (sets the op-level `kernel` attribute) so that
   // convert-gpu-to-nvvm tags the resulting llvm.func with `nvvm.kernel`.
   newFunc.setKernelAttr(UnitAttr::get(ctx));
@@ -196,7 +196,7 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
   newFunc.walk([&](kunir::ReturnOp r) { returns.push_back(r); });
   for (kunir::ReturnOp r : returns) {
     OpBuilder rb(r);
-    rb.create<gpu::ReturnOp>(r.getLoc());
+    gpu::ReturnOp::create(rb, r.getLoc());
     r.erase();
   }
   fn.erase();
@@ -222,11 +222,11 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
   OpBuilder pb(ctx);
   pb.setInsertionPointToStart(&entry);
   Value sidIdx = emitStockId(pb, loc, idxTy);
-  Value sidI32 = pb.create<arith::IndexCastOp>(loc, i32Ty, sidIdx);
+  Value sidI32 = arith::IndexCastOp::create(pb, loc, i32Ty, sidIdx);
   Value numStocks = entry.getArgument(1); // i32
-  Value active = pb.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt,
+  Value active = arith::CmpIOp::create(pb, loc, arith::CmpIPredicate::slt,
                                             sidI32, numStocks);
-  auto ifOp = pb.create<scf::IfOp>(loc, /*resultTypes=*/TypeRange{},
+  auto ifOp = scf::IfOp::create(pb, loc, /*resultTypes=*/TypeRange{},
                                      active, /*withElseRegion=*/false);
 
   // Move all original ops (everything between the prologue we just
@@ -252,7 +252,7 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
 static Value getNumStocksI64(OpBuilder &b, Operation *op, Location loc) {
   Value ns32 = op->getParentOfType<gpu::GPUFuncOp>()
                    .getBody().front().getArgument(1);
-  return b.create<arith::ExtSIOp>(loc, b.getI64Type(), ns32);
+  return arith::ExtSIOp::create(b, loc, b.getI64Type(), ns32);
 }
 static Value getCurrentTimeIdx(Operation *op) {
   auto fOp = op->getParentOfType<scf::ForOp>();
@@ -264,14 +264,14 @@ static Value gmemGEPWithOffset(OpBuilder &b, Location loc, Type elemTy,
                                 LLVM::LLVMPointerType ptrTy, Value basePt,
                                 Value timeIdx, Value offsetIdx,
                                 Value numStocksI64, Type idxTy, Type i64Ty) {
-  Value effIdx = offsetIdx ? b.create<arith::SubIOp>(loc, timeIdx, offsetIdx).getResult()
+  Value effIdx = offsetIdx ? arith::SubIOp::create(b, loc, timeIdx, offsetIdx).getResult()
                             : timeIdx;
-  Value tI64   = b.create<arith::IndexCastOp>(loc, i64Ty, effIdx);
+  Value tI64   = arith::IndexCastOp::create(b, loc, i64Ty, effIdx);
   Value sid    = emitStockId(b, loc, idxTy);
-  Value sidI64 = b.create<arith::IndexCastOp>(loc, i64Ty, sid);
-  Value lin    = b.create<arith::AddIOp>(
-      loc, b.create<arith::MulIOp>(loc, tI64, numStocksI64), sidI64);
-  return b.create<LLVM::GEPOp>(loc, ptrTy, elemTy, basePt, ValueRange{lin});
+  Value sidI64 = arith::IndexCastOp::create(b, loc, i64Ty, sid);
+  Value lin    = arith::AddIOp::create(
+      b, loc, arith::MulIOp::create(b, loc, tI64, numStocksI64), sidI64);
+  return LLVM::GEPOp::create(b, loc, ptrTy, elemTy, basePt, ValueRange{lin});
 }
 
 //===----------------------------------------------------------------------===//
@@ -307,15 +307,15 @@ struct TimeLbPattern : OpConversionPattern<TimeLbOp> {
     auto fn = op->getParentOfType<gpu::GPUFuncOp>();
     Value chunkSize = fn.getBody().front().getArgument(3);
     Value warmup    = fn.getBody().front().getArgument(4);
-    Value cyIdx = rewriter.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::y);
-    Value cy = rewriter.create<arith::IndexCastOp>(loc, i32Ty, cyIdx);
-    Value c0 = rewriter.create<arith::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(0));
-    Value isFirst = rewriter.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::eq, cy, c0);
-    Value off = rewriter.create<arith::MulIOp>(loc, cy, chunkSize);
-    Value offMinusW = rewriter.create<arith::SubIOp>(loc, off, warmup);
-    Value lbI32 = rewriter.create<arith::SelectOp>(loc, isFirst, c0, offMinusW);
+    Value cyIdx = gpu::BlockIdOp::create(rewriter, loc, idxTy, gpu::Dimension::y);
+    Value cy = arith::IndexCastOp::create(rewriter, loc, i32Ty, cyIdx);
+    Value c0 = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value isFirst = arith::CmpIOp::create(
+        rewriter, loc, arith::CmpIPredicate::eq, cy, c0);
+    Value off = arith::MulIOp::create(rewriter, loc, cy, chunkSize);
+    Value offMinusW = arith::SubIOp::create(rewriter, loc, off, warmup);
+    Value lbI32 = arith::SelectOp::create(rewriter, loc, isFirst, c0, offMinusW);
     rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, idxTy, lbI32);
     return success();
   }
@@ -335,13 +335,13 @@ struct TimeUbPattern : OpConversionPattern<TimeUbOp> {
     auto fn = op->getParentOfType<gpu::GPUFuncOp>();
     Value timeLen   = fn.getBody().front().getArgument(0);
     Value chunkSize = fn.getBody().front().getArgument(3);
-    Value cyIdx = rewriter.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::y);
-    Value cy = rewriter.create<arith::IndexCastOp>(loc, i32Ty, cyIdx);
-    Value c1 = rewriter.create<arith::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(1));
-    Value next = rewriter.create<arith::AddIOp>(loc, cy, c1);
-    Value end = rewriter.create<arith::MulIOp>(loc, next, chunkSize);
-    Value ubI32 = rewriter.create<arith::MinUIOp>(loc, end, timeLen);
+    Value cyIdx = gpu::BlockIdOp::create(rewriter, loc, idxTy, gpu::Dimension::y);
+    Value cy = arith::IndexCastOp::create(rewriter, loc, i32Ty, cyIdx);
+    Value c1 = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value next = arith::AddIOp::create(rewriter, loc, cy, c1);
+    Value end = arith::MulIOp::create(rewriter, loc, next, chunkSize);
+    Value ubI32 = arith::MinUIOp::create(rewriter, loc, end, timeLen);
     rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, idxTy, ubI32);
     return success();
   }
@@ -369,7 +369,7 @@ static Value getOrCreateMask(Operation *op, ChunkCtxMap &map,
 
   OpBuilder::InsertionGuard g(rewriter);
   rewriter.setInsertionPointToStart(&fn.getBody().front());
-  ctx.mask = rewriter.create<arith::IndexCastOp>(loc, rewriter.getIndexType(),
+  ctx.mask = arith::IndexCastOp::create(rewriter, loc, rewriter.getIndexType(),
                                                     maskI32);
   return ctx.mask;
 }
@@ -394,15 +394,15 @@ static Value getOrCreateWriteStart(Operation *op, ChunkCtxMap &map,
   rewriter.setInsertionPointToStart(&entry);
   auto i32Ty = rewriter.getI32Type();
   auto idxTy = rewriter.getIndexType();
-  Value cyIdx = rewriter.create<gpu::BlockIdOp>(loc, idxTy, gpu::Dimension::y);
-  Value cy = rewriter.create<arith::IndexCastOp>(loc, i32Ty, cyIdx);
-  Value c0 = rewriter.create<arith::ConstantOp>(
-      loc, i32Ty, rewriter.getI32IntegerAttr(0));
-  Value isFirst = rewriter.create<arith::CmpIOp>(
-      loc, arith::CmpIPredicate::eq, cy, c0);
-  Value off = rewriter.create<arith::MulIOp>(loc, cy, chunkSizeI32);
-  Value wsI32 = rewriter.create<arith::SelectOp>(loc, isFirst, maskI32, off);
-  ctx.writeStart = rewriter.create<arith::IndexCastOp>(loc, idxTy, wsI32);
+  Value cyIdx = gpu::BlockIdOp::create(rewriter, loc, idxTy, gpu::Dimension::y);
+  Value cy = arith::IndexCastOp::create(rewriter, loc, i32Ty, cyIdx);
+  Value c0 = arith::ConstantOp::create(
+      rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+  Value isFirst = arith::CmpIOp::create(
+      rewriter, loc, arith::CmpIPredicate::eq, cy, c0);
+  Value off = arith::MulIOp::create(rewriter, loc, cy, chunkSizeI32);
+  Value wsI32 = arith::SelectOp::create(rewriter, loc, isFirst, maskI32, off);
+  ctx.writeStart = arith::IndexCastOp::create(rewriter, loc, idxTy, wsI32);
   return ctx.writeStart;
 }
 
@@ -480,32 +480,32 @@ struct WindowedTempPattern : OpConversionPattern<WindowedTempOp> {
         OpBuilder::InsertionGuard g(rewriter);
         Block *modBody = &gpuModule.getBodyRegion().front();
         rewriter.setInsertionPoint(modBody, modBody->begin());
-        rewriter.create<LLVM::GlobalOp>(
-            loc, LLVM::LLVMArrayType::get(elemTy, N * blockSize), false,
+        LLVM::GlobalOp::create(
+            rewriter, loc, LLVM::LLVMArrayType::get(elemTy, N * blockSize), false,
             LLVM::Linkage::Internal, name, Attribute{}, 0, 3);
       }
-      Value raw = rewriter.create<LLVM::AddressOfOp>(
-          loc, LLVM::LLVMPointerType::get(ctx, 3), name);
-      Value gen    = rewriter.create<LLVM::AddrSpaceCastOp>(loc, ptrTy, raw);
-      Value tid    = rewriter.create<gpu::ThreadIdOp>(loc, idxTy, gpu::Dimension::x);
-      Value tidI32 = rewriter.create<arith::IndexCastOp>(loc, i32Ty, tid);
+      Value raw = LLVM::AddressOfOp::create(
+          rewriter, loc, LLVM::LLVMPointerType::get(ctx, 3), name);
+      Value gen    = LLVM::AddrSpaceCastOp::create(rewriter, loc, ptrTy, raw);
+      Value tid    = gpu::ThreadIdOp::create(rewriter, loc, idxTy, gpu::Dimension::x);
+      Value tidI32 = arith::IndexCastOp::create(rewriter, loc, i32Ty, tid);
       // bufPtr = smem + tid  (slot-major: slot j thread t lives at j*K + t)
-      bufPtr = rewriter.create<LLVM::GEPOp>(loc, ptrTy, elemTy, gen,
+      bufPtr = LLVM::GEPOp::create(rewriter, loc, ptrTy, elemTy, gen,
                                              ValueRange{tidI32});
     } else {
       stride = 1;
-      Value nCst = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(N));
-      bufPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, elemTy, nCst);
+      Value nCst = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      bufPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, elemTy, nCst);
     }
 
     // Single i32 cell tracking next-writable position; init to 0.
-    Value c1_i32 = rewriter.create<LLVM::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(1));
-    Value posPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, i32Ty, c1_i32);
-    Value zeroI32 = rewriter.create<LLVM::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(0));
-    rewriter.create<LLVM::StoreOp>(loc, zeroI32, posPtr);
+    Value c1_i32 = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value posPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, i32Ty, c1_i32);
+    Value zeroI32 = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    LLVM::StoreOp::create(rewriter, loc, zeroI32, posPtr);
 
     // Side state, keyed on the original (pre-replacement) ts Value.
     descMap[op.getResult()] = {posPtr, stride};
@@ -548,12 +548,12 @@ struct AccumulatorPattern : OpConversionPattern<kungpu::AccumulatorOp> {
       OpBuilder::InsertionGuard g(rewriter);
       Block &entry = fn.getBody().front();
       rewriter.setInsertionPointToStart(&entry);
-      Value c1_i32 = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(1));
-      bufPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, elemTy, c1_i32);
-      Value zero = rewriter.create<LLVM::ConstantOp>(
-          loc, elemTy, rewriter.getZeroAttr(elemTy));
-      rewriter.create<LLVM::StoreOp>(loc, zero, bufPtr);
+      Value c1_i32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      bufPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, elemTy, c1_i32);
+      Value zero = LLVM::ConstantOp::create(
+          rewriter, loc, elemTy, rewriter.getZeroAttr(elemTy));
+      LLVM::StoreOp::create(rewriter, loc, zero, bufPtr);
     }
 
     // posPtr = null → ts.get / ts.put treat as accumulator (slot 0 only).
@@ -568,9 +568,9 @@ static Value applyStride(OpBuilder &b, Location loc, Value idx, int64_t stride,
                           Type i32Ty) {
   if (stride == 1)
     return idx;
-  Value k = b.create<LLVM::ConstantOp>(loc, i32Ty,
+  Value k = LLVM::ConstantOp::create(b, loc, i32Ty,
                                         b.getI32IntegerAttr(stride));
-  return b.create<LLVM::MulOp>(loc, idx, k);
+  return LLVM::MulOp::create(b, loc, idx, k);
 }
 
 struct TsGetPattern : OpConversionPattern<TsGetOp> {
@@ -615,32 +615,32 @@ struct TsGetPattern : OpConversionPattern<TsGetOp> {
       //   return buf[idx * stride]
       int64_t N = static_cast<int64_t>(
           llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
-      Value pos    = rewriter.create<LLVM::LoadOp>(loc, i32Ty, desc.posPtr);
-      Value c1     = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(1));
-      Value nCst   = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(N));
-      Value adj    = rewriter.create<LLVM::AddOp>(loc, offsetI32, c1);
-      Value cmp    = rewriter.create<LLVM::ICmpOp>(loc, LLVM::ICmpPredicate::uge,
+      Value pos    = LLVM::LoadOp::create(rewriter, loc, i32Ty, desc.posPtr);
+      Value c1     = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value nCst   = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      Value adj    = LLVM::AddOp::create(rewriter, loc, offsetI32, c1);
+      Value cmp    = LLVM::ICmpOp::create(rewriter, loc, LLVM::ICmpPredicate::uge,
                                                     pos, adj);
-      Value posMinusAdj = rewriter.create<LLVM::SubOp>(loc, pos, adj);
-      Value posPlusN    = rewriter.create<LLVM::AddOp>(loc, pos, nCst);
-      Value wrapped     = rewriter.create<LLVM::SubOp>(loc, posPlusN, adj);
-      Value idx32       = rewriter.create<LLVM::SelectOp>(
-          loc, cmp, posMinusAdj, wrapped);
+      Value posMinusAdj = LLVM::SubOp::create(rewriter, loc, pos, adj);
+      Value posPlusN    = LLVM::AddOp::create(rewriter, loc, pos, nCst);
+      Value wrapped     = LLVM::SubOp::create(rewriter, loc, posPlusN, adj);
+      Value idx32       = LLVM::SelectOp::create(
+          rewriter, loc, cmp, posMinusAdj, wrapped);
       // LLVM GEP accepts any integer index type — keep it i32 to avoid the
       // 64-bit ops that are slow on GPUs.
       Value gepIdx = applyStride(rewriter, loc, idx32, desc.stride, i32Ty);
-      Value gep = rewriter.create<LLVM::GEPOp>(
-          loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
+      Value gep = LLVM::GEPOp::create(
+          rewriter, loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
       rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
     } else {
       // ── global ts (function arg, TxS layout) ──────────────────────
       //   effective time = (enclosing scf.for iv) − offset
       //   load gmem[effTime * num_stocks + stock_id]
       Value timeIdx = getCurrentTimeIdx(op);
-      Value offsetIdx = rewriter.create<arith::IndexCastOp>(
-          loc, idxTy, offsetI32);
+      Value offsetIdx = arith::IndexCastOp::create(
+          rewriter, loc, idxTy, offsetI32);
       Value gep = gmemGEPWithOffset(rewriter, loc, elemTy, ptrTy, tsPtr,
                                      timeIdx, offsetIdx,
                                      getNumStocksI64(rewriter, op, loc),
@@ -678,7 +678,7 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
       const WTDesc &desc = it->second;
       // ── accumulator: single-slot store, no pos counter to advance. ─
       if (!desc.posPtr) {
-        rewriter.create<LLVM::StoreOp>(loc, v, tsPtr);
+        LLVM::StoreOp::create(rewriter, loc, v, tsPtr);
         rewriter.eraseOp(op);
         return success();
       }
@@ -687,25 +687,25 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
       //   pos = (pos + 1 >= N) ? 0 : pos + 1
       int64_t N = static_cast<int64_t>(
           llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
-      Value pos = rewriter.create<LLVM::LoadOp>(loc, i32Ty, desc.posPtr);
+      Value pos = LLVM::LoadOp::create(rewriter, loc, i32Ty, desc.posPtr);
 
       // Keep GEP index in i32 (cheap on GPU); LLVM accepts any int type.
       Value gepIdx = applyStride(rewriter, loc, pos, desc.stride, i32Ty);
-      Value gep = rewriter.create<LLVM::GEPOp>(
-          loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
-      rewriter.create<LLVM::StoreOp>(loc, v, gep);
-
-      Value c1     = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(1));
-      Value nCst   = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(N));
-      Value zero32 = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(0));
-      Value posP1  = rewriter.create<LLVM::AddOp>(loc, pos, c1);
-      Value cmp    = rewriter.create<LLVM::ICmpOp>(loc, LLVM::ICmpPredicate::uge,
+      Value gep = LLVM::GEPOp::create(
+          rewriter, loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
+      LLVM::StoreOp::create(rewriter, loc, v, gep);
+
+      Value c1     = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value nCst   = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      Value zero32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+      Value posP1  = LLVM::AddOp::create(rewriter, loc, pos, c1);
+      Value cmp    = LLVM::ICmpOp::create(rewriter, loc, LLVM::ICmpPredicate::uge,
                                                     posP1, nCst);
-      Value newPos = rewriter.create<LLVM::SelectOp>(loc, cmp, zero32, posP1);
-      rewriter.create<LLVM::StoreOp>(loc, newPos, desc.posPtr);
+      Value newPos = LLVM::SelectOp::create(rewriter, loc, cmp, zero32, posP1);
+      LLVM::StoreOp::create(rewriter, loc, newPos, desc.posPtr);
       rewriter.eraseOp(op);
     } else {
       // ── global ts: write at current time, gated by per-chunk write_start,
@@ -722,19 +722,19 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
       Value writeStart = getOrCreateWriteStart(op, chunkCtx, rewriter);
       Value mask       = getOrCreateMask(op, chunkCtx, rewriter);
 
-      Value doWrite = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::sge, timeIdx, writeStart);
-      auto ifOp = rewriter.create<scf::IfOp>(
-          loc, /*resultTypes=*/TypeRange{}, doWrite,
+      Value doWrite = arith::CmpIOp::create(
+          rewriter, loc, arith::CmpIPredicate::sge, timeIdx, writeStart);
+      auto ifOp = scf::IfOp::create(
+          rewriter, loc, /*resultTypes=*/TypeRange{}, doWrite,
           /*withElseRegion=*/false);
 
       OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
-      Value tOut = ib.create<arith::SubIOp>(loc, timeIdx, mask);
+      Value tOut = arith::SubIOp::create(ib, loc, timeIdx, mask);
       Value gep = gmemGEPWithOffset(ib, loc, elemTy, ptrTy, tsPtr,
                                      tOut, /*offsetIdx=*/Value(),
                                      getNumStocksI64(ib, op, loc),
                                      idxTy, i64Ty);
-      ib.create<LLVM::StoreOp>(loc, v, gep);
+      LLVM::StoreOp::create(ib, loc, v, gep);
       rewriter.eraseOp(op);
     }
     return success();
@@ -777,7 +777,6 @@ struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
     auto *ctx    = op.getContext();
     Location loc = op.getLoc();
     auto i32Ty   = rewriter.getI32Type();
-    auto idxTy   = rewriter.getIndexType();
     auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
 
     auto resultTy = op.getResult().getType();
@@ -801,30 +800,30 @@ struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
       OpBuilder::InsertionGuard g(rewriter);
       Block &entry = fn.getBody().front();
       rewriter.setInsertionPointToStart(&entry);
-      Value c1_i32 = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(1));
-      Value zeroF = rewriter.create<LLVM::ConstantOp>(
-          loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
-      Value windowI32 = rewriter.create<LLVM::ConstantOp>(
-          loc, i32Ty, rewriter.getI32IntegerAttr(window));
-
-      vPtr    = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, floatTy, c1_i32);
-      addPtr  = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, floatTy, c1_i32);
-      subPtr  = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, floatTy, c1_i32);
-      nansPtr = rewriter.create<LLVM::AllocaOp>(loc, ptrTy, i32Ty,   c1_i32);
-
-      rewriter.create<LLVM::StoreOp>(loc, zeroF,     vPtr);
-      rewriter.create<LLVM::StoreOp>(loc, zeroF,     addPtr);
-      rewriter.create<LLVM::StoreOp>(loc, zeroF,     subPtr);
-      rewriter.create<LLVM::StoreOp>(loc, windowI32, nansPtr);
+      Value c1_i32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value zeroF = LLVM::ConstantOp::create(
+          rewriter, loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
+      Value windowI32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(window));
+
+      vPtr    = LLVM::AllocaOp::create(rewriter, loc, ptrTy, floatTy, c1_i32);
+      addPtr  = LLVM::AllocaOp::create(rewriter, loc, ptrTy, floatTy, c1_i32);
+      subPtr  = LLVM::AllocaOp::create(rewriter, loc, ptrTy, floatTy, c1_i32);
+      nansPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, i32Ty,   c1_i32);
+
+      LLVM::StoreOp::create(rewriter, loc, zeroF,     vPtr);
+      LLVM::StoreOp::create(rewriter, loc, zeroF,     addPtr);
+      LLVM::StoreOp::create(rewriter, loc, zeroF,     subPtr);
+      LLVM::StoreOp::create(rewriter, loc, windowI32, nansPtr);
     }
 
     // ── 2. Read cur (off=0) and old (off=window, guarded). ─────────
-    Value zeroOff   = rewriter.create<arith::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(0));
-    Value windowOff = rewriter.create<arith::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(window));
-    Value cur = rewriter.create<TsGetOp>(loc, floatTy, origInput, zeroOff);
+    Value zeroOff   = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value windowOff = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(window));
+    Value cur = TsGetOp::create(rewriter, loc, floatTy, origInput, zeroOff);
 
     auto forOp = op->getParentOfType<scf::ForOp>();
     if (!forOp)
@@ -832,47 +831,47 @@ struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
           op, "fast_windowed_sum must be inside a scf.for time loop");
     Value timeIdx   = forOp.getInductionVar();
     Value loopLb    = forOp.getLowerBound();
-    Value localT    = rewriter.create<arith::SubIOp>(loc, timeIdx, loopLb);
-    Value windowIdx = rewriter.create<arith::ConstantIndexOp>(loc, window);
-    Value tGeWindow = rewriter.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::sge, localT, windowIdx);
+    Value localT    = arith::SubIOp::create(rewriter, loc, timeIdx, loopLb);
+    Value windowIdx = arith::ConstantIndexOp::create(rewriter, loc, window);
+    Value tGeWindow = arith::CmpIOp::create(
+        rewriter, loc, arith::CmpIPredicate::sge, localT, windowIdx);
 
-    auto ifOp = rewriter.create<scf::IfOp>(
-        loc, TypeRange{floatTy}, tGeWindow, /*withElseRegion=*/true);
+    auto ifOp = scf::IfOp::create(
+        rewriter, loc, TypeRange{floatTy}, tGeWindow, /*withElseRegion=*/true);
     {
       OpBuilder::InsertionGuard g(rewriter);
       rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
       Value loaded =
-          rewriter.create<TsGetOp>(loc, floatTy, origInput, windowOff);
-      rewriter.create<scf::YieldOp>(loc, loaded);
+          TsGetOp::create(rewriter, loc, floatTy, origInput, windowOff);
+      scf::YieldOp::create(rewriter, loc, loaded);
     }
     {
       OpBuilder::InsertionGuard g(rewriter);
       rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front());
-      Value nanV = rewriter.create<LLVM::ConstantOp>(
-          loc, floatTy,
+      Value nanV = LLVM::ConstantOp::create(
+          rewriter, loc, floatTy,
           rewriter.getFloatAttr(
               floatTy, std::numeric_limits<double>::quiet_NaN()));
-      rewriter.create<scf::YieldOp>(loc, nanV);
+      scf::YieldOp::create(rewriter, loc, nanV);
     }
     Value old = ifOp.getResult(0);
 
     // ── 3. Algorithm step.  All arith is via LLVM ops at this phase. ──
     auto fcmp_isnan = [&](Value x) {
       // isnan(x) ⇔ x != x  (UNE catches NaN, == NaN is false)
-      return rewriter.create<LLVM::FCmpOp>(loc, LLVM::FCmpPredicate::une, x, x);
+      return LLVM::FCmpOp::create(rewriter, loc, LLVM::FCmpPredicate::une, x, x);
     };
     Value oldIsNan = fcmp_isnan(old);
     Value newIsNan = fcmp_isnan(cur);
 
     // Loaded state.
-    Value v       = rewriter.create<LLVM::LoadOp>(loc, floatTy, vPtr);
-    Value compAdd = rewriter.create<LLVM::LoadOp>(loc, floatTy, addPtr);
-    Value compSub = rewriter.create<LLVM::LoadOp>(loc, floatTy, subPtr);
-    Value numNans = rewriter.create<LLVM::LoadOp>(loc, i32Ty,   nansPtr);
+    Value v       = LLVM::LoadOp::create(rewriter, loc, floatTy, vPtr);
+    Value compAdd = LLVM::LoadOp::create(rewriter, loc, floatTy, addPtr);
+    Value compSub = LLVM::LoadOp::create(rewriter, loc, floatTy, subPtr);
+    Value numNans = LLVM::LoadOp::create(rewriter, loc, i32Ty,   nansPtr);
 
-    Value zeroF = rewriter.create<LLVM::ConstantOp>(
-        loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
+    Value zeroF = LLVM::ConstantOp::create(
+        rewriter, loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
 
     // kahanAdd(isnan_small, sum, small, &comp):
     //   y = small - comp;  t = sum + y;
@@ -880,49 +879,49 @@ struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
     //   comp = isnan_small ? comp : newComp;
     //   return t
     auto kahanAdd = [&](Value isnan_small, Value sum, Value small, Value &comp) {
-      Value y     = rewriter.create<LLVM::FSubOp>(loc, small, comp);
-      Value t     = rewriter.create<LLVM::FAddOp>(loc, sum, y);
-      Value tMs   = rewriter.create<LLVM::FSubOp>(loc, t, sum);
-      Value newC  = rewriter.create<LLVM::FSubOp>(loc, tMs, y);
-      comp = rewriter.create<LLVM::SelectOp>(loc, isnan_small, comp, newC);
+      Value y     = LLVM::FSubOp::create(rewriter, loc, small, comp);
+      Value t     = LLVM::FAddOp::create(rewriter, loc, sum, y);
+      Value tMs   = LLVM::FSubOp::create(rewriter, loc, t, sum);
+      Value newC  = LLVM::FSubOp::create(rewriter, loc, tMs, y);
+      comp = LLVM::SelectOp::create(rewriter, loc, isnan_small, comp, newC);
       return t;
     };
 
     // v -= old  (skip when old is NaN)
-    Value negOld = rewriter.create<LLVM::FSubOp>(loc, zeroF, old);
+    Value negOld = LLVM::FSubOp::create(rewriter, loc, zeroF, old);
     Value tSub   = kahanAdd(oldIsNan, v, negOld, compSub);
-    v = rewriter.create<LLVM::SelectOp>(loc, oldIsNan, v, tSub);
+    v = LLVM::SelectOp::create(rewriter, loc, oldIsNan, v, tSub);
 
     // v += cur  (skip when cur is NaN)
     Value tAdd   = kahanAdd(newIsNan, v, cur, compAdd);
-    v = rewriter.create<LLVM::SelectOp>(loc, newIsNan, v, tAdd);
+    v = LLVM::SelectOp::create(rewriter, loc, newIsNan, v, tAdd);
 
     // numNans += (new_is_nan ? 1 : 0) - (old_is_nan ? 1 : 0)
-    Value oneI32  = rewriter.create<LLVM::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(1));
-    Value zeroI32 = rewriter.create<LLVM::ConstantOp>(
-        loc, i32Ty, rewriter.getI32IntegerAttr(0));
-    Value oldDelta = rewriter.create<LLVM::SelectOp>(
-        loc, oldIsNan, oneI32, zeroI32);
-    Value newDelta = rewriter.create<LLVM::SelectOp>(
-        loc, newIsNan, oneI32, zeroI32);
-    numNans = rewriter.create<LLVM::SubOp>(loc, numNans, oldDelta);
-    numNans = rewriter.create<LLVM::AddOp>(loc, numNans, newDelta);
+    Value oneI32  = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value zeroI32 = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value oldDelta = LLVM::SelectOp::create(
+        rewriter, loc, oldIsNan, oneI32, zeroI32);
+    Value newDelta = LLVM::SelectOp::create(
+        rewriter, loc, newIsNan, oneI32, zeroI32);
+    numNans = LLVM::SubOp::create(rewriter, loc, numNans, oldDelta);
+    numNans = LLVM::AddOp::create(rewriter, loc, numNans, newDelta);
 
     // result = (numNans == 0) ? v : NaN
-    Value isFull = rewriter.create<LLVM::ICmpOp>(
-        loc, LLVM::ICmpPredicate::eq, numNans, zeroI32);
-    Value nanV = rewriter.create<LLVM::ConstantOp>(
-        loc, floatTy,
+    Value isFull = LLVM::ICmpOp::create(
+        rewriter, loc, LLVM::ICmpPredicate::eq, numNans, zeroI32);
+    Value nanV = LLVM::ConstantOp::create(
+        rewriter, loc, floatTy,
         rewriter.getFloatAttr(floatTy,
                                 std::numeric_limits<double>::quiet_NaN()));
-    Value out = rewriter.create<LLVM::SelectOp>(loc, isFull, v, nanV);
+    Value out = LLVM::SelectOp::create(rewriter, loc, isFull, v, nanV);
 
     // ── 4. Store back state. ────────────────────────────────────────
-    rewriter.create<LLVM::StoreOp>(loc, v,       vPtr);
-    rewriter.create<LLVM::StoreOp>(loc, compAdd, addPtr);
-    rewriter.create<LLVM::StoreOp>(loc, compSub, subPtr);
-    rewriter.create<LLVM::StoreOp>(loc, numNans, nansPtr);
+    LLVM::StoreOp::create(rewriter, loc, v,       vPtr);
+    LLVM::StoreOp::create(rewriter, loc, compAdd, addPtr);
+    LLVM::StoreOp::create(rewriter, loc, compSub, subPtr);
+    LLVM::StoreOp::create(rewriter, loc, numNans, nansPtr);
 
     rewriter.replaceOp(op, out);
     return success();
@@ -957,7 +956,7 @@ struct ConvertKunGpuToLLVMPass
     });
     auto materialize = [](OpBuilder &b, Type t, ValueRange vs, Location l) -> Value {
       if (vs.size() != 1) return Value();
-      return b.create<UnrealizedConversionCastOp>(l, t, vs).getResult(0);
+      return UnrealizedConversionCastOp::create(b, l, t, vs).getResult(0);
     };
     typeConv.addSourceMaterialization(materialize);
     typeConv.addTargetMaterialization(materialize);
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 3f5d6d6..a9b520b 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -457,22 +457,22 @@ void ForEachBackWindowOp::print(OpAsmPrinter &printer) {
 //===----------------------------------------------------------------------===//
 
 Value AddOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::AddFOp>(loc, lhs, rhs);
+  return arith::AddFOp::create(b, loc, lhs, rhs);
 }
 Value SubOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::SubFOp>(loc, lhs, rhs);
+  return arith::SubFOp::create(b, loc, lhs, rhs);
 }
 Value MulOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::MulFOp>(loc, lhs, rhs);
+  return arith::MulFOp::create(b, loc, lhs, rhs);
 }
 Value DivOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::DivFOp>(loc, lhs, rhs);
+  return arith::DivFOp::create(b, loc, lhs, rhs);
 }
 Value MaxOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::MaximumFOp>(loc, lhs, rhs);
+  return arith::MaximumFOp::create(b, loc, lhs, rhs);
 }
 Value MinOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::MinimumFOp>(loc, lhs, rhs);
+  return arith::MinimumFOp::create(b, loc, lhs, rhs);
 }
 
 // Comparison ops: dispatch arith.cmpf for FloatType operands and
@@ -481,8 +481,8 @@ static Value buildCmpScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs,
                               arith::CmpFPredicate fp,
                               arith::CmpIPredicate ip) {
   if (llvm::isa<FloatType>(lhs.getType()))
-    return b.create<arith::CmpFOp>(loc, fp, lhs, rhs);
-  return b.create<arith::CmpIOp>(loc, ip, lhs, rhs);
+    return arith::CmpFOp::create(b, loc, fp, lhs, rhs);
+  return arith::CmpIOp::create(b, loc, ip, lhs, rhs);
 }
 Value GreaterOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
   return buildCmpScalarOp(b, loc, lhs, rhs,
@@ -507,10 +507,10 @@ Value EqualOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
 
 // Logical binary ops on i1.
 Value AndOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::AndIOp>(loc, lhs, rhs);
+  return arith::AndIOp::create(b, loc, lhs, rhs);
 }
 Value OrOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
-  return b.create<arith::OrIOp>(loc, lhs, rhs);
+  return arith::OrIOp::create(b, loc, lhs, rhs);
 }
 
 //===----------------------------------------------------------------------===//
@@ -518,22 +518,22 @@ Value OrOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
 //===----------------------------------------------------------------------===//
 
 Value AbsOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
-  return b.create<math::AbsFOp>(loc, operand);
+  return math::AbsFOp::create(b, loc, operand);
 }
 Value LogOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
-  return b.create<math::LogOp>(loc, operand);
+  return math::LogOp::create(b, loc, operand);
 }
 Value SignOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
   // sign(x) ≈ copysign(1.0, x)
-  Value one = b.create<arith::ConstantOp>(
-      loc, operand.getType(), b.getFloatAttr(operand.getType(), 1.0));
-  return b.create<math::CopySignOp>(loc, one, operand);
+  Value one = arith::ConstantOp::create(
+      b, loc, operand.getType(), b.getFloatAttr(operand.getType(), 1.0));
+  return math::CopySignOp::create(b, loc, one, operand);
 }
 Value NotOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
   // not(x) = x ^ 1 on i1
-  Value one = b.create<arith::ConstantOp>(loc, b.getI1Type(),
+  Value one = arith::ConstantOp::create(b, loc, b.getI1Type(),
                                             b.getIntegerAttr(b.getI1Type(), 1));
-  return b.create<arith::XOrIOp>(loc, operand, one);
+  return arith::XOrIOp::create(b, loc, operand, one);
 }
 
 //===----------------------------------------------------------------------===//
@@ -544,28 +544,28 @@ TypedAttr ReduceAddOp::getInitValue(FloatType elemType) {
   return FloatAttr::get(elemType, 0.0);
 }
 Value ReduceAddOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
-  return b.create<arith::AddFOp>(loc, acc, elem);
+  return arith::AddFOp::create(b, loc, acc, elem);
 }
 
 TypedAttr ReduceMulOp::getInitValue(FloatType elemType) {
   return FloatAttr::get(elemType, 1.0);
 }
 Value ReduceMulOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
-  return b.create<arith::MulFOp>(loc, acc, elem);
+  return arith::MulFOp::create(b, loc, acc, elem);
 }
 
 TypedAttr ReduceMaxOp::getInitValue(FloatType elemType) {
   return FloatAttr::get(elemType, -std::numeric_limits<double>::infinity());
 }
 Value ReduceMaxOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
-  return b.create<arith::MaximumFOp>(loc, acc, elem);
+  return arith::MaximumFOp::create(b, loc, acc, elem);
 }
 
 TypedAttr ReduceMinOp::getInitValue(FloatType elemType) {
   return FloatAttr::get(elemType, std::numeric_limits<double>::infinity());
 }
 Value ReduceMinOp::buildAccumOp(OpBuilder &b, Location loc, Value acc, Value elem) {
-  return b.create<arith::MinimumFOp>(loc, acc, elem);
+  return arith::MinimumFOp::create(b, loc, acc, elem);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index b37ef90..d369c64 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -103,7 +103,7 @@ struct LowerHelper {
       return emitError(loc,
           "kunir-to-kungpu: value is not a registered ts handle in tsMap");
     auto tsTy = llvm::cast<TsType>(v.getType());
-    return b.create<TsGetOp>(loc, tsTy.getElementType(),
+    return TsGetOp::create(b, loc, tsTy.getElementType(),
                               it->second, offsetI32).getResult();
   }
 
@@ -147,7 +147,7 @@ struct LowerHelper {
         KUN_ASSIGN_OR_FAIL(Value tv,   getScalar(sel.getTrueValue(), b, ol));
         KUN_ASSIGN_OR_FAIL(Value fv,   getScalar(sel.getFalseValue(),b, ol));
         scalarMap[sel.getResult()] =
-            b.create<arith::SelectOp>(ol, cond, tv, fv).getResult();
+            arith::SelectOp::create(b, ol, cond, tv, fv).getResult();
       } else if (auto br = dyn_cast<BackRefOp>(op)) {
         // Warmup guard:  if   t - outer_loop_lb < window  →  NaN
         //                else                            →  ts.get(window)
@@ -174,30 +174,30 @@ struct LowerHelper {
                               "warmup guard)");
 
         Value delta =
-            b.create<arith::SubIOp>(ol, outerTimeIdx, outerLoopLb);
+            arith::SubIOp::create(b, ol, outerTimeIdx, outerLoopLb);
         Value windowIdx =
-            b.create<arith::ConstantIndexOp>(ol, window);
-        Value inSteady = b.create<arith::CmpIOp>(
-            ol, arith::CmpIPredicate::sge, delta, windowIdx);
-        auto ifOp = b.create<scf::IfOp>(ol, TypeRange{floatTy}, inSteady,
+            arith::ConstantIndexOp::create(b, ol, window);
+        Value inSteady = arith::CmpIOp::create(
+            b, ol, arith::CmpIPredicate::sge, delta, windowIdx);
+        auto ifOp = scf::IfOp::create(b, ol, TypeRange{floatTy}, inSteady,
                                           /*withElseRegion=*/true);
         {
           OpBuilder ib =
               OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
-          Value offset = ib.create<arith::ConstantOp>(
-              ol, ib.getI32Type(), ib.getI32IntegerAttr(window));
+          Value offset = arith::ConstantOp::create(
+              ib, ol, ib.getI32Type(), ib.getI32IntegerAttr(window));
           KUN_ASSIGN_OR_FAIL(Value loaded,
               getScalarUncached(br.getInput(), offset, ib, ol));
-          ib.create<scf::YieldOp>(ol, loaded);
+          scf::YieldOp::create(ib, ol, loaded);
         }
         {
           OpBuilder ib =
               OpBuilder::atBlockBegin(&ifOp.getElseRegion().front());
           llvm::APFloat qnan =
               llvm::APFloat::getQNaN(floatTy.getFloatSemantics());
-          Value nanV = ib.create<arith::ConstantOp>(
-              ol, floatTy, FloatAttr::get(floatTy, qnan));
-          ib.create<scf::YieldOp>(ol, nanV);
+          Value nanV = arith::ConstantOp::create(
+              ib, ol, floatTy, FloatAttr::get(floatTy, qnan));
+          scf::YieldOp::create(ib, ol, nanV);
         }
         scalarMap[br.getResult()] = ifOp.getResult(0);
       } else if (auto co = dyn_cast<ConstantOp>(op)) {
@@ -211,8 +211,8 @@ struct LowerHelper {
           apv.convert(ft.getFloatSemantics(),
                       llvm::APFloat::rmNearestTiesToEven, &losesInfo);
         }
-        scalarMap[co.getResult()] = b.create<arith::ConstantOp>(
-            ol, elemTy, b.getFloatAttr(elemTy, apv));
+        scalarMap[co.getResult()] = arith::ConstantOp::create(
+            b, ol, elemTy, b.getFloatAttr(elemTy, apv));
       } else if (handleUnknown) {
         if (failed(handleUnknown(*op))) return failure();
       } else {
@@ -310,16 +310,16 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   // convert-kungpu-to-llvm and are read at lowering time.  When the
   // caller's launcher uses num_chunks = 1 it sets chunk_size =
   // time_length so chunk 0 covers the full range.
-  Value lb = b.create<TimeLbOp>(loc, b.getIndexType());
-  Value ub = b.create<TimeUbOp>(loc, b.getIndexType());
-  Value c0 = b.create<arith::ConstantIndexOp>(loc, 0);
-  Value c1 = b.create<arith::ConstantIndexOp>(loc, 1);
+  Value lb = TimeLbOp::create(b, loc, b.getIndexType());
+  Value ub = TimeUbOp::create(b, loc, b.getIndexType());
+  Value c0 = arith::ConstantIndexOp::create(b, loc, 0);
+  Value c1 = arith::ConstantIndexOp::create(b, loc, 1);
   // Outer-loop ts.get/put always reference the current time step, i.e.
   // tail-relative offset = 0 (i32).  Created before outerFor so it dominates
   // every use inside the loop body.
-  Value zeroOffsetI32 = b.create<arith::ConstantOp>(
-      loc, b.getI32Type(), b.getI32IntegerAttr(0));
-  auto outerFor = b.create<scf::ForOp>(loc, lb, ub, c1);
+  Value zeroOffsetI32 = arith::ConstantOp::create(
+      b, loc, b.getI32Type(), b.getI32IntegerAttr(0));
+  auto outerFor = scf::ForOp::create(b, loc, lb, ub, c1);
 
   // Erase the implicit empty scf.yield (no iter_args → zero-operand yield).
   outerFor.getBody()->back().erase();
@@ -358,11 +358,11 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     // windowed_output → allocate windowed_temp outside the loop,
     //                   fill circular buffer at each time step inside.
     if (auto woOp = dyn_cast<WindowedOutputOp>(op)) {
-      auto wt = b.create<WindowedTempOp>(ol, woOp.getResult().getType());
+      auto wt = WindowedTempOp::create(b, ol, woOp.getResult().getType());
       outer.tsMap[woOp.getResult()] = wt.getResult();
       KUN_ASSIGN_OR_FAIL(Value inputScalar,
                          outer.getScalar(woOp.getInput(), fb, ol));
-      fb.create<TsPutOp>(ol, wt.getResult(), inputScalar);
+      TsPutOp::create(fb, ol, wt.getResult(), inputScalar);
       return success();
     }
 
@@ -391,27 +391,27 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
         }
         auto elemTy = llvm::cast<FloatType>(
             llvm::cast<TsType>(defOp->getOperand(0).getType()).getElementType());
-        initVals.push_back(fb.create<arith::ConstantOp>(ol, ri.getInitValue(elemTy)));
+        initVals.push_back(arith::ConstantOp::create(fb, ol, ri.getInitValue(elemTy)));
       }
 
       // Create inner scf.for %w = 0 to window step 1 iter_args(acc_i = init_i).
       // The lambda form lets us emit a proper scf.yield as the body terminator
       // without fighting the implicit yield created by ensureTerminator.
-      Value wBound  = fb.create<arith::ConstantIndexOp>(ol, window);
-      Value wM1_i32 = fb.create<arith::ConstantOp>(
-          ol, fb.getI32Type(), fb.getI32IntegerAttr(window - 1));
+      Value wBound  = arith::ConstantIndexOp::create(fb, ol, window);
+      Value wM1_i32 = arith::ConstantOp::create(
+          fb, ol, fb.getI32Type(), fb.getI32IntegerAttr(window - 1));
 
       // Capture lowerBlock result since the lambda can't return LogicalResult.
       bool innerOk = true;
-      auto innerFor = fb.create<scf::ForOp>(
-          ol, c0, wBound, c1, initVals,
+      auto innerFor = scf::ForOp::create(
+          fb, ol, c0, wBound, c1, initVals,
           [&](OpBuilder &ib, Location il, Value w, ValueRange iterArgs) {
             // Tail-relative offset for this window step.  Iterating w from 0
             // to window-1 reads oldest-to-newest, i.e. offset = window-1-w.
             Value w_i32 =
-                ib.create<arith::IndexCastOp>(il, ib.getI32Type(), w);
+                arith::IndexCastOp::create(ib, il, ib.getI32Type(), w);
             Value windowedOffset =
-                ib.create<arith::SubIOp>(il, wM1_i32, w_i32);
+                arith::SubIOp::create(ib, il, wM1_i32, w_i32);
 
             // Inner LowerHelper inherits the outer tsMap/scalarMap so reads
             // inside the body can still reach outer-scope handles (e.g. a
@@ -434,7 +434,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
                                                 windowedOffset, ib, il);
               if (failed(r)) {
                 innerOk = false;
-                ib.create<scf::YieldOp>(il, initVals);
+                scf::YieldOp::create(ib, il, initVals);
                 return;
               }
               inner.scalarMap[arg] = *r;
@@ -444,14 +444,14 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
 
             if (failed(inner.lowerBlock(body, ib))) {
               innerOk = false;
-              ib.create<scf::YieldOp>(il, initVals); // keep IR structurally valid
+              scf::YieldOp::create(ib, il, initVals); // keep IR structurally valid
               return;
             }
 
             SmallVector<Value> newAccs;
             for (Value yv : yieldOp.getValues())
               newAccs.push_back(inner.scalarMap.find(yv)->second);
-            ib.create<scf::YieldOp>(il, newAccs);
+            scf::YieldOp::create(ib, il, newAccs);
           });
       if (!innerOk) return failure();
 
@@ -466,8 +466,8 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     // loop, like windowed_temp).  Stored in tsMap so that downstream reads
     // (via getScalar → kungpu.ts.get @ offset 0) resolve to the slot.
     if (auto acc = dyn_cast<kunir::AccumulatorOp>(op)) {
-      auto ka = b.create<kungpu::AccumulatorOp>(
-          ol, acc.getResult().getType(), acc.getNameAttr());
+      auto ka = kungpu::AccumulatorOp::create(
+          b, ol, acc.getResult().getType(), acc.getNameAttr());
       outer.tsMap[acc.getResult()] = ka.getResult();
       return success();
     }
@@ -484,10 +484,10 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
                          outer.getScalar(sa.getMask(),  fb, ol));
       KUN_ASSIGN_OR_FAIL(Value valueScalar,
                          outer.getScalar(sa.getValue(), fb, ol));
-      auto ifOp = fb.create<scf::IfOp>(ol, /*resultTypes=*/TypeRange{},
+      auto ifOp = scf::IfOp::create(fb, ol, /*resultTypes=*/TypeRange{},
                                          maskScalar, /*withElseRegion=*/false);
       OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
-      ib.create<TsPutOp>(ol, accIt->second, valueScalar);
+      TsPutOp::create(ib, ol, accIt->second, valueScalar);
       return success();
     }
 
@@ -500,8 +500,8 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       if (inputIt == outer.tsMap.end())
         return op.emitError(
             "kunir-to-kungpu: fast_windowed_sum input must be a ts handle");
-      auto newOp = fb.create<FastWindowedSumOp>(
-          ol, /*resultType=*/inputTs.getElementType(),
+      auto newOp = FastWindowedSumOp::create(
+          fb, ol, /*resultType=*/inputTs.getElementType(),
           /*input=*/inputIt->second, fws.getWindowAttr());
       outer.scalarMap[fws.getResult()] = newOp.getResult();
       return success();
@@ -520,9 +520,9 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     auto it = outer.scalarMap.find(rv);
     assert(it != outer.scalarMap.end() &&
            "ts return value not materialised as a scalar");
-    fb.create<TsPutOp>(loc, outParam, it->second);
+    TsPutOp::create(fb, loc, outParam, it->second);
   }
-  fb.create<scf::YieldOp>(loc);
+  scf::YieldOp::create(fb, loc);
 
   // ------------------------------------------------------------------
   // 7. Insert a replacement return before the original return op.
@@ -532,7 +532,7 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     SmallVector<Value> nonTsRets;
     for (Value v : retOp.getOperands())
       if (!isa<TsType>(v.getType())) nonTsRets.push_back(v);
-    b.create<kunir::ReturnOp>(loc, mlir::ValueRange(nonTsRets));
+    kunir::ReturnOp::create(b, loc, mlir::ValueRange(nonTsRets));
   }
 
   // ------------------------------------------------------------------
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 5ac3a67..8f34f74 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -45,7 +45,7 @@ class IRBuilder {
     b_.setInsertionPointToEnd(pm_->module.get().getBody());
     // One gpu.module per IRBuilder — KunMLIR's pipeline expects exactly
     // one container for all kunir.func ops.
-    gpuMod_ = b_.create<gpu::GPUModuleOp>(loc, "kungpu_kernels");
+    gpuMod_ = gpu::GPUModuleOp::create(b_, loc, "kungpu_kernels");
     b_.setInsertionPointToStart(&gpuMod_.getBodyRegion().front());
   }
 
@@ -112,9 +112,9 @@ class IRBuilder {
     auto target = kunir::TargetSpecAttr::get(ctx, occupancy, warpsPerCta,
                                                 smemSize, vectorSize);
 
-    curFunc_ = b_.create<kunir::FuncOp>(loc, name, funcType, inNamesAttr,
-                                          outNamesAttr, target,
-                                          unreliableCount);
+    curFunc_ = kunir::FuncOp::create(b_, loc, name, funcType, inNamesAttr,
+                                       outNamesAttr, target,
+                                       unreliableCount);
     Block &entry = curFunc_.getBodyBlock();
     b_.setInsertionPointToStart(&entry);
 
@@ -132,7 +132,7 @@ class IRBuilder {
           " for_each_back_window region(s) still open — close them first");
 
     Location loc = b_.getUnknownLoc();
-    b_.create<kunir::ReturnOp>(loc, ValueRange(returnValues));
+    kunir::ReturnOp::create(b_, loc, ValueRange(returnValues));
 
     // Restore insertion point to gpu.module so the next begin_func
     // appends a sibling.
@@ -164,22 +164,22 @@ class IRBuilder {
 
   // ── Select (cond, true_value, false_value) ──────────────────────
   Value selectOp(Value cond, Value tv, Value fv) {
-    return b_.create<kunir::SelectOp>(b_.getUnknownLoc(), cond, tv, fv);
+    return kunir::SelectOp::create(b_, b_.getUnknownLoc(), cond, tv, fv);
   }
 
   // ── Scalar constant lifted to ts<T, 1> ─────────────────────────
   Value constantOp(double value, Type tsTy) {
     auto attr = b_.getF64FloatAttr(value);
-    return b_.create<kunir::ConstantOp>(b_.getUnknownLoc(), tsTy, attr);
+    return kunir::ConstantOp::create(b_, b_.getUnknownLoc(), tsTy, attr);
   }
 
   // ── Accumulator / SetAccumulator ───────────────────────────────
   Value accumulatorOp(std::string name, Type tsTy) {
-    return b_.create<kunir::AccumulatorOp>(b_.getUnknownLoc(), tsTy,
+    return kunir::AccumulatorOp::create(b_, b_.getUnknownLoc(), tsTy,
                                             b_.getStringAttr(name));
   }
   void setAccumulatorOp(Value acc, Value mask, Value value) {
-    b_.create<kunir::SetAccumulatorOp>(b_.getUnknownLoc(), acc, mask, value);
+    kunir::SetAccumulatorOp::create(b_, b_.getUnknownLoc(), acc, mask, value);
   }
 
   // ── Windowed buffer materialization ───────────────────────────────
@@ -187,7 +187,7 @@ class IRBuilder {
     auto inTs = llvm::cast<kunir::TsType>(x.getType());
     auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(),
                                           static_cast<uint64_t>(length));
-    return b_.create<kunir::WindowedOutputOp>(b_.getUnknownLoc(), resultTy, x,
+    return kunir::WindowedOutputOp::create(b_, b_.getUnknownLoc(), resultTy, x,
                                                 length);
   }
 
@@ -195,12 +195,12 @@ class IRBuilder {
   Value backRefOp(Value x, int64_t window) {
     auto inTs = llvm::cast<kunir::TsType>(x.getType());
     auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(), 1);
-    return b_.create<kunir::BackRefOp>(b_.getUnknownLoc(), resultTy, x, window);
+    return kunir::BackRefOp::create(b_, b_.getUnknownLoc(), resultTy, x, window);
   }
   Value fastWindowedSumOp(Value x, int64_t window) {
     auto inTs = llvm::cast<kunir::TsType>(x.getType());
     auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(), 1);
-    return b_.create<kunir::FastWindowedSumOp>(b_.getUnknownLoc(), resultTy, x,
+    return kunir::FastWindowedSumOp::create(b_, b_.getUnknownLoc(), resultTy, x,
                                                  window);
   }
 
@@ -209,7 +209,7 @@ class IRBuilder {
   beginForEachBackWindow(std::vector<Value> inputs, int64_t window,
                             std::vector<Type> resultTypes) {
     Location loc = b_.getUnknownLoc();
-    auto loopOp = b_.create<kunir::ForEachBackWindowOp>(loc, resultTypes,
+    auto loopOp = kunir::ForEachBackWindowOp::create(b_, loc, resultTypes,
                                                           inputs, window);
     // Populate body block: one block arg per input, each ts<elemType, 1>.
     Block *body = new Block;
@@ -234,7 +234,7 @@ class IRBuilder {
       throw std::runtime_error(
           "IRBuilder.end_for_each_back_window: no open loop");
     Location loc = b_.getUnknownLoc();
-    b_.create<kunir::YieldOp>(loc, ValueRange(yieldValues));
+    kunir::YieldOp::create(b_, loc, ValueRange(yieldValues));
 
     auto loopOp = loopStack_.back();
     loopStack_.pop_back();
@@ -272,14 +272,14 @@ class IRBuilder {
 
 private:
   template <typename OpTy> Value makeBin(Value a, Value b) {
-    return b_.create<OpTy>(b_.getUnknownLoc(), a, b);
+    return OpTy::create(b_, b_.getUnknownLoc(), a, b);
   }
   template <typename OpTy> Value makeUn(Value x) {
-    return b_.create<OpTy>(b_.getUnknownLoc(), x);
+    return OpTy::create(b_, b_.getUnknownLoc(), x);
   }
   template <typename OpTy> Value makeReduce(Value x) {
     // SameOperandsAndResultType — pass x's type as the result type.
-    return b_.create<OpTy>(b_.getUnknownLoc(), x.getType(), x);
+    return OpTy::create(b_, b_.getUnknownLoc(), x.getType(), x);
   }
 
   std::unique_ptr<PyModule> pm_;

From 45a2d869f07bc3ddabb2b522656d4164026bee6a Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Fri, 15 May 2026 02:37:30 -0700
Subject: [PATCH 29/59] align with cpu rungraph and compileit. Fix mask time
 length

---
 KunQuant/jit/cuda.py                 | 228 ++++++++++++++-------
 mlir/lib/KunGpu/KunGpuToLLVM.cpp     |  69 ++-----
 mlir/lib/Python/CMakeLists.txt       |  11 +
 mlir/lib/Python/MlirBinding.cpp      | 295 +++++++++++++++++++--------
 mlir/test/python/test_kun_to_cuda.py | 215 ++++++++++++-------
 5 files changed, 537 insertions(+), 281 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 52aebd5..53ac70f 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -5,32 +5,49 @@
 list (`Driver.optimize`) so any IR rewrites the CPU path benefits from
 also apply here — only the codegen layer is replaced.
 
-User entry point::
+The two-tier config split mirrors the CPU path:
+
+  * Per-Function knobs live in `KunCompilerConfig` (the CPU-shared
+    dataclass): `dtype`, `blocking_len`, `partition_factor`,
+    `input_layout` / `output_layout` (TS only on GPU), `options`.
+  * Compile-/link-time knobs live in `CudaCompilerConfig`: `gpu_arch`,
+    `warps_per_cta`, `smem_size`, `occupancy`, `opt_level`,
+    `toolkit_path`.  Shared across every Function in a `Library`.
+
+Single-Function compile::
 
     from KunQuant.jit import KunMLIR
-    from KunQuant.jit.cuda import compileit, CudaCompilerConfig
+    from KunQuant.jit.cuda import compile_func, CudaCompilerConfig
+    from KunQuant.Driver import KunCompilerConfig
 
-    exe = compileit(f, CudaCompilerConfig(gpu_arch="sm_80"))
+    exe = compile_func(f,
+                        KunCompilerConfig(input_layout="TS",
+                                            output_layout="TS"),
+                        CudaCompilerConfig(gpu_arch="sm_80"))
     executor = KunMLIR.Executor()                       # default stream
-    executor.runGraph(exe, {"a": cp_a, "b": cp_b, "out": cp_out})
+    out = executor.runGraph(exe, {"a": cp_a, "b": cp_b})  # length auto-inferred
     executor.synchronize()
 
-Scope (v0):
-  * Single Function in, single kunir.func out.  Multi-Function /
-    auto-partition support is future work.
-  * dtype = "float" only (kunir lowers f32 today).
-  * Layout is implicit: kunir uses the TS-major layout exposed by the
-    runtime (see KunCuda/Runtime.h).
+Multi-Function compile (CPU `cfake.compileit` shape)::
+
+    from KunQuant.jit.cuda import compileit, CudaCompilerConfig
+    from KunQuant.Driver import KunCompilerConfig
+
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS")
+    ccfg = CudaCompilerConfig(gpu_arch="sm_80")
+    lib = compileit([("mod1", f1, kcfg), ("mod2", f2, kcfg)],
+                     "my_lib", ccfg)
+    exe = lib.getModule("mod1")
 """
 
 from __future__ import annotations
 import os
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Tuple
 
 from KunQuant.jit import KunMLIR
 
-from KunQuant.Driver import optimize, post_optimize
+from KunQuant.Driver import KunCompilerConfig, optimize, post_optimize
 from KunQuant.Op import Input, Output, MayRequireWholeTime
 from KunQuant.passes import do_partition
 from KunQuant.passes.InferWindow import infer_window
@@ -96,21 +113,20 @@ def find_cuda_toolkit(override: str = "") -> str:
 
 @dataclass
 class CudaCompilerConfig:
-    """Mirrors the parts of KunCompilerConfig that matter for GPU.
-
-    `dtype`, `gpu_arch`, and the kunir target_spec fields are the only
-    knobs we actually expose.  The CPU-only fields (blocking_len,
-    input_layout, etc.) deliberately do not appear here — they're not
-    meaningful for the GPU path.
+    """Compile- / link-time knobs that are shared across every Function
+    in a `Library`.  Per-Function graph-rewriting knobs (dtype,
+    blocking_len, partition_factor, layout, pass options) live in
+    `KunQuant.Driver.KunCompilerConfig` instead — the same dataclass
+    the CPU path uses.
     """
     gpu_arch:    str = "sm_80"
-    dtype:         str = "float"   # only "float" supported in v0
 
-    # kunir.target_spec — graph-wide for v0.
+    # kunir.target_spec — graph-wide for v0.  `vector_size` is taken
+    # from the per-Function `KunCompilerConfig.blocking_len` at compile
+    # time (the two are the same concept on GPU).
     occupancy:     int = 1
     warps_per_cta: int = 4
     smem_size:     int = 49152
-    vector_size:   int = 1
 
     # LLVM optimization level (forwarded to #nvvm.target<O = ...>).
     opt_level:     int  = 3
@@ -118,47 +134,54 @@ class CudaCompilerConfig:
     # Empty → upstream search: CUDA_HOME / CUDA_PATH / standard locations.
     toolkit_path:  str  = ""
 
-    # Forwarded to `do_partition` — same default as KunCompilerConfig.
-    # Larger factor ⇒ coarser partitions (fewer, bigger kernels).  After
-    # partition each sub-Function becomes one kunir.func inside the
-    # generated gpu.module; intermediate buffers between them are
-    # auto-managed by the runtime's slot pool.
-    partition_factor: int = 3
 
-    # Pass-list options forwarded to optimize().  We seed reasonable GPU
-    # defaults; user-supplied keys override.
-    options:       Optional[dict] = None
+def _resolve_vector_size(kcfg: KunCompilerConfig) -> int:
+    """On GPU `vector_size` (kunir target_spec) is the same as
+    `blocking_len` from the per-Function config.  Default to 1 (scalar
+    kunir) if the user didn't specify."""
+    return 1 if kcfg.blocking_len is None else int(kcfg.blocking_len)
 
 
-def _gpu_pass_options(cfg: CudaCompilerConfig) -> dict:
-    """Defaults for `Driver.optimize`'s `options` dict on the GPU path.
+def _gpu_pass_options(kcfg: KunCompilerConfig) -> dict:
+    """`Driver.optimize`'s `options` dict for the GPU path.
 
-    The CPU compileit() does the same kind of seeding — we replicate the
-    bits that affect graph rewriting.  `blocking_len` is needed by some
-    decompose paths (skip-list cutoff in WindowedMin/Max); we feed it
-    `warps_per_cta * 32 * vector_size`, which matches the GPU's
-    stocks-per-block.
+    `blocking_len` is needed by some decompose paths (skip-list cutoff
+    in WindowedMin/Max).  Everything else — including `no_fast_stat` —
+    is taken verbatim from `kcfg.options`; we do not force `no_fast_stat`
+    here.  If the user wants the GPU-safe default, they should set
+    `no_fast_stat=True` in `kcfg.options` themselves.
     """
-    opts: dict = {
-        "blocking_len":   cfg.warps_per_cta * 32 * cfg.vector_size,
-        # Fast-stat tricks rely on running stats / FMA orderings that
-        # don't map cleanly onto the GPU primitives we lower today.
-        # Keep it off until the corresponding kunir lowerings exist.
-        "no_fast_stat":   True,
-    }
-    if cfg.options:
-        opts.update(cfg.options)
+    opts: dict = {"blocking_len": _resolve_vector_size(kcfg)}
+    if kcfg.options:
+        opts.update(kcfg.options)
     return opts
 
 
 def _to_dtype_token(dtype: str) -> str:
     if dtype == "float":  return "f32"
     if dtype == "double": return "f64"
-    raise ValueError(f"compile_to_cuda: unsupported dtype '{dtype}' "
+    raise ValueError(f"compile_func: unsupported dtype '{dtype}' "
                        f"(supported: float, double — kunir today only "
                        f"lowers float on GPU)")
 
 
+def _validate_kun_cfg(kcfg: KunCompilerConfig) -> None:
+    """Raise ValueError unless both layouts are 'TS' (kunir runtime is
+    TS-major) and `kcfg.dtype` is 'float' or 'double'."""
+    if kcfg.input_layout != "TS":
+        raise ValueError(
+            f"GPU backend only supports input_layout='TS', got "
+            f"{kcfg.input_layout!r}")
+    if kcfg.output_layout != "TS":
+        raise ValueError(
+            f"GPU backend only supports output_layout='TS', got "
+            f"{kcfg.output_layout!r}")
+    if kcfg.dtype not in ("float", "double"):
+        raise ValueError(
+            f"KunCompilerConfig.dtype must be 'float' or 'double', got "
+            f"{kcfg.dtype!r}")
+
+
 def _graph_io_names(f: Function):
     """User-facing graph inputs/outputs.  Captured BEFORE optimize +
     do_partition because those passes mutate `f` and may scatter the
@@ -168,13 +191,13 @@ def _graph_io_names(f: Function):
     ins  = [op.attrs["name"] for op in f.ops if isinstance(op, Input)]
     outs = [op.attrs["name"] for op in f.ops if isinstance(op, Output)]
     if not ins:
-        raise ValueError("CudaCompilerConfig: function has no Input ops")
+        raise ValueError("compile_func: function has no Input ops")
     if not outs:
-        raise ValueError("CudaCompilerConfig: function has no Output ops")
+        raise ValueError("compile_func: function has no Output ops")
     return ins, outs
 
 
-def _run_full_pipeline(f: Function, cfg: CudaCompilerConfig):
+def _run_full_pipeline(f: Function, kcfg: KunCompilerConfig):
     """Same pass pipeline the CPU `compileit` runs:
 
         optimize  →  do_partition  →  post_optimize
@@ -182,14 +205,15 @@ def _run_full_pipeline(f: Function, cfg: CudaCompilerConfig):
     Returns the list of post-partition Functions that the translator
     should walk (one kunir.func per Function).  Mutates `f` in place.
     """
-    options = _gpu_pass_options(cfg)
+    options = _gpu_pass_options(kcfg)
     optimize(f, options)
-    _mainf, impl = do_partition(f, cfg.partition_factor, options)
+    _mainf, impl = do_partition(f, kcfg.partition_factor, options)
     post_optimize(impl, options)
     return impl
 
 
-def _translate_partitions(impl, cfg: CudaCompilerConfig):
+def _translate_partitions(impl, kcfg: KunCompilerConfig,
+                            ccfg: CudaCompilerConfig):
     """Emit one kunir.func per partitioned Function into a single
     KunMLIR module (single `gpu.module` with N siblings).  Cross-
     partition buffers stitch up automatically because each impl's
@@ -205,12 +229,12 @@ def _translate_partitions(impl, cfg: CudaCompilerConfig):
     Returns (ModuleOp, list[dict]) — the second element is the list
     of external-kernel descriptors to forward to KunMLIR.compile.
     """
-    target = TargetSpec(occupancy=cfg.occupancy,
-                          warps_per_cta=cfg.warps_per_cta,
-                          smem_size=cfg.smem_size,
-                          vector_size=cfg.vector_size)
+    target = TargetSpec(occupancy=ccfg.occupancy,
+                          warps_per_cta=ccfg.warps_per_cta,
+                          smem_size=ccfg.smem_size,
+                          vector_size=_resolve_vector_size(kcfg))
     ir = KunMLIR.IRBuilder()
-    dtype = _to_dtype_token(cfg.dtype)
+    dtype = _to_dtype_token(kcfg.dtype)
     externals = []
     for sub in impl:
         # Per-partition warmup: max windowed-chain depth from any input
@@ -235,8 +259,9 @@ def _translate_partitions(impl, cfg: CudaCompilerConfig):
     return ir.finish(), externals
 
 
-def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
-    """Compile a KunQuant Function to a GPU `KunMLIR.Executable`.
+def compile_func(f: Function, kcfg: KunCompilerConfig,
+                   ccfg: CudaCompilerConfig) -> KunMLIR.Executable:
+    """Compile a single KunQuant Function to a GPU `KunMLIR.Executable`.
 
     Pipeline mirrors `KunQuant.jit.cfake.compileit` on the CPU path:
 
@@ -249,23 +274,20 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
       6. Hand off to KunMLIR.compile, which generates the cubin and
          resolves cross-kernel data flow via I/O names.
     """
-    if cfg.dtype not in ("float", "double"):
-        raise ValueError(
-            f"CudaCompilerConfig.dtype must be 'float' or 'double', got "
-            f"{cfg.dtype!r}")
+    _validate_kun_cfg(kcfg)
 
-    toolkit_path = find_cuda_toolkit(cfg.toolkit_path)
+    toolkit_path = find_cuda_toolkit(ccfg.toolkit_path)
 
     graph_inputs, graph_outputs = _graph_io_names(f)
-    impl = _run_full_pipeline(f, cfg)
-    mod, externals = _translate_partitions(impl, cfg)
+    impl = _run_full_pipeline(f, kcfg)
+    mod, externals = _translate_partitions(impl, kcfg, ccfg)
 
     return KunMLIR.compile(
         mod,
         graph_inputs=graph_inputs,
         graph_outputs=graph_outputs,
-        gpu_arch=cfg.gpu_arch,
-        opt_level=cfg.opt_level,
+        gpu_arch=ccfg.gpu_arch,
+        opt_level=ccfg.opt_level,
         toolkit_path=toolkit_path,
         external_kernels=externals,
         # Forwarded for the no-JIT-kernel case: when every partition
@@ -273,17 +295,71 @@ def compileit(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.Executable:
         # MLIR module is empty and `data.warpsPerCta` would otherwise
         # default to 1 — but the cs_rank launch uses it to size
         # blockDim, so feed the config value through.
-        warps_per_cta=cfg.warps_per_cta,
+        warps_per_cta=ccfg.warps_per_cta,
     )
 
 
-def to_mlir(f: Function, cfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
-    """Run the same passes + translator as `compileit`, but return the
-    KunMLIR module before PTX/CUBIN.  External (cs_rank) partitions
+class Library:
+    """Bag of named `KunMLIR.Executable`s, mirroring the CPU `kr.Library`
+    shape so callers can compile multiple Functions in one go and look
+    them up by name.  Returned by the multi-Function `compileit` below.
+    """
+    def __init__(self, libname: str = "") -> None:
+        self.libname = libname
+        self._modules: dict = {}
+
+    def getModule(self, name: str) -> KunMLIR.Executable:
+        if name not in self._modules:
+            raise RuntimeError(
+                f"Library.getModule: no module named '{name}' "
+                f"(have: {sorted(self._modules)})")
+        return self._modules[name]
+
+    @property
+    def names(self):
+        """All compiled module names in registration order."""
+        return list(self._modules.keys())
+
+    def _add(self, name: str, exe: KunMLIR.Executable) -> None:
+        if name in self._modules:
+            raise RuntimeError(
+                f"Library: duplicate module name '{name}'")
+        self._modules[name] = exe
+
+
+def compileit(
+    funclist: List[Tuple[str, Function, KunCompilerConfig]],
+    libname: str,
+    compiler_config: CudaCompilerConfig,
+) -> Library:
+    """Compile a list of `(name, Function, KunCompilerConfig)` tuples
+    into a `Library`, mirroring the shape of
+    `KunQuant.jit.cfake.compileit(func, libname, compiler_config)`.
+
+    Each entry's third element is the per-Function `KunCompilerConfig`
+    (dtype / blocking_len / partition_factor / layout / pass options);
+    `compiler_config` is the GPU-wide `CudaCompilerConfig` applied to
+    every entry.  cfake's other arguments (`tempdir`, `keep_files`,
+    `load`) don't apply to the GPU path and are intentionally absent.
+
+    Returns a `Library` keyed by the tuple's `name`; look up individual
+    kernels via `lib.getModule(name)`.
+    """
+    lib = Library(libname=libname)
+    for name, f, kcfg in funclist:
+        lib._add(name, compile_func(f, kcfg, compiler_config))
+    return lib
+
+
+def to_mlir(f: Function, kcfg: KunCompilerConfig,
+              ccfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
+    """Run the same passes + translator as `compile_func`, but return
+    the KunMLIR module before PTX/CUBIN.  External (cs_rank) partitions
     are absent from the returned module — they never become kunir
     ops.  Useful for debugging the IR.  Mutates `f` in place (same
-    as `compileit`)."""
+    as `compile_func`)."""
+    _validate_kun_cfg(kcfg)
     _graph_io_names(f)              # raises if no Input / Output ops
-    impl = _run_full_pipeline(f, cfg)
-    mod, _externals = _translate_partitions(impl, cfg)
+    impl = _run_full_pipeline(f, kcfg)
+    mod, _externals = _translate_partitions(impl, kcfg, ccfg)
     return mod
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index f9a3b6f..19941b2 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -87,28 +87,23 @@ struct WTDesc {
 };
 using WTDescMap = llvm::DenseMap<Value, WTDesc>;
 
-// Per-function cache for chunk-related values shared across multiple
-// output-store rewrites.  Each gpu.func builds (at most) one mask
-// index_cast and one write_start SSA value; subsequent ts.put rewrites
-// against an output arg reuse them, so we don't lean on a downstream
-// CSE pass.
+// Per-function cache for the write-start SSA value shared across
+// multiple output-store rewrites.  Each gpu.func builds (at most) one
+// write_start; subsequent ts.put rewrites against an output arg reuse
+// it, so we don't lean on a downstream CSE pass.
 //
-// Both cached values are index-typed (not i32) because they're used as
-// scf.for / arith.cmpi operands against the loop induction variable
-// which is index-typed.  The runtime scalar args (mask, chunk_size,
-// warmup) are i32; the helpers below insert the i32 → index cast once
-// at function entry.
+// `writeStart` is index-typed (not i32) because it's compared against
+// the scf.for IV which is index-typed.  The runtime scalar args (mask,
+// chunk_size, warmup) come in as i32; the helper below inserts the
+// i32 → index cast once at function entry.
 //
-//   mask         : index, cast once from arg[2] (i32).  Used to shift
-//                  output indices: out[t - mask, sid].
 //   writeStart   : (block_id y == 0) ? mask : block_id y * chunk_size.
 //                  Output stores below this time-index are suppressed —
 //                  they fall in the warmup-overlap region.
 //
-// Both are emitted at the very top of the function entry block so they
-// dominate every store site, regardless of how deeply nested.
+// Emitted at the very top of the function entry block so it dominates
+// every store site, regardless of how deeply nested.
 struct ChunkContext {
-  Value mask;
   Value writeStart;
 };
 using ChunkCtxMap = llvm::DenseMap<Operation *, ChunkContext>;
@@ -348,32 +343,15 @@ struct TimeUbPattern : OpConversionPattern<TimeUbOp> {
 };
 
 //===----------------------------------------------------------------------===//
-// Chunk-context lazy helpers.  See ChunkContext above.
+// Chunk-context lazy helper.  See ChunkContext above.
 //
 // mask / chunk_size / warmup come in as i32 func args (positions 2 / 3 /
-// 4 after time_length / num_stocks).  We cast mask to index once per
-// function and cache the result, then build writeStart from it.  Both
-// emissions land at the very top of the function entry block so the
-// resulting SSA values dominate every store-site inside the kernel.
+// 4 after time_length / num_stocks).  We build writeStart from the i32
+// mask + chunk_size args, then cast to index once and cache.  Emitted
+// at the very top of the function entry block so the resulting SSA
+// value dominates every store-site inside the kernel.
 //===----------------------------------------------------------------------===//
 
-static Value getOrCreateMask(Operation *op, ChunkCtxMap &map,
-                              ConversionPatternRewriter &rewriter) {
-  auto fn = op->getParentOfType<gpu::GPUFuncOp>();
-  ChunkContext &ctx = map[fn.getOperation()];
-  if (ctx.mask) return ctx.mask;
-  // arg layout: (i32 time_length, i32 num_stocks, i32 mask, i32 chunk_size,
-  //              i32 warmup, ts...)
-  Value maskI32 = fn.getBody().front().getArgument(2);
-  Location loc = fn.getLoc();
-
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPointToStart(&fn.getBody().front());
-  ctx.mask = arith::IndexCastOp::create(rewriter, loc, rewriter.getIndexType(),
-                                                    maskI32);
-  return ctx.mask;
-}
-
 static Value getOrCreateWriteStart(Operation *op, ChunkCtxMap &map,
                                      ConversionPatternRewriter &rewriter) {
   auto fn = op->getParentOfType<gpu::GPUFuncOp>();
@@ -381,10 +359,7 @@ static Value getOrCreateWriteStart(Operation *op, ChunkCtxMap &map,
   if (ctx.writeStart) return ctx.writeStart;
 
   // Compute in i32 (cheap on GPU) then cast once to index, since the
-  // result is compared against the scf.for IV (index-typed).  We read
-  // the i32 mask and chunk_size args directly — not the cached index
-  // mask — so the mask helper and this helper don't depend on each
-  // other and either order is fine.
+  // result is compared against the scf.for IV (index-typed).
   Block &entry = fn.getBody().front();
   Value maskI32      = entry.getArgument(2);
   Value chunkSizeI32 = entry.getArgument(3);
@@ -708,19 +683,18 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
       LLVM::StoreOp::create(rewriter, loc, newPos, desc.posPtr);
       rewriter.eraseOp(op);
     } else {
-      // ── global ts: write at current time, gated by per-chunk write_start,
-      //    output index shifted by `mask` so the output array's time dim is
-      //    `time_length - mask`.
+      // ── global ts: write at current time, gated by per-chunk write_start.
+      //    Output time dim == time_length (== input time dim); the warmup
+      //    region [0, mask) is just left unwritten by the kernel.
       //
       //   if (t >= write_start)
-      //     out[t - mask, sid] = v
+      //     out[t, sid] = v
       //
       // The `t >= write_start` comparison is uniform across the CTA (all
       // threads share the same scf.for IV), so the lowered branch is a
       // single uniform predicate — no warp divergence at chunk boundaries.
       Value timeIdx    = getCurrentTimeIdx(op);
       Value writeStart = getOrCreateWriteStart(op, chunkCtx, rewriter);
-      Value mask       = getOrCreateMask(op, chunkCtx, rewriter);
 
       Value doWrite = arith::CmpIOp::create(
           rewriter, loc, arith::CmpIPredicate::sge, timeIdx, writeStart);
@@ -729,9 +703,8 @@ struct TsPutPattern : OpConversionPattern<TsPutOp> {
           /*withElseRegion=*/false);
 
       OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
-      Value tOut = arith::SubIOp::create(ib, loc, timeIdx, mask);
       Value gep = gmemGEPWithOffset(ib, loc, elemTy, ptrTy, tsPtr,
-                                     tOut, /*offsetIdx=*/Value(),
+                                     timeIdx, /*offsetIdx=*/Value(),
                                      getNumStocksI64(ib, op, loc),
                                      idxTy, i64Ty);
       LLVM::StoreOp::create(ib, loc, v, gep);
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index fc2be19..e301124 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -10,6 +10,11 @@
 string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
 string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}")
 
+# Re-run FindCUDAToolkit here so CUDA::cuda_driver is an imported target
+# in this directory's scope.  KunCuda already calls it but the IMPORTED
+# target it creates is scoped to that subdirectory.
+find_package(CUDAToolkit REQUIRED)
+
 # STABLE_ABI: single .abi3.so on CPython ≥ 3.12; falls back to per-version
 # on older Pythons.  Matches the runner binding (cpp/Python).
 nanobind_add_module(KunMLIR STABLE_ABI
@@ -28,6 +33,12 @@ set_target_properties(KunMLIR PROPERTIES
     LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/KunQuant/jit")
 
 target_link_libraries(KunMLIR PRIVATE
+  # cuda.h + libcuda stub — the binding's runGraph wrapper calls
+  # cuMemAlloc / cuMemFree directly to back caller-omitted output
+  # buffers.  KunCudaRuntime links cuda_driver PRIVATE so we have to
+  # repeat the dependency here.
+  CUDA::cuda_driver
+
   # Compiler side
   MLIRKunIrDialect
   MLIRKunGpuDialect
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 172fd50..7efe45e 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
 #include <nanobind/stl/string.h>
 #include <nanobind/stl/vector.h>
 #include <nanobind/stl/unique_ptr.h>
@@ -28,6 +29,8 @@
 
 #include "llvm/ADT/StringRef.h"
 
+#include <cuda.h>
+
 #include <memory>
 #include <sstream>
 #include <stdexcept>
@@ -181,101 +184,169 @@ static CudaArrayInfo readDLPack(nb::handle obj, const std::string &paramName,
   return CudaArrayInfo{ptr, t.shape[0], t.shape[1]};
 }
 
-/// Walk the user's {name → cuda_array} dict, validate that every named
-/// arg is present and that all arrays share the same (timeLength,
-/// numStocks).  Returns the common (T, S) plus a flat list of (name, ptr)
-/// pairs.  Anything binding-side (CAI parsing, dtype/ndim/shape checks)
-/// happens here so the runtime stays a thin launcher.
-struct CollectedArgs {
+/// Reject keys in `pyDict` that are not in `expectedNames`.  Used so the
+/// error message points at the offending name instead of complaining
+/// about a different missing key down the loop.
+///
+/// Fast path: an empty dict has no keys to reject.  Comparing sizes is
+/// NOT a safe shortcut: an outputs dict may be partial (omitted names
+/// are auto-allocated), so a same-sized dict with a typo'd key would
+/// pass unchecked and the caller's buffer would be silently ignored.
+/// The scan is O(keys * names) over a handful of graph I/O names.
+static void rejectUnexpectedKeys(const nb::dict &pyDict,
+                                   const std::vector<std::string> &expectedNames,
+                                   const char *kind) {
+  if (pyDict.size() == 0)
+    return;
+  for (auto kv : pyDict) {
+    std::string key = nb::cast<std::string>(kv.first);
+    bool known = false;
+    for (auto &n : expectedNames) if (n == key) { known = true; break; }
+    if (known) continue;
+    std::string expected;
+    for (size_t j = 0; j < expectedNames.size(); ++j) {
+      if (j) expected += ", ";
+      expected += expectedNames[j];
+    }
+    throw std::runtime_error(std::string("runGraph: unexpected ") + kind +
+                              " '" + key + "' (expected: " + expected + ")");
+  }
+}
+
+/// Walk `pyInputs` in `exe.graphInputs()` order, validate that every name
+/// is present and that all arrays share the input shape (timeLength,
+/// numStocks).  Caller specifies the required `timeLength` via
+/// `requiredTimeLength` (== start + length); a value of -1 means "infer
+/// from the first input" and the binding will treat that as the locked
+/// shape.
+struct CollectedInputs {
   int64_t timeLength;
   int64_t numStocks;
   std::vector<std::pair<std::string, uintptr_t>> args;
 };
 
-static CollectedArgs collectArgs(const kun_cuda::Executable &exe,
-                                   nb::dict pyArgs,
-                                   const nb::object &streamArg,
-                                   int64_t mask) {
-  // Graph inputs come first, then outputs — same as the buffer-table
-  // layout the runtime expects.
-  const size_t numInputs = exe.graphInputs().size();
-  std::vector<std::string> ordered;
-  ordered.reserve(numInputs + exe.graphOutputs().size());
-  for (auto &n : exe.graphInputs())  ordered.push_back(n);
-  for (auto &n : exe.graphOutputs()) ordered.push_back(n);
-  if (ordered.empty())
-    throw std::runtime_error("launch: kernel has no I/O arguments");
-
-  CollectedArgs out;
-  out.args.reserve(ordered.size());
-
-  // Reject extras up-front so the error message points at the offending
-  // name (the per-name loop below would otherwise just complain about a
-  // missing graph_input/output, which is misleading when the real issue
-  // is a typo'd key).
-  if (pyArgs.size() > ordered.size()) {
-    for (auto kv : pyArgs) {
-      std::string key = nb::cast<std::string>(kv.first);
-      bool known = false;
-      for (auto &n : ordered) if (n == key) { known = true; break; }
-      if (!known) {
-        std::string expected;
-        for (size_t j = 0; j < ordered.size(); ++j) {
-          if (j) expected += ", ";
-          expected += ordered[j];
-        }
-        throw std::runtime_error(
-            "launch: unexpected argument '" + key +
-            "' (kernel expects: " + expected + ")");
-      }
-    }
-  }
+static CollectedInputs collectInputs(const kun_cuda::Executable &exe,
+                                        const nb::dict &pyInputs,
+                                        const nb::object &streamArg,
+                                        int64_t requiredTimeLength) {
+  const auto &inputNames = exe.graphInputs();
+  rejectUnexpectedKeys(pyInputs, inputNames, "input");
 
-  // We need the input time length before validating any output (output
-  // time dim = input time dim − mask).  Walk inputs first to lock it
-  // in, then outputs.
-  out.timeLength = -1;
+  CollectedInputs out;
+  out.timeLength = requiredTimeLength;
   out.numStocks  = -1;
-  for (size_t i = 0; i < ordered.size(); ++i) {
-    const std::string &name = ordered[i];
-    bool isOutput = i >= numInputs;
+  out.args.reserve(inputNames.size());
 
+  for (const std::string &name : inputNames) {
     nb::object key = nb::str(name.c_str());
-    if (!pyArgs.contains(key)) {
-      std::string expected;
-      for (size_t j = 0; j < ordered.size(); ++j) {
-        if (j) expected += ", ";
-        expected += ordered[j];
-      }
-      throw std::runtime_error("launch: missing argument '" + name +
-                                "' (kernel expects: " + expected + ")");
-    }
-    CudaArrayInfo info = readDLPack(pyArgs[key], name, streamArg);
-    int64_t expectT = isOutput ? (out.timeLength - mask) : out.timeLength;
+    if (!pyInputs.contains(key))
+      throw std::runtime_error("runGraph: missing input '" + name + "'");
+    CudaArrayInfo info = readDLPack(pyInputs[key], name, streamArg);
 
     if (out.timeLength < 0) {
-      // First arg is always an input (numInputs ≥ 1 since the kernel
-      // graph requires at least one input).  Lock in the launch shape.
       out.timeLength = info.timeLength;
       out.numStocks  = info.numStocks;
-    } else if (info.timeLength != expectT ||
-                 info.numStocks  != out.numStocks) {
+    } else if (info.timeLength != out.timeLength ||
+                 (out.numStocks >= 0 && info.numStocks != out.numStocks)) {
       std::stringstream ss;
-      ss << "launch: shape mismatch on '" << name
-         << "' (" << (isOutput ? "output" : "input") << "): expected ("
-         << expectT << ", " << out.numStocks
-         << "), got (" << info.timeLength << ", "
-         << info.numStocks << ")";
-      if (isOutput && mask > 0)
-        ss << " — output time dim must equal input time dim ("
-           << out.timeLength << ") minus mask (" << mask << ")";
+      ss << "runGraph: input '" << name << "' has shape ("
+         << info.timeLength << ", " << info.numStocks
+         << "), expected (" << out.timeLength << ", "
+         << (out.numStocks < 0 ? info.numStocks : out.numStocks) << ")";
       throw std::runtime_error(ss.str());
     }
+    if (out.numStocks < 0)
+      out.numStocks = info.numStocks;
     out.args.emplace_back(name, info.ptr);
   }
   return out;
 }
 
+/// Allocate a CUDA device buffer of `T * S` floats and wrap it in an
+/// `nb::ndarray<>` (no framework annotation, dtype fixed to float) that
+/// owns the allocation via a capsule: when the Python object's refcount
+/// drops to zero, the capsule destructor frees it via `cuMemFree`.
+static nb::ndarray<> allocOwnedCudaArray2D(int64_t T, int64_t S) {
+  size_t total = static_cast<size_t>(T) * static_cast<size_t>(S);
+  CUdeviceptr p = 0;
+  CUresult r = cuMemAlloc(&p, total * sizeof(float));
+  if (r != CUDA_SUCCESS) {
+    const char *msg = nullptr;
+    cuGetErrorString(r, &msg);
+    throw std::runtime_error(std::string("runGraph: cuMemAlloc failed: ") +
+                              (msg ? msg : "(unknown)"));
+  }
+  nb::capsule owner(reinterpret_cast<void *>(p), [](void *q) noexcept {
+    cuMemFree(reinterpret_cast<CUdeviceptr>(q));
+  });
+  // device_id: query current context's device.  Falls back to 0 if no
+  // context is current (which should not happen here — cuMemAlloc just
+  // succeeded, so there is a current context).
+  CUdevice dev = 0;
+  cuCtxGetDevice(&dev);
+  size_t shape[2] = {static_cast<size_t>(T), static_cast<size_t>(S)};
+  return nb::ndarray<>(reinterpret_cast<void *>(p), /*ndim=*/2, shape, owner,
+                        /*strides=*/nullptr,
+                        /*dtype=*/nb::dtype<float>(),
+                        /*device_type=*/nb::device::cuda::value,
+                        /*device_id=*/static_cast<int>(dev));
+}
+
+/// Walk `exe.graphOutputs()` in order: for each name, either pick the
+/// caller-allocated buffer out of `pyOutputs` (validating shape) or
+/// allocate a fresh CUDA buffer.  Appends `(name, devicePtr)` to `args`
+/// and returns a `{name: ndarray}` dict of every output that Python
+/// will see.
+///
+/// When `pyOutputs.is_none()` we short-circuit:  no dict cast, no
+/// rejectUnexpectedKeys, no per-name `contains` probe — every output
+/// is auto-allocated.  This is the common case (caller doesn't pre-
+/// allocate outputs) and keeps it tight.
+static nb::dict collectOutputs(
+    const kun_cuda::Executable &exe,
+    nb::object pyOutputs, int64_t length, int64_t numStocks,
+    const nb::object &streamArg,
+    std::vector<std::pair<std::string, uintptr_t>> &args) {
+  const auto &outputNames = exe.graphOutputs();
+  args.reserve(args.size() + outputNames.size());
+
+  // Start with a null-PyObject* `nb::dict` — `nb::handle::inc_ref()` /
+  // `dec_ref()` are `Py_XINCREF`/`Py_XDECREF` so it's safe to hold, and
+  // we skip the `PyDict_New()` that bare `nb::dict()` would do.  Only
+  // populate + extras-check when the caller passed a real dict; then
+  // the handle's `operator bool()` doubles as the "user gave us
+  // outputs" flag.
+  nb::dict userOutputs = nb::steal<nb::dict>(nb::handle());
+  if (!pyOutputs.is_none()) {
+    userOutputs = nb::cast<nb::dict>(pyOutputs);
+    rejectUnexpectedKeys(userOutputs, outputNames, "output");
+  }
+
+  nb::dict ret;
+  for (const std::string &name : outputNames) {
+    nb::object key = nb::str(name.c_str());
+    uintptr_t base;
+    if (userOutputs && userOutputs.contains(key)) {
+      CudaArrayInfo info = readDLPack(userOutputs[key], name, streamArg);
+      if (info.timeLength != length || info.numStocks != numStocks) {
+        std::stringstream ss;
+        ss << "runGraph: output '" << name << "' has shape ("
+           << info.timeLength << ", " << info.numStocks
+           << "), expected (" << length << ", " << numStocks << ")";
+        throw std::runtime_error(ss.str());
+      }
+      base = info.ptr;
+      ret[key] = userOutputs[key];
+    } else {
+      nb::ndarray<> arr = allocOwnedCudaArray2D(length, numStocks);
+      base = reinterpret_cast<uintptr_t>(arr.data());
+      ret[key] = nb::cast(std::move(arr));
+    }
+    args.emplace_back(name, base);
+  }
+  return ret;
+}
+
 /// Parse one Python `external_kernels=[...]` entry into a KernelMeta.
 /// Expected dict shape:
 ///   {"name": str, "kind": str, "inputs": [str...], "outputs": [str...]}
@@ -463,18 +534,54 @@ NB_MODULE(KunMLIR, m) {
           "Raw stream handle as an int (0 ↔ CUDA default stream).")
       .def("runGraph",
           [](kun_cuda::Executor &e, kun_cuda::Executable &exe,
-              nb::dict pyArgs, int64_t mask,
-              int minChunkWarmupFactor, double smFillFactor) {
+              nb::dict pyInputs, int64_t cur_time, int64_t length,
+              nb::object pyOutputs, int64_t mask,
+              int minChunkWarmupFactor, double smFillFactor) -> nb::dict {
+            if (cur_time != 0)
+              throw std::runtime_error(
+                  "runGraph: cur_time != 0 not supported on GPU");
+            if (length < 0)
+              throw std::runtime_error("runGraph: length must be >= 0");
+
+            // `length == 0` (default) → auto-infer from the first
+            // input's row count; otherwise it's the engine's internal
+            // time dim (== input rows == output rows).
+            const bool inferLength = (length == 0);
+
             // Thread the executor's stream into __dlpack__(stream=…)
             // so producers (CuPy / PyTorch / JAX / TF) can insert the
             // cross-stream sync needed for data-readiness on our
             // launch stream.
             nb::object streamArg = dlpackStreamArg(e.stream());
-            auto c = collectArgs(exe, pyArgs, streamArg, mask);
-            e.runGraph(exe, c.timeLength, c.numStocks, c.args,
+            auto in = collectInputs(exe, pyInputs, streamArg,
+                                       inferLength ? -1 : length);
+            if (inferLength)
+              length = in.timeLength;
+            if (mask < 0 || mask >= length)
+              throw std::runtime_error(
+                  "runGraph: mask must be in [0, length)");
+
+            // Kernel writes `output[t]` directly for `t ∈ [mask, length)`
+            // (kungpu codegen no longer subtracts mask).  Rows `[0, mask)`
+            // are left as whatever the user / allocator put there.
+            const int64_t timeLength = length;
+
+            // Build the args vector (inputs first, then outputs in
+            // exe.graphOutputs order).  Auto-allocates any output the
+            // caller didn't pre-allocate; returns the dict that goes
+            // back to Python.
+            std::vector<std::pair<std::string, uintptr_t>> args =
+                std::move(in.args);
+            nb::dict ret = collectOutputs(exe, pyOutputs, length,
+                                            in.numStocks, streamArg, args);
+
+            e.runGraph(exe, timeLength, in.numStocks, args,
                         mask, minChunkWarmupFactor, smFillFactor);
+            return ret;
           },
-          nb::arg("exe"), nb::arg("args"),
+          nb::arg("exe"), nb::arg("inputs"),
+          nb::arg("cur_time") = 0, nb::arg("length") = 0,
+          nb::arg("outputs") = nb::none(),
           nb::arg("mask") = 0,
           nb::arg("min_chunk_warmup_factor") = 4,
           nb::arg("sm_fill_factor") = 1.5,
@@ -482,14 +589,28 @@ NB_MODULE(KunMLIR, m) {
           "**Asynchronous** — call `.synchronize()` (or otherwise wait\n"
           "on the stream) before reading results back to host.\n"
           "\n"
-          "`args` is a {name → cupy_array} dict; names must equal "
-          "`exe.input_names ++ exe.output_names`.  Arrays must be "
-          "float32, 2-D, shape `(time_length, num_stocks)` (TS layout), "
-          "and reside on the GPU.\n"
+          "`inputs` is a {name → cuda_array} dict whose keys must equal\n"
+          "`exe.input_names`.  Arrays must be float32, 2-D, shape\n"
+          "`(length, num_stocks)` (TS layout), and reside on the GPU.\n"
+          "\n"
+          "`cur_time` mirrors CPU `kr.runGraph`; GPU only accepts 0.\n"
+          "\n"
+          "`length` is input/output time dim.  Default 0 ⇒ auto-infer\n"
+          "from the first input's row count.\n"
+          "\n"
+          "`outputs` is an optional {name → cuda_array} dict of\n"
+          "caller-allocated output buffers (subset of\n"
+          "`exe.output_names`).  Each must have shape `(length,\n"
+          "num_stocks)` (same as input).  Names missing from `outputs`\n"
+          "are auto-allocated by the binding (float32 CUDA buffers,\n"
+          "capsule-owned).  Returns a dict of every output name → its\n"
+          "buffer (user-supplied or freshly allocated).\n"
+          "\n"
+          "`mask` is the warmup-skip on graph outputs: the kernel only\n"
+          "writes to output rows `[mask, length)`; rows `[0, mask)` are\n"
+          "left untouched (whatever the user / allocator put there).\n"
+          "Default 0.\n"
           "\n"
-          "`mask` is the prefix-skip on graph outputs: chunk 0 starts "
-          "writing at time index `mask`, so the output array's time "
-          "dim is `time_length - mask`.  Default 0 (no skip).\n"
           "`min_chunk_warmup_factor` is the lower bound on "
           "`chunk_size / warmup` — keeps warmup-overlap overhead below "
           "`1 / factor` of total compute.  Default 4 (≤ 25% overhead).\n"
@@ -498,7 +619,7 @@ NB_MODULE(KunMLIR, m) {
           "slack.  Default 1.5.\n"
           "\n"
           "Named to match the CPU executor API "
-          "(`KunRunner.runGraph(executor, mod, ...)`).")
+          "(`KunRunner.runGraph(executor, mod, inputs, cur_time, length)`).")
       .def("synchronize", &kun_cuda::Executor::synchronize,
           "Block until every kernel queued on this stream completes.");
 
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index fc384c0..a73d08b 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -40,8 +40,14 @@
     Accumulator, SetAccumulator, ReturnFirstValue,
 )
 from KunQuant.Stage import Function
+from KunQuant.Driver import KunCompilerConfig
 from KunQuant.jit import KunMLIR
-from KunQuant.jit.cuda import compileit, CudaCompilerConfig
+from KunQuant.jit.cuda import compile_func, compileit, CudaCompilerConfig
+
+
+# GPU backend only supports TS layout; share one KunCompilerConfig across
+# every test that doesn't need to customise other graph-rewrite knobs.
+_KCFG_TS = KunCompilerConfig(input_layout="TS", output_layout="TS")
 
 
 def build_func_elemwise() -> Function:
@@ -214,9 +220,9 @@ def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
     """Compile a Function, launch it, validate against numpy."""
     print(f"=== {label} ===")
     f = build_fn()
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, _KCFG_TS, ccfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
 
@@ -227,8 +233,9 @@ def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
     out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h),
-                              "b": cp.asarray(b_h), "out": out})
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"out": out})
     out_h = cp.asnumpy(out)
 
     expected = expected_fn(a_h, b_h)
@@ -260,9 +267,9 @@ def run_libdevice(target: str, T: int, S: int) -> int:
 def run_backref(target: str, T: int, S: int, N: int) -> int:
     print(f"=== backref: out = (a+b)[t - {N}] ===")
     f = build_func_backref(N)
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, _KCFG_TS, ccfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
 
@@ -273,8 +280,9 @@ def run_backref(target: str, T: int, S: int, N: int) -> int:
     out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h),
-                              "b": cp.asarray(b_h), "out": out})
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"out": out})
     out_h = cp.asnumpy(out)
 
     # Reference: out[t] = (a+b)[t-N] for t >= N; undefined for t < N.
@@ -292,9 +300,9 @@ def run_backref(target: str, T: int, S: int, N: int) -> int:
 def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
     print(f"=== fast_windowed_sum: ws = FastWindowedSum(a + b, N={N}) ===")
     f = build_func_fastwindowedsum(N)
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, _KCFG_TS, ccfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
 
@@ -305,8 +313,9 @@ def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
     out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h),
-                              "b": cp.asarray(b_h), "ws": out})
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"ws": out})
     out_h = cp.asnumpy(out)
 
     # Reference matches WindowedSum (same window, no NaN inputs).
@@ -330,10 +339,11 @@ def run_multipartition(target: str, T: int, S: int) -> int:
     print("=== multipartition: 3 outputs (add/mul/sub) split via "
            "partition_factor=1 ===")
     f = build_func_multipartition()
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4,
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS",
                               partition_factor=1)
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, kcfg, ccfg)
     print(f"  kernel_names           = {exe.kernel_names}")
     print(f"  num_kernels            = {exe.num_kernels}")
     print(f"  launch_order           = {exe.launch_order}")
@@ -357,10 +367,11 @@ def run_multipartition(target: str, T: int, S: int) -> int:
     sub_out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h), "b": cp.asarray(b_h),
-                              "add_out": add_out,
-                              "mul_out": mul_out,
-                              "sub_out": sub_out})
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"add_out": add_out,
+                                 "mul_out": mul_out,
+                                 "sub_out": sub_out})
 
     add_h = cp.asnumpy(add_out)
     mul_h = cp.asnumpy(mul_out)
@@ -388,8 +399,8 @@ def run_accumulator(target: str, T: int, S: int) -> int:
     print(f"=== accumulator: cnt[t] = cnt[t-1] + (a[t] > 0)  "
            f"(whole-time sentinel) ===")
     f = build_func_accumulator()
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
-    exe = compileit(f, cfg)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = compile_func(f, _KCFG_TS, ccfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
 
@@ -399,7 +410,9 @@ def run_accumulator(target: str, T: int, S: int) -> int:
     out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h), "cnt_out": out})
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h)},
+                       outputs={"cnt_out": out})
     out_h = cp.asnumpy(out)
 
     expected = np.cumsum((a_h > 0).astype(np.float32), axis=0)
@@ -416,9 +429,9 @@ def run_cmp_logical(target: str, T: int, S: int) -> int:
     """
     print("=== cmp/logical/select: 8 outputs exercising kunir bool ops ===")
     f = build_func_cmp_logical()
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, _KCFG_TS, ccfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
 
@@ -432,7 +445,9 @@ def run_cmp_logical(target: str, T: int, S: int) -> int:
     outs = {n: cp.zeros((T, S), dtype=cp.float32) for n in out_names}
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h), "b": cp.asarray(b_h), **outs})
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs=outs)
 
     def ref(cond: np.ndarray) -> np.ndarray:
         return np.where(cond, a_h, b_h)
@@ -471,8 +486,8 @@ def build_windowed(target: str, N: int):
     different T / S / mask (anything that doesn't change the graph
     topology or window size N)."""
     f = build_func_windowed(N)
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
-    exe = compileit(f, cfg)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = compile_func(f, _KCFG_TS, ccfg)
     print(f"  [build windowed N={N}] kernels={exe.kernel_names}  "
            f"num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
@@ -489,9 +504,10 @@ def test_windowed(exe, T: int, S: int, N: int, mask: int = 0) -> int:
                                                     outer ts c (c[t]).
        (c = a + b, k in [0..N-1])
 
-    With `mask > 0` the output time dim shrinks by `mask` and the kernel
-    runs with that mask — exercises the multi-chunk + mask path
-    (chunk-local `t - loop_lb >= window` guard) for both outputs.
+    With `mask > 0` the kernel only writes rows `[mask, T)` of every
+    output (rows `[0, mask)` are warmup and stay at the allocator's
+    initial value).  Exercises the multi-chunk + mask path (chunk-local
+    `t - loop_lb >= window` guard) for both outputs.
 
     `exe` must have been compiled with the matching `N`.
     """
@@ -504,40 +520,37 @@ def test_windowed(exe, T: int, S: int, N: int, mask: int = 0) -> int:
     rng = np.random.default_rng(1)
     a_h = rng.standard_normal((T, S), dtype=np.float32)
     b_h = rng.standard_normal((T, S), dtype=np.float32)
-    out_T      = T - mask
-    ws_out     = cp.zeros((out_T, S), dtype=cp.float32)
-    maxabs_out = cp.zeros((out_T, S), dtype=cp.float32)
+    # Output is the same shape as input; the binding leaves rows
+    # `[0, mask)` untouched.
+    ws_out     = cp.zeros((T, S), dtype=cp.float32)
+    maxabs_out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    inputs = {"a": cp.asarray(a_h), "b": cp.asarray(b_h),
-              "ws": ws_out, "ws_maxabs": maxabs_out}
-    if mask:
-        executor.runGraph(exe, inputs, mask=mask)
-    else:
-        executor.runGraph(exe, inputs)
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       mask=mask,
+                       outputs={"ws": ws_out, "ws_maxabs": maxabs_out})
     ws_h     = cp.asnumpy(ws_out)
     maxabs_h = cp.asnumpy(maxabs_out)
 
-    # Build full-T references, then slice from `mask` onward (no-op when
-    # mask == 0).  Output row i ↔ input time i+mask; reliable when
-    # i + mask >= N - 1.
+    # Reference is the full-T factor: output[t] = factor at time t.
     c = a_h + b_h
     cumsum = np.cumsum(c, axis=0, dtype=np.float64)
-    ws_full = np.empty((T, S), dtype=np.float32)
-    ws_full[:N - 1] = np.nan
-    ws_full[N - 1] = cumsum[N - 1]
+    ws_expected = np.empty((T, S), dtype=np.float32)
+    ws_expected[:N - 1] = np.nan
+    ws_expected[N - 1] = cumsum[N - 1]
     if T > N:
-        ws_full[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
-    ws_expected = ws_full[mask:]
+        ws_expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
 
-    maxabs_full = np.empty((T, S), dtype=np.float32)
-    maxabs_full[:N - 1] = np.nan
+    maxabs_expected = np.empty((T, S), dtype=np.float32)
+    maxabs_expected[:N - 1] = np.nan
     for t in range(N - 1, T):
         window = c[t - N + 1 : t + 1]                     # (N, S)
-        maxabs_full[t] = np.max(np.abs(window - c[t]), axis=0)
-    maxabs_expected = maxabs_full[mask:]
+        maxabs_expected[t] = np.max(np.abs(window - c[t]), axis=0)
 
-    valid_start = max(0, N - 1 - mask)
+    # Valid-from-row: the later of the kernel-written region (mask) and
+    # the windowed-op warmup (N - 1).
+    valid_start = max(mask, N - 1)
     rc = 0
     rc |= _compare_post_warmup(ws_h, ws_expected,
                                   valid_start=valid_start,
@@ -559,9 +572,9 @@ def run_backref_with_mask(target: str, T: int, S: int, N: int,
     print(f"=== backref + mask: out = (a+b)[t - {N}], mask={mask} ===")
     assert 0 < mask < T, "test requires 0 < mask < T"
     f = build_func_backref(N)
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, _KCFG_TS, ccfg)
     print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
            f"peak_intermediate_slots={exe.peak_intermediate_slots}")
 
@@ -569,31 +582,91 @@ def run_backref_with_mask(target: str, T: int, S: int, N: int,
     rng = np.random.default_rng(4)
     a_h = rng.standard_normal((T, S), dtype=np.float32)
     b_h = rng.standard_normal((T, S), dtype=np.float32)
-    # Output time dim shrinks by mask.
-    out = cp.zeros((T - mask, S), dtype=cp.float32)
+    # Output same shape as input; the binding leaves rows `[0, mask)`
+    # untouched.
+    out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": cp.asarray(a_h),
-                              "b": cp.asarray(b_h), "out": out},
-                       mask=mask)
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       mask=mask,
+                       outputs={"out": out})
     out_h = cp.asnumpy(out)
 
-    # Reference: out_full[t] = (a+b)[t-N] for t ≥ N; undefined for t < N.
-    # With mask, out_full[mask + i] lands at out_h[i].  Reliable when
-    # mask + i ≥ N, i.e., i ≥ max(0, N - mask).
+    # Reference: expected[t] = (a+b)[t-N] for t ≥ N; NaN for t < N.
     c = a_h + b_h
-    valid_start = max(0, N - mask)
-    # Build a full-(T-mask) expected so _compare_post_warmup can validate
-    # the post-warmup tail uniformly (matches the windowed test below).
-    expected = np.empty((T - mask, S), dtype=np.float32)
-    expected[:valid_start] = np.nan
-    if valid_start < T - mask:
-        in_time = np.arange(mask + valid_start, T)
-        expected[valid_start:] = c[in_time - N]
+    expected = np.empty((T, S), dtype=np.float32)
+    expected[:N] = np.nan
+    expected[N:] = c[:T - N]
+    # Valid-from-row: later of the kernel-written region (mask) and the
+    # BackRef warmup (N).
+    valid_start = max(mask, N)
     return _compare_post_warmup(out_h, expected,
                                   valid_start=valid_start, atol=1e-5)
 
 
+def run_library(target: str, T: int, S: int) -> int:
+    """Exercise multi-Function `compileit`, `Library.getModule`, and the
+    auto-allocated-output path of `Executor.runGraph` (`outputs=` omitted,
+    so the binding allocates a buffer for every graph output).  Returns 0
+    when every check passes and 1 otherwise.
+
+    Two independent functions are compiled into one `Library`:
+      * elemwise_kernel : out = (a+b)*a - b*b
+      * libdevice_kernel: out = log(abs(a)) * sign(b - a)
+    """
+    print("=== library: multi-Function compileit + Library.getModule + "
+           "auto-allocated outputs ===")
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    funclist = [
+        ("elemwise_kernel",  build_func_elemwise(),  _KCFG_TS),
+        ("libdevice_kernel", build_func_libdevice(), _KCFG_TS),
+    ]
+    lib = compileit(funclist, "test_library", ccfg)
+    print(f"  library modules = {lib.names}")
+    assert set(lib.names) == {"elemwise_kernel", "libdevice_kernel"}, lib.names
+
+    import cupy as cp
+    rng = np.random.default_rng(31)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    inputs = {"a": cp.asarray(a_h), "b": cp.asarray(b_h)}
+
+    executor = KunMLIR.Executor()
+    rc = 0
+    expected_by_name = {
+        "elemwise_kernel":  (a_h + b_h) * a_h - b_h * b_h,
+        "libdevice_kernel": np.log(np.abs(a_h)) * np.sign(b_h - a_h),
+    }
+    tol_by_name = {"elemwise_kernel": 1e-5, "libdevice_kernel": 1e-4}
+    for mod_name, expected in expected_by_name.items():
+        exe = lib.getModule(mod_name)
+        # No `outputs=`: the binding allocates "out" and returns it.  The
+        # launch is async, so sync before the DLPack re-wrap + host copy.
+        ret = executor.runGraph(exe, inputs=inputs)
+        executor.synchronize()
+        assert set(ret.keys()) == {"out"}, ret.keys()
+        out_h = cp.asnumpy(cp.from_dlpack(ret["out"]))
+        if not np.allclose(out_h, expected,
+                            atol=tol_by_name[mod_name], equal_nan=True):
+            diff = np.abs(out_h - expected)
+            print(f"  FAIL {mod_name} — max abs diff "
+                   f"{np.nanmax(diff):.3e}", file=sys.stderr)
+            rc = 1
+        else:
+            print(f"  ok {mod_name} — auto-allocated output matches reference")
+
+    # Library getModule on an unknown name must raise.
+    try:
+        lib.getModule("does_not_exist")
+        print("  FAIL — getModule('does_not_exist') should have raised",
+                file=sys.stderr)
+        rc = 1
+    except RuntimeError:
+        print("  ok — getModule on unknown name raised")
+    return rc
+
+
 def main() -> int:
     ap = argparse.ArgumentParser()
     ap.add_argument("--target", default="sm_120")
@@ -653,6 +726,8 @@ def main() -> int:
     rc |= run_accumulator(args.target, args.time_length, args.num_stocks)
     print()
     rc |= run_cmp_logical(args.target, args.time_length, args.num_stocks)
+    print()
+    rc |= run_library(args.target, args.time_length, args.num_stocks)
     return rc
 
 

From 9d08b0c289ce25c0d50f8831556bd1e90edf4735 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 17 May 2026 19:10:13 -0700
Subject: [PATCH 30/59] basic runtime test

---
 KunQuant/passes/CodegenMLIR.py |  18 +++
 tests/test_runtime.py          | 256 ++++++++++++++++++++++++++++-----
 2 files changed, 238 insertions(+), 36 deletions(-)

diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 39e049c..222713c 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -32,6 +32,7 @@
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Sign,
+    AddConst, SubConst, MulConst, DivConst,
     GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
     And, Or, Not, Select,
 )
@@ -55,6 +56,12 @@
     Equals:       "eq",
     And:          "and_", Or:         "or_",
 }
+# Const-on-one-side variants — emit ConstantOp + the matching binary op.
+# `swap=True` puts the scalar on the LEFT (e.g. SubConst(x, v, swap=True)
+# means `v - x`, where for plain SubConst it would mean `x - v`).
+_BINARY_CONST = {
+    AddConst: "add", SubConst: "sub", MulConst: "mul", DivConst: "div",
+}
 _UNARY = {
     Abs: "abs", Log: "log", Sign: "sign",
     Not: "not_",
@@ -130,6 +137,17 @@ def _emit_simple(op: OpBase,
         getattr(ir, _BINARY[cls])
         return getattr(ir, _BINARY[cls])(val_map[op.inputs[0]],
                                            val_map[op.inputs[1]])
+    if cls in _BINARY_CONST:
+        # Materialize the scalar attr as a kunir.constant, then emit
+        # the matching binary op.  `swap=True` puts the scalar on the
+        # left-hand side (matters for Sub/Div, no-op for Add/Mul).
+        scalar = float(op.attrs["value"])
+        const_val = ir.constant(scalar, ts_1)
+        x = val_map[op.inputs[0]]
+        ir_op = getattr(ir, _BINARY_CONST[cls])
+        if op.attrs.get("swap", False):
+            return ir_op(const_val, x)
+        return ir_op(x, const_val)
     if cls in _UNARY:
         return getattr(ir, _UNARY[cls])(val_map[op.inputs[0]])
     if isinstance(op, WindowedTempOutput):
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index de7c9d9..8021db1 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -1,4 +1,6 @@
 from KunQuant.Driver import KunCompilerConfig
+import argparse
+import dataclasses
 import numpy as np
 import pandas as pd
 import sys
@@ -14,6 +16,177 @@
 import sys
 from KunQuant.jit.env import cpu_arch
 
+
+# ── Backend dispatch (CPU vs GPU) ────────────────────────────────────
+#
+# `--gpu-arch sm_XX` flips us into GPU mode: every compile / executor /
+# runGraph call below the line goes through KunMLIR / KunQuant.jit.cuda
+# instead of cfake / KunRunner.  Tests that the GPU backend can't run
+# yet (STs layout, double dtype, streaming, custom cross-sectional C++,
+# aggregrate/corrwith helpers, Library.load) are skipped — see
+# `_GPU_SKIP_TESTS` below.
+_argp = argparse.ArgumentParser()
+_argp.add_argument("--gpu-arch", default="",
+                    help="GPU compute capability (e.g. sm_80).  Empty = CPU.")
+_args, _ = _argp.parse_known_args()  # leftover argv entries are discarded
+GPU_MODE = bool(_args.gpu_arch)  # True only for a non-empty --gpu-arch
+GPU_ARCH = _args.gpu_arch
+
+if GPU_MODE:
+    import cupy as cp
+    from KunQuant.jit import KunMLIR as _kr_mlir
+    from KunQuant.jit import cuda as _cuda_jit
+    # KunMLIR.compile + cuMemAlloc inherit the calling thread's primary
+    # CUDA context — touch a cupy allocator early so it exists.
+    cp.cuda.Device(0).use()
+    cp.zeros((1,), dtype=cp.float32)  # result unused; allocating is the point
+
+
+def compileit(funclist, libname):
+    """Compile `funclist` into library `libname` on the active backend.
+
+    CPU mode calls `cfake.compileit` with a `CppCompilerConfig` built from
+    `get_compiler_flags()`.  GPU mode calls `cuda.compileit` with a
+    `CudaCompilerConfig` for `GPU_ARCH`; any entry whose input or output
+    layout is `STs` first has both layouts replaced by `TS`, and the
+    `runGraph` wrapper in this file reshapes the blocked arrays to match,
+    so the test source stays unchanged."""
+    if not GPU_MODE:
+        cpu_cfg = cfake.CppCompilerConfig(machine=get_compiler_flags())
+        return cfake.compileit(funclist, libname, cpu_cfg)
+    rewritten = []
+    for fn_name, func, cfg in funclist:
+        if "STs" in (cfg.input_layout, cfg.output_layout):
+            cfg = dataclasses.replace(cfg, input_layout="TS",
+                                           output_layout="TS")
+        rewritten.append((fn_name, func, cfg))
+    cuda_cfg = _cuda_jit.CudaCompilerConfig(gpu_arch=GPU_ARCH)
+    return _cuda_jit.compileit(rewritten, libname, cuda_cfg)
+
+
+def createSingleThreadExecutor():
+    # One call site for both backends: KunMLIR executor on GPU, else CPU.
+    return (_kr_mlir.Executor() if GPU_MODE
+            else kr.createSingleThreadExecutor())
+
+
+def createMultiThreadExecutor(n):
+    # `n` is only forwarded on CPU; the GPU executor is built without it.
+    return (_kr_mlir.Executor() if GPU_MODE
+            else kr.createMultiThreadExecutor(n))
+
+
+def _sts_unblock(blocked: np.ndarray) -> np.ndarray:
+    """Convert an STs-blocked array `(S/blocking, T, blocking)` into a
+    contiguous TS array `(T, S)`.  The first two axes are swapped so
+    time leads, then the stock-block and lane axes are merged into a
+    single stock axis."""
+    n_blk, T, lanes = blocked.shape
+    by_time = np.swapaxes(blocked, 0, 1)       # (T, S/blocking, blocking)
+    return np.ascontiguousarray(by_time.reshape(T, n_blk * lanes))
+
+
+def _sts_reblock(flat: np.ndarray, blocking: int) -> np.ndarray:
+    """Convert a TS array `(T, S)` into a contiguous STs-blocked array
+    `(S/blocking, T, blocking)` — the inverse of `_sts_unblock`.
+    `blocking` is supplied by the caller (taken from the blocked input)
+    rather than derived from the dtype."""
+    T, S = flat.shape
+    by_time = flat.reshape(T, S // blocking, blocking)
+    return np.ascontiguousarray(np.swapaxes(by_time, 0, 1))
+
+
+def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
+    """Run `modu` on the active backend and return `{name: numpy array}`.
+
+    CPU mode forwards straight to `kr.runGraph` (an omitted `outputs`
+    becomes `{}`).  GPU mode uploads the numpy inputs with cupy, launches,
+    synchronizes the executor, and downloads every returned buffer; when
+    the caller supplied a numpy array for an output name the result is
+    copied into it.  3-D (STs-blocked) inputs are unblocked to TS before
+    launch, and when any were seen every output is re-blocked with the
+    same blocking factor."""
+    if not GPU_MODE:
+        cpu_outputs = {} if outputs is None else outputs
+        return kr.runGraph(executor, modu, inputs, cur_time, length,
+                            cpu_outputs)
+
+    # Drop the STs blocking from 3-D inputs and keep the last-seen lane
+    # width for re-blocking the outputs.  Inputs of any other rank (e.g.
+    # a 1-D __init value) are forwarded as they are.
+    blocking = None
+    ts_inputs = {}
+    for name, arr in inputs.items():
+        is_blocked = arr.ndim == 3
+        if is_blocked:
+            blocking = arr.shape[-1]
+        ts_inputs[name] = _sts_unblock(arr) if is_blocked else arr
+
+    on_gpu = {name: cp.asarray(arr) for name, arr in ts_inputs.items()}
+    launched = executor.runGraph(modu, on_gpu,
+                                  cur_time=cur_time, length=length)
+    executor.synchronize()
+
+    results = {}
+    for name, buf in launched.items():
+        dev = buf if isinstance(buf, cp.ndarray) else cp.from_dlpack(buf)
+        host = cp.asnumpy(dev)
+        if blocking is not None:
+            host = _sts_reblock(host, blocking)
+        if outputs is None or name not in outputs:
+            results[name] = host
+            continue
+        outputs[name][...] = host
+        results[name] = outputs[name]
+    return results
+
+
+# Tests not yet runnable through the GPU backend (STs / double / stream /
+# unsupported ops / aggregrate / corrwith / Library.load).  Anything else
+# is attempted in GPU mode.
+_GPU_SKIP_TESTS = {
+    "test_stream_lifetime_gh_issue_41",
+    "test_corrwith",
+    "test_aggregrate",
+    "test_runtime",
+    "test_avg_stddev",         # WindowedStddev needs Sqrt (not in CodegenMLIR)
+    "test_avg_stddev_TS",      # double dtype
+    "test_rank2",              # double dtype
+    "test_rank029",            # double dtype
+    "test_log",                # split: float32 may work, float64 unsupported
+    "test_pow",                # Pow decomposes to Exp/Log, Exp missing
+    "test_ema",                # ExpMovingAvg not in CodegenMLIR
+    "test_ema_init",           # same
+    "test_argmin_issue19",     # ReduceArgMin / ReduceRank not in CodegenMLIR
+    "test_aligned",            # CPU-only shape-error check
+    "test_skew_kurt",          # double + WindowedSkew/Kurt
+    "test_loop_index",         # WindowedMaxDrawdown / WindowLoopIndex
+    "test_covar",              # double + WindowedCovariance/Correlation
+    "test_quantile",           # double + SkipList
+    "test_large_rank",         # double + large-window SkipList
+    "test_stream_double",
+    "test_repro_crash_gh_issue_71",
+    "test_generic_cross_sectional",
+}
+
+# Names from `check_xxx()` factory tuples that GPU can actually compile.
+# Anything not in here is filtered out of the lib funclist before
+# `compileit` runs on the GPU side — keeps the build green even though
+# most check_xxx entries still produce unsupported kunir.
+_GPU_LIB_NAMES = {
+    "test_rank",
+}
+
+
+def _run(fn, *args, **kwargs):
+    """Invoke `fn(*args, **kwargs)`; in GPU mode a test listed in
+    `_GPU_SKIP_TESTS` is only reported as skipped.  Returns None."""
+    skip = GPU_MODE and fn.__name__ in _GPU_SKIP_TESTS
+    if not skip:
+        fn(*args, **kwargs)
+    else:
+        print(f"[skip on GPU] {fn.__name__}")
+
 def test_aggregrate(dtype):
     a = np.random.rand(240, 16).astype(dtype)
     b = np.random.rand(240, 16).astype(dtype)
@@ -128,13 +301,14 @@ def test_cfake():
         inp2 = Input("b")
         Output(inp1 * inp2 + 10, "out")
     f = Function(builder.ops)
-    lib = cfake.compileit([("test1", f, cfake.KunCompilerConfig(input_layout="TS", output_layout="TS"))],
-        "cfaketest", cfake.CppCompilerConfig(machine=get_compiler_flags()))
+    lib = compileit(
+        [("test1", f, KunCompilerConfig(input_layout="TS", output_layout="TS"))],
+        "cfaketest")
     mod = lib.getModule("test1")
     inp = np.random.rand(10, 24).astype("float32")
     inp2 = np.random.rand(10, 24).astype("float32")
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, mod, {"a": inp, "b": inp2}, 0, 10)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, mod, {"a": inp, "b": inp2}, 0, 10)
     np.testing.assert_allclose(inp * inp2 + 10, out["out"])
 
 def test_runtime(libpath):
@@ -398,8 +572,8 @@ def check(inp, timelen):
         # print(df)
         expected = df.rank(pct=True, axis = 1).to_numpy().transpose()
         blocked = ST_ST8t(inp)
-        executor = kr.createSingleThreadExecutor()
-        out = kr.runGraph(executor, modu, {"a": blocked}, 0, timelen)
+        executor = createSingleThreadExecutor()
+        out = runGraph(executor, modu, {"a": blocked}, 0, timelen)
         output = ST8t_ST(out["ou2"])
         # print(expected[:,0])
         # print(output[:,0])
@@ -736,10 +910,16 @@ def rolling_max_dd(x, window_size, min_periods=1):
         expected[:,i] = rolling_max_dd(inp[:,i], 5, min_periods=1)
     np.testing.assert_allclose(output[5:], expected[5:], equal_nan=True, atol=1e-7, rtol=1e-7)
 
-test_stream_lifetime_gh_issue_41()
-test_corrwith()
-test_aggregrate("float32")
-test_aggregrate("float64")
+_run(test_stream_lifetime_gh_issue_41)
+_run(test_corrwith)
+_run(test_aggregrate, "float32")
+_run(test_aggregrate, "float64")
+# The shared library bundles all the lib-based tests.  CPU compiles
+# the whole thing; GPU keeps only entries in `_GPU_LIB_NAMES` since
+# the rest is STs / double / stream / ops the kunir codegen doesn't
+# support yet.  Lib-consuming tests that aren't in `_GPU_LIB_NAMES`
+# are in `_GPU_SKIP_TESTS`, so `_run` short-circuits before they ever
+# try to `lib.getModule(...)`.
 funclist = [
     check_1(),
     check_TS(),
@@ -762,30 +942,34 @@ def rolling_max_dd(x, window_size, min_periods=1):
     check_large_rank(),
     repro_crash_gh_issue_71(),
     ]
-lib = cfake.compileit(funclist, "test", cfake.CppCompilerConfig(machine=get_compiler_flags()))
-
-test_cfake()
-test_avg_stddev_TS(lib)
-kun_test_dll = os.path.join(cfake.get_runtime_path(), "KunTest.dll" if cfake.is_windows() else "libKunTest.so")
-if os.path.exists(kun_test_dll):
-    test_runtime(kun_test_dll)
-test_avg_stddev(lib)
-test_rank(lib)
-test_log(lib, "float32", "")
-test_pow(lib)
-test_ema(lib)
-test_ema_init(lib)
-test_argmin_issue19(lib)
-test_generic_cross_sectional()
-test_stream_double()
-test_log(lib, "float64", "64")
-test_rank2(lib)
-test_rank029(lib)
-test_skew_kurt()
-test_aligned(lib)
-test_loop_index()
-test_covar(lib)
-test_quantile(lib)
-test_large_rank(lib)
-test_repro_crash_gh_issue_71(lib)
+if GPU_MODE:
+    funclist = [t for t in funclist if t[0] in _GPU_LIB_NAMES]
+lib = compileit(funclist, "test")
+
+_run(test_cfake)
+_run(test_avg_stddev_TS, lib)
+if not GPU_MODE:
+    kun_test_dll = os.path.join(cfake.get_runtime_path(),
+                                  "KunTest.dll" if cfake.is_windows() else "libKunTest.so")
+    if os.path.exists(kun_test_dll):
+        _run(test_runtime, kun_test_dll)
+_run(test_avg_stddev, lib)
+_run(test_rank, lib)
+_run(test_log, lib, "float32", "")
+_run(test_pow, lib)
+_run(test_ema, lib)
+_run(test_ema_init, lib)
+_run(test_argmin_issue19, lib)
+_run(test_generic_cross_sectional)
+_run(test_stream_double)
+_run(test_log, lib, "float64", "64")
+_run(test_rank2, lib)
+_run(test_rank029, lib)
+_run(test_skew_kurt)
+_run(test_aligned, lib)
+_run(test_loop_index)
+_run(test_covar, lib)
+_run(test_quantile, lib)
+_run(test_large_rank, lib)
+_run(test_repro_crash_gh_issue_71, lib)
 print("done")

From 1b4595bc054c6b8c43b8b3d441b5227b91de1eac Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 17 May 2026 19:23:58 -0700
Subject: [PATCH 31/59] log, pow tests. mean-std WIP

---
 KunQuant/passes/CodegenMLIR.py |  4 ++--
 mlir/include/KunIr/KunIrOps.td |  6 ++++++
 mlir/lib/KunIr/KunIrOps.cpp    |  8 ++++++++
 mlir/lib/Python/IRBuilder.cpp  |  4 ++++
 tests/test_runtime.py          | 34 +++++++++++++++++++++-------------
 5 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 222713c..6d55ada 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -31,7 +31,7 @@
     ReductionOp, SimpleCrossSectionalOp, ConstantOp,
 )
 from KunQuant.ops.ElewiseOp import (
-    Add, Sub, Mul, Div, Max, Min, Abs, Log, Sign,
+    Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
     AddConst, SubConst, MulConst, DivConst,
     GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
     And, Or, Not, Select,
@@ -63,7 +63,7 @@
     AddConst: "add", SubConst: "sub", MulConst: "mul", DivConst: "div",
 }
 _UNARY = {
-    Abs: "abs", Log: "log", Sign: "sign",
+    Abs: "abs", Log: "log", Exp: "exp", Sqrt: "sqrt", Sign: "sign",
     Not: "not_",
     # NOTE: `Rank` is intentionally absent.  Cross-sectional rank
     # partitions are routed to a pre-compiled CUmodule by
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 6f45248..543b59b 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -79,6 +79,12 @@ def KunIr_AbsOp  : KunIr_UnaryElemwiseOp<"abs">  {
 def KunIr_LogOp  : KunIr_UnaryElemwiseOp<"log">  {
   let summary = "Element-wise natural logarithm";
 }
+def KunIr_ExpOp  : KunIr_UnaryElemwiseOp<"exp">  {
+  let summary = "Element-wise natural exponential";
+}
+def KunIr_SqrtOp : KunIr_UnaryElemwiseOp<"sqrt"> {
+  let summary = "Element-wise square root";
+}
 def KunIr_SignOp : KunIr_UnaryElemwiseOp<"sign"> {
   let summary = "Element-wise sign (-1, 0, 1)";
 }
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index a9b520b..e0c5ccb 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -87,6 +87,8 @@ LogicalResult OrOp::verify()  { return verifyLogicalBinary(*this, getLhs(), getR
 
 LogicalResult AbsOp::verify()  { return success(); }
 LogicalResult LogOp::verify()  { return success(); }
+LogicalResult ExpOp::verify()  { return success(); }
+LogicalResult SqrtOp::verify() { return success(); }
 LogicalResult SignOp::verify() { return success(); }
 
 LogicalResult NotOp::verify() {
@@ -523,6 +525,12 @@ Value AbsOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
 Value LogOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
   return math::LogOp::create(b, loc, operand);
 }
+Value ExpOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return math::ExpOp::create(b, loc, operand);
+}
+Value SqrtOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return math::SqrtOp::create(b, loc, operand);
+}
 Value SignOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
   // sign(x) ≈ copysign(1.0, x)
   Value one = arith::ConstantOp::create(
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 8f34f74..81c52f2 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -150,6 +150,8 @@ class IRBuilder {
 
   Value absOp(Value x)  { return makeUn<kunir::AbsOp>(x); }
   Value logOp(Value x)  { return makeUn<kunir::LogOp>(x); }
+  Value expOp(Value x)  { return makeUn<kunir::ExpOp>(x); }
+  Value sqrtOp(Value x) { return makeUn<kunir::SqrtOp>(x); }
   Value signOp(Value x) { return makeUn<kunir::SignOp>(x); }
 
   // ── Comparison + logical (binary, return ts<i1, 1>) ─────────────
@@ -355,6 +357,8 @@ void registerIRBuilder(nb::module_ &m) {
       .def("min",    &IRBuilder::minOp,    nb::arg("lhs"), nb::arg("rhs"))
       .def("abs",    &IRBuilder::absOp,    nb::arg("x"))
       .def("log",    &IRBuilder::logOp,    nb::arg("x"))
+      .def("exp",    &IRBuilder::expOp,    nb::arg("x"))
+      .def("sqrt",   &IRBuilder::sqrtOp,   nb::arg("x"))
       .def("sign",   &IRBuilder::signOp,   nb::arg("x"))
 
       // Comparison + logical (binary). Cmp ops return ts<i1, 1>;
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index 8021db1..9f55be2 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -149,12 +149,11 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
     "test_corrwith",
     "test_aggregrate",
     "test_runtime",
-    "test_avg_stddev",         # WindowedStddev needs Sqrt (not in CodegenMLIR)
+    "test_avg_stddev",         # mean OK, but stddev decompose disagrees
+                                # with pandas — needs more debugging
     "test_avg_stddev_TS",      # double dtype
     "test_rank2",              # double dtype
     "test_rank029",            # double dtype
-    "test_log",                # split: float32 may work, float64 unsupported
-    "test_pow",                # Pow decomposes to Exp/Log, Exp missing
     "test_ema",                # ExpMovingAvg not in CodegenMLIR
     "test_ema_init",           # same
     "test_argmin_issue19",     # ReduceArgMin / ReduceRank not in CodegenMLIR
@@ -174,7 +173,9 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
 # `compileit` runs on the GPU side — keeps the build green even though
 # most check_xxx entries still produce unsupported kunir.
 _GPU_LIB_NAMES = {
-    "test_rank",
+    "test_rank",        # cross-sectional Rank (external cs_rank kernel)
+    "test_log",         # float32 only — float64 call gated below
+    "test_pow",         # Pow → Exp(Log(...) * expo) + Sqrt special-case
 }
 
 
@@ -182,9 +183,16 @@ def _run(fn, *args, **kwargs):
     """Call `fn(*args, **kwargs)` unless we're in GPU mode and `fn` is in
     `_GPU_SKIP_TESTS` — then just print and return.  Keeps the dispatch
     block at the bottom of the file unchanged in shape."""
-    if GPU_MODE and fn.__name__ in _GPU_SKIP_TESTS:
-        print(f"[skip on GPU] {fn.__name__}")
-        return
+    if GPU_MODE:
+        name = fn.__name__
+        if name in _GPU_SKIP_TESTS:
+            print(f"[skip on GPU] {name}")
+            return
+        # test_log(lib, dtype, name): GPU only has f32 kunir today;
+        # the f64 invocation has to skip.
+        if name == "test_log" and len(args) >= 2 and args[1] == "float64":
+            print(f"[skip on GPU] {name} {args[1]}")
+            return
     fn(*args, **kwargs)
 
 def test_aggregrate(dtype):
@@ -366,8 +374,8 @@ def test_avg_stddev(lib):
     expected_mean = df.rolling(10).mean().to_numpy().transpose()
     expected_stddev = df.rolling(10).std().to_numpy().transpose()
     blocked = ST_ST8t(inp)
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": blocked}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": blocked}, 0, 20)
     outmean = ST8t_ST(out["ou1"])
     outstd = ST8t_ST(out["ou2"])
     np.testing.assert_allclose(outmean, expected_mean, rtol=1e-6, equal_nan=True)
@@ -682,8 +690,8 @@ def test_log(lib, dtype, name):
     inp[1,:] = np.nan
     # print(inp)
     blocked = ST_ST8t(inp, is_double=(dtype=="float64"))
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": blocked}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": blocked}, 0, 20)
     output = ST8t_ST(out["outlog"])
     # print(expected[:,0])
     # print(output[:,0])
@@ -720,8 +728,8 @@ def test_pow(lib):
         expo[i,:] = pow(10, i/8-1)
     expo[-1,:] = 0
     expo[1,:] = np.nan
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": ST_ST8t(base), "b": ST_ST8t(expo)}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": ST_ST8t(base), "b": ST_ST8t(expo)}, 0, 20)
     # print(out.keys())
     # print(expected[:,0])
     # print(output[:,0])

From 9f1b9a55bd6701480251769a61e5f188b69971fa Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 17 May 2026 19:34:55 -0700
Subject: [PATCH 32/59] fix time index

---
 mlir/lib/KunGpu/KunGpuToLLVM.cpp | 15 +++++++++++++--
 tests/test_runtime.py            |  3 +--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 19941b2..1fc8c3c 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -250,8 +250,19 @@ static Value getNumStocksI64(OpBuilder &b, Operation *op, Location loc) {
   return arith::ExtSIOp::create(b, loc, b.getI64Type(), ns32);
 }
 static Value getCurrentTimeIdx(Operation *op) {
-  auto fOp = op->getParentOfType<scf::ForOp>();
-  return fOp ? fOp.getInductionVar() : Value();
+  // The enclosing function may contain nested scf.for's — outermost is
+  // the per-thread time loop, inner ones come from for_each_back_window
+  // bodies.  Reads/writes against a global ts (function-arg or graph
+  // intermediate) must use the OUTER time loop's IV regardless of how
+  // deep they sit; `op->getParentOfType<scf::ForOp>()` would otherwise
+  // grab the FBW's window-step IV and produce gmem addresses indexed
+  // by `w ∈ [0, window)` instead of the actual time `t`.
+  scf::ForOp outermost;
+  for (Operation *p = op->getParentOp(); p; p = p->getParentOp()) {
+    if (auto f = dyn_cast<scf::ForOp>(p))
+      outermost = f;
+  }
+  return outermost ? outermost.getInductionVar() : Value();
 }
 
 // linear gmem address = base + (timeIdx - offsetIdx) * num_stocks + stock_id
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index 9f55be2..3a010ea 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -149,8 +149,6 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
     "test_corrwith",
     "test_aggregrate",
     "test_runtime",
-    "test_avg_stddev",         # mean OK, but stddev decompose disagrees
-                                # with pandas — needs more debugging
     "test_avg_stddev_TS",      # double dtype
     "test_rank2",              # double dtype
     "test_rank029",            # double dtype
@@ -173,6 +171,7 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
 # `compileit` runs on the GPU side — keeps the build green even though
 # most check_xxx entries still produce unsupported kunir.
 _GPU_LIB_NAMES = {
+    "avg_and_stddev",   # WindowedAvg + WindowedStddev (Sqrt + FBW over input)
     "test_rank",        # cross-sectional Rank (external cs_rank kernel)
     "test_log",         # float32 only — float64 call gated below
     "test_pow",         # Pow → Exp(Log(...) * expo) + Sqrt special-case

From 0e3d988d2bcff33571b2a9a3cdf52445bb93bf41 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 17 May 2026 20:15:37 -0700
Subject: [PATCH 33/59] enable f64

---
 mlir/include/KunCuda/Runtime.h  | 18 +++++++++
 mlir/lib/KunCuda/Runtime.cpp    |  3 +-
 mlir/lib/KunGpu/PtxBackend.cpp  | 40 +++++++++++++++++++
 mlir/lib/Python/MlirBinding.cpp | 49 +++++++++++++++---------
 tests/test_runtime.py           | 68 +++++++++++++++++++--------------
 5 files changed, 130 insertions(+), 48 deletions(-)

diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index c38ae78..8f55628 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -74,6 +74,18 @@ enum class KernelKind : int32_t {
   ExtCsRankF64 = 2,
 };
 
+/// Element type shared by every kernel in an executable.  Single-precision
+/// (f32) and double-precision (f64) are supported.  Determines the byte size
+/// used when allocating intermediate slots and validating user-supplied I/O.
+enum class Datatype : int32_t {
+  Float  = 0,   ///< f32 — 4 bytes/elem
+  Double = 1,   ///< f64 — 8 bytes/elem
+};
+
+inline size_t bytesPerElem(Datatype dt) noexcept {
+  return dt == Datatype::Double ? 8u : 4u;
+}
+
 /// Per-kernel metadata, in name form.  This is what the compiler can
 /// produce by walking a single lowered llvm.func — no graph topology
 /// reasoning required.
@@ -104,6 +116,11 @@ struct ExecutableData {
                                      ///<   from numStocks (see
                                      ///<   launchExtCsRankKernel).
   int64_t vectorSize  = 1;          ///< from kungpu.target_spec (graph-wide)
+  Datatype dtype      = Datatype::Float;  ///< element type of every kernel
+                                           ///<   I/O.  Graph-wide; verified
+                                           ///<   at compile time.  Used by
+                                           ///<   the runtime to size the
+                                           ///<   intermediate slot pool.
   std::vector<KernelMeta> kernels;  ///< unordered set; runtime topo-sorts
   std::vector<std::string> graphInputs;
   std::vector<std::string> graphOutputs;
@@ -149,6 +166,7 @@ class Executable {
   const std::vector<std::string> &graphOutputs() const noexcept { return data_.graphOutputs; }
   int64_t warpsPerCta() const noexcept { return data_.warpsPerCta; }
   int64_t vectorSize()  const noexcept { return data_.vectorSize; }
+  Datatype dtype()      const noexcept { return data_.dtype; }
   size_t  numKernels()  const noexcept { return data_.kernels.size(); }
 
   // ── Accessors (runtime-resolved plan) ─────────────────────────────
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index f59c19a..c63f6ae 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -808,7 +808,8 @@ void Executable::ensureSlotPool(int64_t timeLength, int64_t numStocks) {
     return;
   }
   size_t bytesPerSlot = static_cast<size_t>(timeLength) *
-                          static_cast<size_t>(numStocks) * sizeof(float);
+                          static_cast<size_t>(numStocks) *
+                          bytesPerElem(data_.dtype);
   slotBufs_.resize(plan_->peakIntermediateSlots, 0);
   for (int i = 0; i < plan_->peakIntermediateSlots; ++i) {
     CUdeviceptr p = 0;
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
index f8c4449..de01643 100644
--- a/mlir/lib/KunGpu/PtxBackend.cpp
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -4,6 +4,8 @@
 #include "KunGpu/KunGpuUtils.h"
 #include "KunGpu/Pipelines.h"
 #include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
 
 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -17,6 +19,8 @@
 #include "mlir/Pass/PassManager.h"
 
 #include "llvm/ADT/SmallVector.h"
+
+#include <optional>
 #include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
@@ -143,6 +147,41 @@ LogicalResult compileKunIrToPtx(ModuleOp module,
 LogicalResult compileKunIrToExecutable(ModuleOp module,
                                         const PtxCompileOptions &options,
                                         ::kun_cuda::ExecutableData &out) {
+  // Sample the kunir.func element type before the kunir → llvm lowering
+  // erases it.  Verify graph-wide uniformity at the same time — every
+  // kunir.func in the module must agree on dtype, otherwise the slot
+  // pool wouldn't have a single byte-size to use.
+  std::optional<::kun_cuda::Datatype> sampledDtype;
+  std::string dtypeOwner;
+  WalkResult dtypeWalk = module.walk([&](kunir::FuncOp f) -> WalkResult {
+    for (Type t : f.getFunctionTypeTyped().getInputs()) {
+      auto ts = dyn_cast<kunir::TsType>(t);
+      if (!ts) continue;
+      Type et = ts.getElementType();
+      ::kun_cuda::Datatype dt;
+      if (et.isF32())      dt = ::kun_cuda::Datatype::Float;
+      else if (et.isF64()) dt = ::kun_cuda::Datatype::Double;
+      else {
+        f.emitError("compileKunIrToExecutable: unsupported ts element "
+                    "type — only f32 and f64");
+        return WalkResult::interrupt();
+      }
+      if (!sampledDtype) {
+        sampledDtype = dt;
+        dtypeOwner   = f.getSymName().str();
+      } else if (*sampledDtype != dt) {
+        f.emitError("compileKunIrToExecutable: kunir.func '")
+            << f.getSymName() << "' has dtype "
+            << (dt == ::kun_cuda::Datatype::Double ? "f64" : "f32")
+            << " but earlier '" << dtypeOwner << "' had "
+            << (*sampledDtype == ::kun_cuda::Datatype::Double ? "f64" : "f32");
+        return WalkResult::interrupt();
+      }
+    }
+    return WalkResult::advance();
+  });
+  if (dtypeWalk.wasInterrupted()) return failure();
+
   // 1.  kunir → llvm dialect.  After this the gpu.module body is fully
   //     lowered and our discardable kungpu.* attrs sit on llvm.func ops.
   if (failed(lowerKunIrToLLVMDialect(module))) return failure();
@@ -215,6 +254,7 @@ LogicalResult compileKunIrToExecutable(ModuleOp module,
   out.cubin.assign(cubin.begin(), cubin.end());
   out.warpsPerCta = warpsPerCta;
   out.vectorSize  = vectorSize;
+  out.dtype       = sampledDtype.value_or(::kun_cuda::Datatype::Float);
   out.kernels     = std::move(kernels);
   return success();
 }
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 7efe45e..16e37bb 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -127,7 +127,8 @@ static void requireRowMajorContiguous2D(const std::string &paramName,
 /// the capsule fall out of scope at function exit — the underlying
 /// tensor stays alive because the user is still holding `obj`.
 static CudaArrayInfo readDLPack(nb::handle obj, const std::string &paramName,
-                                  const nb::object &streamArg) {
+                                  const nb::object &streamArg,
+                                  kun_cuda::Datatype expectedDtype) {
   if (!nb::hasattr(obj, "__dlpack__"))
     throw std::runtime_error(
         "'" + paramName + "' does not implement __dlpack__ — pass a CuPy "
@@ -159,20 +160,25 @@ static CudaArrayInfo readDLPack(nb::handle obj, const std::string &paramName,
         "'" + paramName + "' must be 2-D (got " +
         std::to_string(t.ndim) + "-D)");
 
-  // ── dtype: kDLFloat, 32-bit, 1 lane ─────────────────────────────────
-  if (t.dtype.code != kDLFloat || t.dtype.bits != 32 || t.dtype.lanes != 1)
+  // ── dtype: kDLFloat, matches executable's element type ──────────────
+  const uint8_t expectedBits =
+      expectedDtype == kun_cuda::Datatype::Double ? 64 : 32;
+  if (t.dtype.code != kDLFloat || t.dtype.bits != expectedBits ||
+      t.dtype.lanes != 1)
     throw std::runtime_error(
         "'" + paramName + "' DLPack dtype is (code=" +
         std::to_string(static_cast<int>(t.dtype.code)) +
         ", bits=" + std::to_string(static_cast<int>(t.dtype.bits)) +
         ", lanes=" + std::to_string(static_cast<int>(t.dtype.lanes)) +
-        ") — need float32 (kDLFloat, 32, 1)");
+        ") — kernel expects float" + std::to_string(expectedBits) +
+        " (kDLFloat, " + std::to_string(expectedBits) + ", 1)");
 
   // ── strides: NULL = row-major contiguous; else validate.  DLPack
   //    strides are in *elements*, not bytes — convert before checking.
+  const int64_t elemBytes = static_cast<int64_t>(kun_cuda::bytesPerElem(expectedDtype));
   if (t.strides) {
-    int64_t sb[2] = {t.strides[0] * 4, t.strides[1] * 4};
-    requireRowMajorContiguous2D(paramName, t.shape, sb, /*elemSize=*/4);
+    int64_t sb[2] = {t.strides[0] * elemBytes, t.strides[1] * elemBytes};
+    requireRowMajorContiguous2D(paramName, t.shape, sb, elemBytes);
   }
 
   // ── data pointer (apply byte_offset before handing to kernel) ───────
@@ -241,7 +247,7 @@ static CollectedInputs collectInputs(const kun_cuda::Executable &exe,
     nb::object key = nb::str(name.c_str());
     if (!pyInputs.contains(key))
       throw std::runtime_error("runGraph: missing input '" + name + "'");
-    CudaArrayInfo info = readDLPack(pyInputs[key], name, streamArg);
+    CudaArrayInfo info = readDLPack(pyInputs[key], name, streamArg, exe.dtype());
 
     if (out.timeLength < 0) {
       out.timeLength = info.timeLength;
@@ -262,14 +268,17 @@ static CollectedInputs collectInputs(const kun_cuda::Executable &exe,
   return out;
 }
 
-/// Allocate a CUDA device buffer of `total` floats and wrap it in an
-/// `nb::ndarray<>` (no framework annotation) owning the allocation via
-/// a capsule.  Lifetime is tied to the Python object: when the array's
-/// refcount drops to zero, the capsule destructor frees via `cuMemFree`.
-static nb::ndarray<> allocOwnedCudaArray2D(int64_t T, int64_t S) {
+/// Allocate a CUDA device buffer of `T*S` elements (`sizeof(elem) =
+/// bytesPerElem(dt)`) and wrap it in an `nb::ndarray<>` (no framework
+/// annotation) owning the allocation via a capsule.  Lifetime is tied
+/// to the Python object: when the array's refcount drops to zero, the
+/// capsule destructor frees via `cuMemFree`.
+static nb::ndarray<> allocOwnedCudaArray2D(int64_t T, int64_t S,
+                                              kun_cuda::Datatype dt) {
+  const size_t elemBytes = kun_cuda::bytesPerElem(dt);
   size_t total = static_cast<size_t>(T) * static_cast<size_t>(S);
   CUdeviceptr p = 0;
-  CUresult r = cuMemAlloc(&p, total * sizeof(float));
+  CUresult r = cuMemAlloc(&p, total * elemBytes);
   if (r != CUDA_SUCCESS) {
     const char *msg = nullptr;
     cuGetErrorString(r, &msg);
@@ -279,15 +288,15 @@ static nb::ndarray<> allocOwnedCudaArray2D(int64_t T, int64_t S) {
   nb::capsule owner(reinterpret_cast<void *>(p), [](void *q) noexcept {
     cuMemFree(reinterpret_cast<CUdeviceptr>(q));
   });
-  // device_id: query current context's device.  Falls back to 0 if no
-  // context is current (which should not happen here — cuMemAlloc just
-  // succeeded, so there is a current context).
   CUdevice dev = 0;
   cuCtxGetDevice(&dev);
   size_t shape[2] = {static_cast<size_t>(T), static_cast<size_t>(S)};
+  nb::dlpack::dtype npDtype =
+      dt == kun_cuda::Datatype::Double ? nb::dtype<double>()
+                                         : nb::dtype<float>();
   return nb::ndarray<>(reinterpret_cast<void *>(p), /*ndim=*/2, shape, owner,
                         /*strides=*/nullptr,
-                        /*dtype=*/nb::dtype<float>(),
+                        /*dtype=*/npDtype,
                         /*device_type=*/nb::device::cuda::value,
                         /*device_id=*/static_cast<int>(dev));
 }
@@ -327,7 +336,8 @@ static nb::dict collectOutputs(
     nb::object key = nb::str(name.c_str());
     uintptr_t base;
     if (userOutputs && userOutputs.contains(key)) {
-      CudaArrayInfo info = readDLPack(userOutputs[key], name, streamArg);
+      CudaArrayInfo info =
+          readDLPack(userOutputs[key], name, streamArg, exe.dtype());
       if (info.timeLength != length || info.numStocks != numStocks) {
         std::stringstream ss;
         ss << "runGraph: output '" << name << "' has shape ("
@@ -338,7 +348,8 @@ static nb::dict collectOutputs(
       base = info.ptr;
       ret[key] = userOutputs[key];
     } else {
-      nb::ndarray<> arr = allocOwnedCudaArray2D(length, numStocks);
+      nb::ndarray<> arr =
+          allocOwnedCudaArray2D(length, numStocks, exe.dtype());
       base = reinterpret_cast<uintptr_t>(arr.data());
       ret[key] = nb::cast(std::move(arr));
     }
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index 3a010ea..5ac5907 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -96,7 +96,8 @@ def _sts_reblock(flat: np.ndarray, blocking: int) -> np.ndarray:
         flat.reshape((T, S // blocking, blocking)).transpose((1, 0, 2)))
 
 
-def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
+def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
+              gpu_sm_fill_factor=None):
     """Backend-aware `kr.runGraph`.  CPU path is a pass-through; GPU
     path moves numpy inputs to cupy, runs, syncs, and copies results
     back into the caller-supplied numpy outputs (if any).  Returns the
@@ -105,7 +106,13 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
     STs-blocked (3-D) inputs are transparently unblocked to TS before
     launch; outputs are re-blocked to match.  The matching `compileit`
     wrapper has already rewritten the function's layout attr to `TS`,
-    so the kunir codegen never sees `STs`."""
+    so the kunir codegen never sees `STs`.
+
+    `gpu_sm_fill_factor` (GPU only) overrides the runtime's chunk-grid
+    heuristic — pass `0.0` to force a single time chunk, useful when
+    a test asserts bit-exactness against a single-pass reference (the
+    multi-chunk Kahan restart introduces ≤1 ulp drift).
+    """
     if not GPU_MODE:
         return kr.runGraph(executor, modu, inputs, cur_time, length,
                             outputs if outputs is not None else {})
@@ -123,8 +130,10 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
             ts_inputs[k] = v
 
     gpu_inputs = {k: cp.asarray(v) for k, v in ts_inputs.items()}
-    ret = executor.runGraph(modu, gpu_inputs,
-                              cur_time=cur_time, length=length)
+    rg_kwargs = {"cur_time": cur_time, "length": length}
+    if gpu_sm_fill_factor is not None:
+        rg_kwargs["sm_fill_factor"] = gpu_sm_fill_factor
+    ret = executor.runGraph(modu, gpu_inputs, **rg_kwargs)
     executor.synchronize()
 
     out_np = {}
@@ -149,18 +158,14 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
     "test_corrwith",
     "test_aggregrate",
     "test_runtime",
-    "test_avg_stddev_TS",      # double dtype
-    "test_rank2",              # double dtype
-    "test_rank029",            # double dtype
     "test_ema",                # ExpMovingAvg not in CodegenMLIR
     "test_ema_init",           # same
     "test_argmin_issue19",     # ReduceArgMin / ReduceRank not in CodegenMLIR
     "test_aligned",            # CPU-only shape-error check
-    "test_skew_kurt",          # double + WindowedSkew/Kurt
+    "test_skew_kurt",          # WindowedSkew/Kurt decompose not GPU-ready
     "test_loop_index",         # WindowedMaxDrawdown / WindowLoopIndex
-    "test_covar",              # double + WindowedCovariance/Correlation
-    "test_quantile",           # double + SkipList
-    "test_large_rank",         # double + large-window SkipList
+    "test_quantile",           # SkipList
+    "test_large_rank",         # SkipList (TsRank/etc. with large window)
     "test_stream_double",
     "test_repro_crash_gh_issue_71",
     "test_generic_cross_sectional",
@@ -171,10 +176,15 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None):
 # `compileit` runs on the GPU side — keeps the build green even though
 # most check_xxx entries still produce unsupported kunir.
 _GPU_LIB_NAMES = {
-    "avg_and_stddev",   # WindowedAvg + WindowedStddev (Sqrt + FBW over input)
-    "test_rank",        # cross-sectional Rank (external cs_rank kernel)
-    "test_log",         # float32 only — float64 call gated below
-    "test_pow",         # Pow → Exp(Log(...) * expo) + Sqrt special-case
+    "avg_and_stddev",       # WindowedAvg + WindowedStddev (Sqrt + FBW)
+    "avg_and_stddev_TS",    # same, double dtype, TS layout
+    "test_rank",            # cross-sectional Rank (external cs_rank_f32)
+    "test_rank2",           # Add + Rank, double dtype (cs_rank_f64)
+    "test_rank_alpha029",   # Rank chain + WindowedSum, double
+    "test_log",             # float32
+    "test_log64",           # float64
+    "test_pow",             # Pow → Exp(Log(x) * expo) + Sqrt special-case
+    "test_covar",           # WindowedCovariance + WindowedCorrelation, double
 }
 
 
@@ -187,11 +197,6 @@ def _run(fn, *args, **kwargs):
         if name in _GPU_SKIP_TESTS:
             print(f"[skip on GPU] {name}")
             return
-        # test_log(lib, dtype, name): GPU only has f32 kunir today;
-        # the f64 invocation has to skip.
-        if name == "test_log" and len(args) >= 2 and args[1] == "float64":
-            print(f"[skip on GPU] {name} {args[1]}")
-            return
     fn(*args, **kwargs)
 
 def test_aggregrate(dtype):
@@ -393,8 +398,8 @@ def test_avg_stddev_TS(lib):
     expected_mean = df.rolling(10).mean().to_numpy().transpose()
     expected_stddev = df.rolling(10).std().to_numpy().transpose()
     blocked = np.ascontiguousarray(inp.transpose())
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": blocked}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": blocked}, 0, 20)
     outmean = out["ou1"].transpose()
     outstd = out["ou2"].transpose()
     np.testing.assert_allclose(outmean, expected_mean, rtol=1e-6, equal_nan=True)
@@ -420,8 +425,8 @@ def test_covar(lib):
     df2 = pd.DataFrame(inp2)
     expected_covar = df.rolling(10).cov(df2).to_numpy()
     expected_corr = df.rolling(10).corr(df2).to_numpy()
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp, "b": inp2}, 0, 200)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp, "b": inp2}, 0, 200)
     outcovar = out["ou1"]
     outcorr = out["ou2"]
     np.testing.assert_allclose(outcovar, expected_covar, rtol=1e-6, equal_nan=True)
@@ -616,8 +621,8 @@ def compute(stocks):
         df = df + df
         expected = (df.rank(pct=True, axis = 1) + df).to_numpy().transpose()
         blocked = np.ascontiguousarray(inp.transpose())
-        executor = kr.createSingleThreadExecutor()
-        out = kr.runGraph(executor, modu, {"a": blocked}, 0, 200)
+        executor = createSingleThreadExecutor()
+        out = runGraph(executor, modu, {"a": blocked}, 0, 200)
         output = out["out"].transpose()
         # print(expected[:,0])
         # print(output[:,0])
@@ -659,8 +664,15 @@ def compute(stocks):
         inner = inner.to_numpy().transpose()
         expected = expected.to_numpy().transpose()
         blocked = np.ascontiguousarray(inp.transpose())
-        executor = kr.createSingleThreadExecutor()
-        out = kr.runGraph(executor, modu, {"a": blocked}, 0, 300)
+        executor = createSingleThreadExecutor()
+        # The outer Rank(WindowedSum(...)) is sensitive to near-ties:
+        # GPU multi-chunk Kahan drifts ≤1 ulp in the inner sum, which
+        # can flip cross-sectional tie-breaking and shift rank buckets
+        # by 0.025-0.05.  Force single-chunk so the Kahan state runs
+        # uninterrupted — perf is irrelevant for a correctness test
+        # and this restores bit-exactness with pandas.
+        out = runGraph(executor, modu, {"a": blocked}, 0, 300,
+                        gpu_sm_fill_factor=0.0 if GPU_MODE else None)
         output1 = out["ou1"].transpose()
         output2 = out["ou2"].transpose()
         np.set_printoptions(precision=60)

From fa9267ab8c4f40d2dc9a9f31ed4c55aa58a6c03a Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 17 May 2026 23:15:53 -0700
Subject: [PATCH 34/59] argmin/max/tsrank

---
 KunQuant/jit/cuda.py             |  15 ++-
 KunQuant/ops/CompOp.py           |  16 ++-
 KunQuant/passes/CodegenMLIR.py   |  16 +++
 doc/Customize.md                 |   1 +
 mlir/include/KunIr/KunIrOps.td   |  37 ++++++
 mlir/lib/KunGpu/KunGpuToLLVM.cpp |  38 +++++-
 mlir/lib/KunIr/KunIrOps.cpp      |   3 +
 mlir/lib/KunIr/KunIrToKunGpu.cpp | 219 ++++++++++++++++++++++++++++---
 mlir/lib/Python/IRBuilder.cpp    |  10 ++
 tests/test_runtime.py            |  18 +--
 10 files changed, 333 insertions(+), 40 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 53ac70f..87c09f9 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -145,15 +145,20 @@ def _resolve_vector_size(kcfg: KunCompilerConfig) -> int:
 def _gpu_pass_options(kcfg: KunCompilerConfig) -> dict:
     """`Driver.optimize`'s `options` dict for the GPU path.
 
-    `blocking_len` is needed by some decompose paths (skip-list cutoff
-    in WindowedMin/Max).  Everything else — including `no_fast_stat` —
-    is taken verbatim from `kcfg.options`; we do not force `no_fast_stat`
-    here.  If the user wants the GPU-safe default, they should set
-    `no_fast_stat=True` in `kcfg.options` themselves.
+    `blocking_len` is needed by some decompose paths (it's also the
+    skip-list / naive cost-model knob).  `kcfg.options` flows through
+    verbatim — including `no_fast_stat`, `opt_reduce`, `fast_log`,
+    all of which the GPU lowering now supports.
+
+    `no_skip_list=True` is forced unconditionally and overrides any
+    user-provided value: the kunir codegen has no lowering for
+    `SkipList*` ops, so the naive `ForeachBackWindow + Reduce*` path
+    is the only one that lowers on GPU.
     """
     opts: dict = {"blocking_len": _resolve_vector_size(kcfg)}
     if kcfg.options:
         opts.update(kcfg.options)
+    opts["no_skip_list"] = True
     return opts
 
 
diff --git a/KunQuant/ops/CompOp.py b/KunQuant/ops/CompOp.py
index 934bc30..fd75bd2 100644
--- a/KunQuant/ops/CompOp.py
+++ b/KunQuant/ops/CompOp.py
@@ -11,7 +11,12 @@
 def _is_fast_stat(opt: dict, attrs: dict) -> bool:
     return not opt.get("no_fast_stat", True) and not attrs.get("no_fast_stat", False)
 
-def _decide_use_skip_list(window: int, blocking_len: int) -> bool:
+def _decide_use_skip_list(options: dict, window: int, blocking_len: int) -> bool:
+    # GPU lowering doesn't implement SkipList ops; the caller can force
+    # the naive ForeachBackWindow path with `options["no_skip_list"]`
+    # regardless of window/blocking_len cost.
+    if options.get("no_skip_list", False):
+        return False
     naive_cost = window
     skip_list_cost = math.log2(window) * blocking_len * 5
     return skip_list_cost < naive_cost
@@ -65,7 +70,7 @@ def on_skip_list(self, skplist: SkipListState, cur: OpBase) -> OpBase:
     def decompose(self, options: dict) -> List[OpBase]:
         window = self.attrs["window"]
         blocking_len = options["blocking_len"]
-        if _decide_use_skip_list(window, blocking_len):
+        if _decide_use_skip_list(options, window, blocking_len):
             b = Builder(self.get_parent())
             with b:
                 newv = self.inputs[0]
@@ -469,7 +474,7 @@ class TsArgMax(WindowedReduce):
     def decompose(self, options: dict) -> List[OpBase]:
         window = self.attrs["window"]
         blocking_len = options["blocking_len"]
-        if _decide_use_skip_list(window, blocking_len):
+        if _decide_use_skip_list(options, window, blocking_len):
             b = Builder(self.get_parent())
             with b:
                 TsArgMin(0-self.inputs[0], window)
@@ -690,6 +695,11 @@ def required_input_window(self) -> int:
         return self.attrs["window"] + 1
     
     def decompose(self, options: dict) -> List[OpBase]:
+        if options.get("no_skip_list", False):
+            raise RuntimeError(
+                "WindowedQuantile has no non-skip-list decompose path; "
+                "it cannot run under options[\"no_skip_list\"]=True "
+                "(e.g. on the GPU backend)")
         b = Builder(self.get_parent())
         window = self.attrs["window"]
         v = self.inputs[0]
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 6d55ada..44c93aa 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -38,6 +38,7 @@
 )
 from KunQuant.ops.ReduceOp import (
     ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
+    ReduceArgMax, ReduceArgMin, ReduceRank,
 )
 from KunQuant.ops.MiscOp import (
     BackRef, FastWindowedSum,
@@ -72,6 +73,13 @@
 _REDUCE = {
     ReduceAdd: "reduce_add", ReduceMul: "reduce_mul",
     ReduceMax: "reduce_max", ReduceMin: "reduce_min",
+    ReduceArgMin: "reduce_argmin", ReduceArgMax: "reduce_argmax",
+}
+# Reduces that need a 2nd input (the outer-scope "current" value).
+# `ReduceRank(iter_val, current)` is the only one today; kept as a separate
+# table so `_emit_reduction` can dispatch without conflating arity.
+_REDUCE_WITH_CURRENT = {
+    ReduceRank: "reduce_rank",
 }
 
 
@@ -196,6 +204,14 @@ def _emit_reduction(op: ReductionOp,
                      ir: KunMLIR.IRBuilder,
                      val_map: Dict[OpBase, KunMLIR.Value]) -> KunMLIR.Value:
     cls = type(op)
+    if cls in _REDUCE_WITH_CURRENT:
+        # ReduceRank(iter_val, current): 2 inputs.
+        if len(op.inputs) != 2:
+            raise NotImplementedError(
+                f"CodegenMLIR: {cls.__name__} expects 2 inputs (iter, "
+                f"current); got {len(op.inputs)} (op = {op})")
+        return getattr(ir, _REDUCE_WITH_CURRENT[cls])(
+            val_map[op.inputs[0]], val_map[op.inputs[1]])
     if cls not in _REDUCE:
         raise NotImplementedError(
             f"CodegenMLIR: reduction {cls.__name__} not supported yet "
diff --git a/doc/Customize.md b/doc/Customize.md
index 21c946d..5b4a8f5 100644
--- a/doc/Customize.md
+++ b/doc/Customize.md
@@ -137,6 +137,7 @@ The `CppCompilerConfig` controls how KunQuant calls the C++ compiler. To choose
 | opt_reduce | optimize WindowedSum by rolling sum algorithm |  bool  |  If in stream mode, False. Otherwise, True  |
 | fast_log | Use KunQuant's implementation of math log function instead of `std::log` |  bool  |  True  |
 | no_fast_stat | Disable fast rolling algorithm for statistics functions like stddev/corr/etc. Setting this flag to True may help to get better precision with the cost of performance. KunQuant will warn the precision issue if `options['no_fast_stat']==False`. To disable the warning and set no_fast_stat to False, set `options['no_fast_stat']=='no_warn'` |  bool or Literal\["no_warn"\]  |  If dtype is float or in stream mode, True. Otherwise, False |
+| no_skip_list | Disable the skip-list decompose path for large-window WindowedMin/WindowedMax/TsArgMin/TsArgMax/TsRank, falling back to the naive `ForeachBackWindow + Reduce*` lowering regardless of window/blocking_len cost.  `WindowedQuantile` has no non-skip-list path and will raise when this is set.  Set automatically by the GPU backend (`KunQuant.jit.cuda`) because the kunir codegen does not lower `SkipList*` ops. |  bool  |  False (CPU); forced True on GPU |
 
 ## Specifing Memory layouts and data types and enabling AVX512
 
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 543b59b..62d09c7 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -350,6 +350,43 @@ def KunIr_ReduceMinOp : KunIr_ReduceOp<"reduce_min"> {
   let summary = "Minimum reduction over the back window";
 }
 
+//===----------------------------------------------------------------------===//
+// Multi-accumulator reductions — NaN-propagating, used by TsArgMin/Max/TsRank.
+// These don't fit the single-state `ReduceArithInterface` (argmin/max track
+// both the running best value *and* its window-relative position; rank tracks
+// less-count + equal-count).  The FBW lowering in `KunIrToKunGpu.cpp`
+// special-cases them to emit the matching N-iter-arg scf.for body.
+//===----------------------------------------------------------------------===//
+
+def KunIr_ReduceArgMinOp : KunIr_Op<"reduce_argmin",
+    [Pure, SameOperandsAndResultType]> {
+  let summary = "ArgMin reduction — window-relative index (window-1-w) of "
+                  "the smallest element; NaN propagates.";
+  let arguments = (ins KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$value `:` type($value) attr-dict";
+}
+def KunIr_ReduceArgMaxOp : KunIr_Op<"reduce_argmax",
+    [Pure, SameOperandsAndResultType]> {
+  let summary = "ArgMax reduction — window-relative index (window-1-w) of "
+                  "the largest element; NaN propagates.";
+  let arguments = (ins KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$value `:` type($value) attr-dict";
+}
+def KunIr_ReduceRankOp : KunIr_Op<"reduce_rank",
+    [Pure, SameOperandsAndResultType]> {
+  let summary = "Per-window time-series rank of `current` against the "
+                  "iterated window values: less_count + (eq_count + 1) / 2.";
+  let arguments = (ins KunIr_AnyTs:$value, KunIr_AnyTs:$current);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$value `,` $current `:` type($value) `,` type($current) attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // BackRef — read input value at t - window
 //
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index 1fc8c3c..f03b32b 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -622,16 +622,48 @@ struct TsGetPattern : OpConversionPattern<TsGetOp> {
       rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
     } else {
       // ── global ts (function arg, TxS layout) ──────────────────────
-      //   effective time = (enclosing scf.for iv) − offset
-      //   load gmem[effTime * num_stocks + stock_id]
+      // Load gmem[(timeIdx - offset) * num_stocks + sid].  When offset
+      // is a known zero (the common ts.get for current time) we skip
+      // the bounds guard; otherwise wrap in `scf.if (t >= offset)`
+      // returning NaN out-of-bounds to mirror CPU `InputTS::getWindow`.
       Value timeIdx = getCurrentTimeIdx(op);
       Value offsetIdx = arith::IndexCastOp::create(
           rewriter, loc, idxTy, offsetI32);
+      bool offsetIsZero = false;
+      if (auto a = offsetI32.getDefiningOp<arith::ConstantOp>())
+        offsetIsZero = (llvm::cast<IntegerAttr>(a.getValue()).getInt() == 0);
+      else if (auto l = offsetI32.getDefiningOp<LLVM::ConstantOp>())
+        offsetIsZero = (llvm::cast<IntegerAttr>(l.getValue()).getInt() == 0);
+
       Value gep = gmemGEPWithOffset(rewriter, loc, elemTy, ptrTy, tsPtr,
                                      timeIdx, offsetIdx,
                                      getNumStocksI64(rewriter, op, loc),
                                      idxTy, i64Ty);
-      rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
+      if (offsetIsZero) {
+        rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
+        return success();
+      }
+      Value inRange = arith::CmpIOp::create(
+          rewriter, loc, arith::CmpIPredicate::sge, timeIdx, offsetIdx);
+      auto ifOp = scf::IfOp::create(rewriter, loc, TypeRange{elemTy},
+                                       inRange, /*withElseRegion=*/true);
+      {
+        OpBuilder::InsertionGuard g(rewriter);
+        rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
+        Value loaded = LLVM::LoadOp::create(rewriter, loc, elemTy, gep);
+        scf::YieldOp::create(rewriter, loc, loaded);
+      }
+      {
+        OpBuilder::InsertionGuard g(rewriter);
+        rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front());
+        Value nanV = LLVM::ConstantOp::create(
+            rewriter, loc,
+            llvm::cast<FloatType>(elemTy),
+            rewriter.getFloatAttr(elemTy,
+                                    std::numeric_limits<double>::quiet_NaN()));
+        scf::YieldOp::create(rewriter, loc, nanV);
+      }
+      rewriter.replaceOp(op, ifOp.getResult(0));
     }
     return success();
   }
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index e0c5ccb..bd3baef 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -172,6 +172,9 @@ LogicalResult ReduceAddOp::verify() { return verifyInsideForEachBackWindow(*this
 LogicalResult ReduceMulOp::verify() { return verifyInsideForEachBackWindow(*this); }
 LogicalResult ReduceMaxOp::verify() { return verifyInsideForEachBackWindow(*this); }
 LogicalResult ReduceMinOp::verify() { return verifyInsideForEachBackWindow(*this); }
+LogicalResult ReduceArgMinOp::verify() { return verifyInsideForEachBackWindow(*this); }
+LogicalResult ReduceArgMaxOp::verify() { return verifyInsideForEachBackWindow(*this); }
+LogicalResult ReduceRankOp::verify()   { return verifyInsideForEachBackWindow(*this); }
 
 //===----------------------------------------------------------------------===//
 // BackRef + FastWindowedSum — share a verifier (same shape / constraints)
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index d369c64..0e42337 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -85,6 +85,14 @@ struct LowerHelper {
   Value zeroOffsetI32;
   Value outerTimeIdx;   // outer scf.for induction var (index)
   Value outerLoopLb;    // outer scf.for lower bound (index)
+  // Inside a for_each_back_window body: the current window step offset
+  // (window-1-w).  Used by argmin/argmax to record the position index.
+  Value windowedOffsetI32;
+  // Running accumulators for each reduce op in the enclosing FBW body.
+  // Single-state reduce: 1 entry; argmin/max: {best_val, best_idx};
+  // rank: {less_count, eq_count}.  Seeded by FBW pre-loop, updated by
+  // each reduce step, read by scf.yield.
+  llvm::DenseMap<Value, SmallVector<Value, 2>> multiAccs;
 
   // Shared util: look up `v` (a ts SSA value) in tsMap, emit
   // ts.get(handle, offsetI32), return the loaded scalar.  Does NOT touch
@@ -120,6 +128,96 @@ struct LowerHelper {
     return scalar;
   }
 
+  // One step of a multi-state reduce (argmin/argmax/rank).  Mirrors
+  // cpp/Kun/Ops.hpp's step() exactly so CPU and GPU match bit-for-bit
+  // (modulo reduction-order changes).
+  LogicalResult lowerMultiReduce(Operation *op, OpBuilder &b, Location ol) {
+    auto isArgMin = isa<kunir::ReduceArgMinOp>(op);
+    auto isArgMax = isa<kunir::ReduceArgMaxOp>(op);
+    auto isRank   = isa<kunir::ReduceRankOp>(op);
+    assert(isArgMin || isArgMax || isRank);
+
+    KUN_ASSIGN_OR_FAIL(Value elem, getScalar(op->getOperand(0), b, ol));
+    FloatType elemTy = llvm::cast<FloatType>(elem.getType());
+    auto &accs = multiAccs[op->getResult(0)];
+    assert(accs.size() == 2 &&
+           "multi-state reduce must be pre-seeded with 2 iter_args");
+
+    auto fconst = [&](double v) {
+      return arith::ConstantOp::create(b, ol, elemTy,
+                                          b.getFloatAttr(elemTy, v))
+          .getResult();
+    };
+    auto fIsNan = [&](Value v) {
+      return arith::CmpFOp::create(b, ol, arith::CmpFPredicate::UNE, v, v)
+          .getResult();
+    };
+    Value nanF = fconst(std::numeric_limits<double>::quiet_NaN());
+    Value one  = fconst(1.0);
+
+    if (isArgMin || isArgMax) {
+      // accs = {best_val, best_idx}.  Ordered compare so NaN doesn't
+      // trigger the update; NaN is propagated by the final selects.
+      Value bestVal = accs[0];
+      Value bestIdx = accs[1];
+      Value bestIsNan = fIsNan(bestVal);
+      Value elemIsNan = fIsNan(elem);
+      auto pred = isArgMin ? arith::CmpFPredicate::OGT
+                            : arith::CmpFPredicate::OLT;
+      Value cmp = arith::CmpFOp::create(b, ol, pred, bestVal, elem)
+                      .getResult();
+      Value newVal = arith::SelectOp::create(b, ol, cmp, elem, bestVal)
+                          .getResult();
+      // Record the window-relative position (window-1-w) so
+      // TsArgMin = window - ReduceArgMin gives pandas's
+      // np.argmin()+1 convention (1=oldest, window=newest).
+      Value wIdxF = arith::SIToFPOp::create(b, ol, elemTy,
+                                                windowedOffsetI32)
+                        .getResult();
+      Value newIdx = arith::SelectOp::create(b, ol, cmp, wIdxF, bestIdx)
+                          .getResult();
+      Value anyNan = arith::OrIOp::create(b, ol, bestIsNan, elemIsNan)
+                          .getResult();
+      newVal = arith::SelectOp::create(b, ol, anyNan, nanF, newVal)
+                  .getResult();
+      newIdx = arith::SelectOp::create(b, ol, anyNan, nanF, newIdx)
+                  .getResult();
+      accs[0] = newVal;
+      accs[1] = newIdx;
+      return success();
+    }
+
+    // ReduceRank: accs = {less_count, eq_count}; `current` is an
+    // outer-scope ts<f, 1> already in scalarMap.
+    KUN_ASSIGN_OR_FAIL(Value cur, getScalar(op->getOperand(1), b, ol));
+    Value lessCnt = accs[0];
+    Value eqCnt   = accs[1];
+    Value curIsNan  = fIsNan(cur);
+    Value elemIsNan = fIsNan(elem);
+    Value anyNan    = arith::OrIOp::create(b, ol, curIsNan, elemIsNan)
+                          .getResult();
+    Value cmpLess = arith::CmpFOp::create(
+                        b, ol, arith::CmpFPredicate::OLT, elem, cur)
+                        .getResult();
+    Value cmpEq   = arith::CmpFOp::create(
+                        b, ol, arith::CmpFPredicate::OEQ, elem, cur)
+                        .getResult();
+    Value lessP1 = arith::AddFOp::create(b, ol, lessCnt, one).getResult();
+    Value newLess = arith::SelectOp::create(b, ol, cmpLess, lessP1, lessCnt)
+                        .getResult();
+    // NaN routed only into less_count — the final rank extract
+    // (`less + (eq + 1) / 2`, computed after the scf.for) then
+    // propagates NaN out.
+    newLess = arith::SelectOp::create(b, ol, anyNan, nanF, newLess)
+                  .getResult();
+    Value eqP1   = arith::AddFOp::create(b, ol, eqCnt, one).getResult();
+    Value newEq  = arith::SelectOp::create(b, ol, cmpEq, eqP1, eqCnt)
+                        .getResult();
+    accs[0] = newLess;
+    accs[1] = newEq;
+    return success();
+  }
+
   // Lower non-terminator ops in `ops` in definition order.
   //
   // For each op:
@@ -137,11 +235,18 @@ struct LowerHelper {
         KUN_ASSIGN_OR_FAIL(Value operand, getScalar(op->getOperand(0), b, ol));
         scalarMap[op->getResult(0)] = iface.buildScalarOp(b, ol, operand);
       } else if (auto ri = dyn_cast<ReduceArithInterface>(op)) {
+        // Running acc lives in multiAccs[result][0] (see FBW lowering
+        // for the pre-seed); single- and multi-state reduces share
+        // the same storage so scf.yield reads them uniformly.
         KUN_ASSIGN_OR_FAIL(Value elem, getScalar(op->getOperand(0), b, ol));
-        auto it = scalarMap.find(op->getResult(0));
-        assert(it != scalarMap.end() &&
-               "reduce result must be pre-seeded in scalarMap with current acc");
-        it->second = ri.buildAccumOp(b, ol, it->second, elem);
+        auto mit = multiAccs.find(op->getResult(0));
+        assert(mit != multiAccs.end() && mit->second.size() == 1 &&
+               "reduce result must be pre-seeded in multiAccs with current acc");
+        mit->second[0] = ri.buildAccumOp(b, ol, mit->second[0], elem);
+      } else if (isa<kunir::ReduceArgMinOp, kunir::ReduceArgMaxOp,
+                       kunir::ReduceRankOp>(op)) {
+        if (failed(lowerMultiReduce(op, b, ol)))
+          return failure();
       } else if (auto sel = dyn_cast<SelectOp>(op)) {
         KUN_ASSIGN_OR_FAIL(Value cond, getScalar(sel.getCond(),      b, ol));
         KUN_ASSIGN_OR_FAIL(Value tv,   getScalar(sel.getTrueValue(), b, ol));
@@ -379,19 +484,51 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
                               "must be a ts handle");
       }
 
-      // Each yield operand must come from a reduce_* op — collect init values.
+      // Build the iter_args layout: single-state reduce = 1 init,
+      // argmin/max = (best_val, best_idx), rank = (less, eq).
+      struct ReduceSlot {
+        int numAccs;
+        int startIdx;
+      };
+      SmallVector<ReduceSlot> slots; // parallel to yieldOp.getValues()
       SmallVector<Value> initVals;
+      auto elemTyOf = [](Operation *defOp) -> FloatType {
+        return llvm::cast<FloatType>(
+            llvm::cast<TsType>(defOp->getOperand(0).getType()).getElementType());
+      };
+      auto pushConst = [&](FloatType elemTy, double v) {
+        initVals.push_back(arith::ConstantOp::create(
+            fb, ol, elemTy, fb.getFloatAttr(elemTy, v)));
+      };
       for (Value yv : yieldOp.getValues()) {
         auto *defOp = yv.getDefiningOp();
-        auto ri = defOp ? dyn_cast<ReduceArithInterface>(defOp)
-                        : ReduceArithInterface{};
-        if (!ri) {
+        if (!defOp) {
+          return op.emitError("kunir-to-kungpu: for_each_back_window yield "
+                              "operand has no defining op");
+        }
+        ReduceSlot slot{0, (int)initVals.size()};
+        if (auto ri = dyn_cast<ReduceArithInterface>(defOp)) {
+          FloatType elemTy = elemTyOf(defOp);
+          initVals.push_back(arith::ConstantOp::create(
+              fb, ol, ri.getInitValue(elemTy)));
+          slot.numAccs = 1;
+        } else if (isa<kunir::ReduceArgMinOp>(defOp) ||
+                     isa<kunir::ReduceArgMaxOp>(defOp)) {
+          FloatType elemTy = elemTyOf(defOp);
+          double inf = std::numeric_limits<double>::infinity();
+          pushConst(elemTy, isa<kunir::ReduceArgMinOp>(defOp) ? inf : -inf);
+          pushConst(elemTy, 0.0);
+          slot.numAccs = 2;
+        } else if (isa<kunir::ReduceRankOp>(defOp)) {
+          FloatType elemTy = elemTyOf(defOp);
+          pushConst(elemTy, 0.0);
+          pushConst(elemTy, 0.0);
+          slot.numAccs = 2;
+        } else {
           return op.emitError("kunir-to-kungpu: for_each_back_window yield "
                               "operand must come from a reduce_* op");
         }
-        auto elemTy = llvm::cast<FloatType>(
-            llvm::cast<TsType>(defOp->getOperand(0).getType()).getElementType());
-        initVals.push_back(arith::ConstantOp::create(fb, ol, ri.getInitValue(elemTy)));
+        slots.push_back(slot);
       }
 
       // Create inner scf.for %w = 0 to window step 1 iter_args(acc_i = init_i).
@@ -429,6 +566,10 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
             LowerHelper inner{outer.tsMap, outer.scalarMap,
                                 outer.zeroOffsetI32,
                                 outer.outerTimeIdx, outer.outerLoopLb};
+            // Hand the inner helper the current window-step offset
+            // (window-1-w) so multi-state reductions (argmin/argmax)
+            // can use it as the recorded `index`.
+            inner.windowedOffsetI32 = windowedOffset;
             for (auto [i, arg] : llvm::enumerate(body.getArguments())) {
               auto r = inner.getScalarUncached(fwOp.getInputs()[i],
                                                 windowedOffset, ib, il);
@@ -439,8 +580,15 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
               }
               inner.scalarMap[arg] = *r;
             }
-            for (auto [i, yv] : llvm::enumerate(yieldOp.getValues()))
-              inner.scalarMap[yv] = iterArgs[i];
+            // Pre-seed accumulators from iter_args.
+            for (auto [i, yv] : llvm::enumerate(yieldOp.getValues())) {
+              const auto &slot = slots[i];
+              SmallVector<Value, 2> accs;
+              accs.reserve(slot.numAccs);
+              for (int j = 0; j < slot.numAccs; ++j)
+                accs.push_back(iterArgs[slot.startIdx + j]);
+              inner.multiAccs[yv] = std::move(accs);
+            }
 
             if (failed(inner.lowerBlock(body, ib))) {
               innerOk = false;
@@ -448,17 +596,48 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
               return;
             }
 
-            SmallVector<Value> newAccs;
-            for (Value yv : yieldOp.getValues())
-              newAccs.push_back(inner.scalarMap.find(yv)->second);
+            // Yield the updated accumulators back into the iter_args.
+            SmallVector<Value> newAccs(initVals.size());
+            for (auto [i, yv] : llvm::enumerate(yieldOp.getValues())) {
+              const auto &slot = slots[i];
+              const auto &accs = inner.multiAccs[yv];
+              for (int j = 0; j < slot.numAccs; ++j)
+                newAccs[slot.startIdx + j] = accs[j];
+            }
             scf::YieldOp::create(ib, il, newAccs);
           });
       if (!innerOk) return failure();
 
-      // Map for_each_back_window results (scalar reduce accs) to the inner
-      // for's results.
-      for (auto [i, res] : llvm::enumerate(fwOp.getResults()))
-        outer.scalarMap[res] = innerFor.getResult(i);
+      // Project each fwOp result from the inner-for's iter_arg slice:
+      // single-state passes through, argmin/max returns best_idx, rank
+      // computes less + (eq + 1) / 2.
+      OpBuilder::InsertionGuard guardPost(b);
+      b.setInsertionPointAfter(innerFor);
+      for (auto [i, res] : llvm::enumerate(fwOp.getResults())) {
+        const auto &slot = slots[i];
+        Value yv = yieldOp.getValues()[i];
+        auto *defOp = yv.getDefiningOp();
+        Value finalVal;
+        if (slot.numAccs == 1) {
+          finalVal = innerFor.getResult(slot.startIdx);
+        } else if (isa<kunir::ReduceArgMinOp>(defOp) ||
+                     isa<kunir::ReduceArgMaxOp>(defOp)) {
+          finalVal = innerFor.getResult(slot.startIdx + 1);
+        } else {
+          // ReduceRankOp:  less + (eq + 1) / 2
+          Value less = innerFor.getResult(slot.startIdx);
+          Value eq   = innerFor.getResult(slot.startIdx + 1);
+          auto elemTy = llvm::cast<FloatType>(less.getType());
+          Value one = arith::ConstantOp::create(
+              b, ol, elemTy, b.getFloatAttr(elemTy, 1.0));
+          Value two = arith::ConstantOp::create(
+              b, ol, elemTy, b.getFloatAttr(elemTy, 2.0));
+          Value eqp1 = arith::AddFOp::create(b, ol, eq, one);
+          Value half = arith::DivFOp::create(b, ol, eqp1, two);
+          finalVal = arith::AddFOp::create(b, ol, less, half);
+        }
+        outer.scalarMap[res] = finalVal;
+      }
       return success();
     }
 
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 81c52f2..8caea20 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -252,6 +252,12 @@ class IRBuilder {
   Value reduceMulOp(Value x) { return makeReduce<kunir::ReduceMulOp>(x); }
   Value reduceMaxOp(Value x) { return makeReduce<kunir::ReduceMaxOp>(x); }
   Value reduceMinOp(Value x) { return makeReduce<kunir::ReduceMinOp>(x); }
+  Value reduceArgMinOp(Value x) { return makeReduce<kunir::ReduceArgMinOp>(x); }
+  Value reduceArgMaxOp(Value x) { return makeReduce<kunir::ReduceArgMaxOp>(x); }
+  Value reduceRankOp(Value x, Value cur) {
+    // SameOperandsAndResultType — pass x's type as the result type.
+    return kunir::ReduceRankOp::create(b_, b_.getUnknownLoc(), x.getType(), x, cur);
+  }
 
   // ── Finalize ──────────────────────────────────────────────────────
   std::unique_ptr<PyModule> finish() {
@@ -415,6 +421,10 @@ void registerIRBuilder(nb::module_ &m) {
       .def("reduce_mul", &IRBuilder::reduceMulOp, nb::arg("x"))
       .def("reduce_max", &IRBuilder::reduceMaxOp, nb::arg("x"))
       .def("reduce_min", &IRBuilder::reduceMinOp, nb::arg("x"))
+      .def("reduce_argmin", &IRBuilder::reduceArgMinOp, nb::arg("x"))
+      .def("reduce_argmax", &IRBuilder::reduceArgMaxOp, nb::arg("x"))
+      .def("reduce_rank",   &IRBuilder::reduceRankOp,
+            nb::arg("x"), nb::arg("current"))
 
       // Finalize / debug
       .def("to_string", &IRBuilder::toString,
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index 5ac5907..2888666 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -160,12 +160,9 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
     "test_runtime",
     "test_ema",                # ExpMovingAvg not in CodegenMLIR
     "test_ema_init",           # same
-    "test_argmin_issue19",     # ReduceArgMin / ReduceRank not in CodegenMLIR
     "test_aligned",            # CPU-only shape-error check
-    "test_skew_kurt",          # WindowedSkew/Kurt decompose not GPU-ready
     "test_loop_index",         # WindowedMaxDrawdown / WindowLoopIndex
     "test_quantile",           # SkipList
-    "test_large_rank",         # SkipList (TsRank/etc. with large window)
     "test_stream_double",
     "test_repro_crash_gh_issue_71",
     "test_generic_cross_sectional",
@@ -185,6 +182,9 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
     "test_log64",           # float64
     "test_pow",             # Pow → Exp(Log(x) * expo) + Sqrt special-case
     "test_covar",           # WindowedCovariance + WindowedCorrelation, double
+    "test_skew",            # WindowedSkew/Kurt (both fast & slow paths)
+    "test_large_rank",      # TsRank/TsArgMin/Max via naive FBW (no_skip_list)
+    "test_argmin",          # TsArgMin/TsRank/WindowedMin small-window
 }
 
 
@@ -474,8 +474,8 @@ def test_large_rank(lib):
     # test with duplicates
     inp[400:410,:] = -1
     # inp[1400:1410,:] = 10
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 2000)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 2000)
     outrank = out["ou1"]
     df = pd.DataFrame(inp)
     expected_rank = df.rolling(200).rank().to_numpy()
@@ -557,8 +557,8 @@ def test_argmin_issue19(lib):
     data = [ 0.6898481863442985, 0.6992020600574415, 0.6992020600574417, 0.6968635916291558, 0.6968635916291558, 0.6968635916291558 ]
     for i in range(6):
         inp[i, :] = data[i]
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 6)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 6)
     df = pd.DataFrame(inp)
     expected =df.rolling(5, min_periods=1).apply(lambda x: x.argmin() + 1, raw=True)
     output = out["ou2"][4:]
@@ -808,8 +808,8 @@ def test_skew_kurt():
     modu = lib.getModule("test_skew")
     assert(modu)
     inp = np.random.rand(20, 24)
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 20)
     output = out["ou2"]
     df = pd.DataFrame(inp)
     expected = df.rolling(5).skew()

From accb202685324ec8eda9d7538d6203fd41c673f0 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 17 May 2026 23:30:12 -0700
Subject: [PATCH 35/59] WindowLoopIndex

---
 KunQuant/passes/CodegenMLIR.py   |  6 +++++-
 mlir/include/KunIr/KunIrOps.td   | 12 ++++++++++++
 mlir/lib/KunIr/KunIrOps.cpp      |  3 +++
 mlir/lib/KunIr/KunIrToKunGpu.cpp | 13 ++++++++++++-
 mlir/lib/Python/IRBuilder.cpp    |  5 +++++
 tests/test_runtime.py            |  6 +++---
 6 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 44c93aa..bbe6b50 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -28,7 +28,7 @@
 
 from KunQuant.Op import (
     OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
-    ReductionOp, SimpleCrossSectionalOp, ConstantOp,
+    WindowLoopIndex, ReductionOp, SimpleCrossSectionalOp, ConstantOp,
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
@@ -174,6 +174,10 @@ def _emit_simple(op: OpBase,
         v = op.attrs["value"]
         fv = float("nan") if v == "nan" else float(v)
         return ir.constant(fv, ts_1)
+    if isinstance(op, WindowLoopIndex):
+        # Resolved by the kunir → kungpu pass to the enclosing
+        # for_each_back_window's induction variable.
+        return ir.window_loop_index(ts_1)
     if isinstance(op, Accumulator):
         # The Python op's `inputs[0]` is a keep-alive in the graph IR;
         # it does NOT feed the slot.  Only the `name` attr matters at
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 62d09c7..2603679 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -387,6 +387,18 @@ def KunIr_ReduceRankOp : KunIr_Op<"reduce_rank",
     "$value `,` $current `:` type($value) `,` type($current) attr-dict";
 }
 
+//===----------------------------------------------------------------------===//
+// WindowLoopIndex — current FBW step index, 0 = oldest, window-1 = newest.
+// Result is the integer index converted to the function's element type.
+//===----------------------------------------------------------------------===//
+
+def KunIr_WindowLoopIndexOp : KunIr_Op<"window_loop_index", [Pure]> {
+  let summary = "Current step index of the enclosing for_each_back_window";
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "`:` type($result) attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // BackRef — read input value at t - window
 //
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index bd3baef..14248e1 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -175,6 +175,9 @@ LogicalResult ReduceMinOp::verify() { return verifyInsideForEachBackWindow(*this
 LogicalResult ReduceArgMinOp::verify() { return verifyInsideForEachBackWindow(*this); }
 LogicalResult ReduceArgMaxOp::verify() { return verifyInsideForEachBackWindow(*this); }
 LogicalResult ReduceRankOp::verify()   { return verifyInsideForEachBackWindow(*this); }
+LogicalResult WindowLoopIndexOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
 
 //===----------------------------------------------------------------------===//
 // BackRef + FastWindowedSum — share a verifier (same shape / constraints)
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 0e42337..b4ed53c 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -88,6 +88,9 @@ struct LowerHelper {
   // Inside a for_each_back_window body: the current window step offset
   // (window-1-w).  Used by argmin/argmax to record the position index.
   Value windowedOffsetI32;
+  // Inside a for_each_back_window body: the raw step index `w` (0 to
+  // window-1, 0 = oldest).  Used by `kunir.window_loop_index`.
+  Value windowIdxI32;
   // Running accumulators for each reduce op in the enclosing FBW body.
   // Single-state reduce: 1 entry; argmin/max: {best_val, best_idx};
   // rank: {less_count, eq_count}.  Seeded by FBW pre-loop, updated by
@@ -247,6 +250,12 @@ struct LowerHelper {
                        kunir::ReduceRankOp>(op)) {
         if (failed(lowerMultiReduce(op, b, ol)))
           return failure();
+      } else if (auto wli = dyn_cast<kunir::WindowLoopIndexOp>(op)) {
+        // sitofp(w, elemTy) — `w` is the enclosing scf.for's IV.
+        auto resTsTy = llvm::cast<TsType>(wli.getResult().getType());
+        auto elemTy = llvm::cast<FloatType>(resTsTy.getElementType());
+        scalarMap[wli.getResult()] =
+            arith::SIToFPOp::create(b, ol, elemTy, windowIdxI32).getResult();
       } else if (auto sel = dyn_cast<SelectOp>(op)) {
         KUN_ASSIGN_OR_FAIL(Value cond, getScalar(sel.getCond(),      b, ol));
         KUN_ASSIGN_OR_FAIL(Value tv,   getScalar(sel.getTrueValue(), b, ol));
@@ -568,8 +577,10 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
                                 outer.outerTimeIdx, outer.outerLoopLb};
             // Hand the inner helper the current window-step offset
             // (window-1-w) so multi-state reductions (argmin/argmax)
-            // can use it as the recorded `index`.
+            // can use it as the recorded `index`, and the raw step
+            // index `w` for `kunir.window_loop_index`.
             inner.windowedOffsetI32 = windowedOffset;
+            inner.windowIdxI32 = w_i32;
             for (auto [i, arg] : llvm::enumerate(body.getArguments())) {
               auto r = inner.getScalarUncached(fwOp.getInputs()[i],
                                                 windowedOffset, ib, il);
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 8caea20..4391919 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -258,6 +258,9 @@ class IRBuilder {
     // SameOperandsAndResultType — pass x's type as the result type.
     return kunir::ReduceRankOp::create(b_, b_.getUnknownLoc(), x.getType(), x, cur);
   }
+  Value windowLoopIndexOp(Type ts_ty) {
+    return kunir::WindowLoopIndexOp::create(b_, b_.getUnknownLoc(), ts_ty);
+  }
 
   // ── Finalize ──────────────────────────────────────────────────────
   std::unique_ptr<PyModule> finish() {
@@ -425,6 +428,8 @@ void registerIRBuilder(nb::module_ &m) {
       .def("reduce_argmax", &IRBuilder::reduceArgMaxOp, nb::arg("x"))
       .def("reduce_rank",   &IRBuilder::reduceRankOp,
             nb::arg("x"), nb::arg("current"))
+      .def("window_loop_index", &IRBuilder::windowLoopIndexOp,
+            nb::arg("ts_ty"))
 
       // Finalize / debug
       .def("to_string", &IRBuilder::toString,
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index 2888666..e373528 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -161,7 +161,6 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
     "test_ema",                # ExpMovingAvg not in CodegenMLIR
     "test_ema_init",           # same
     "test_aligned",            # CPU-only shape-error check
-    "test_loop_index",         # WindowedMaxDrawdown / WindowLoopIndex
     "test_quantile",           # SkipList
     "test_stream_double",
     "test_repro_crash_gh_issue_71",
@@ -185,6 +184,7 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
     "test_skew",            # WindowedSkew/Kurt (both fast & slow paths)
     "test_large_rank",      # TsRank/TsArgMin/Max via naive FBW (no_skip_list)
     "test_argmin",          # TsArgMin/TsRank/WindowedMin small-window
+    "test_max_drawdown",    # WindowedMaxDrawdown (uses WindowLoopIndex)
 }
 
 
@@ -900,8 +900,8 @@ def test_loop_index():
     modu = lib.getModule("test_max_drawdown")
     assert(modu)
     inp = np.random.rand(20, 24).astype("float32")
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 20)
     output = out["out"]
     
     # reference implementation, from https://stackoverflow.com/a/21059308. Modified for our version of maxdd

From 2f7394c93e663b424b0d99a03200fc062c644b9b Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 18 May 2026 02:59:57 -0700
Subject: [PATCH 36/59] ema and linear regression

---
 KunQuant/Driver.py                    |   4 +
 KunQuant/jit/cuda.py                  |   4 +
 KunQuant/ops/MiscOp.py                |  23 ++-
 KunQuant/passes/CodegenMLIR.py        |  22 ++-
 KunQuant/passes/ExperimentalExpand.py | 255 ++++++++++++++++++++++++++
 KunQuant/passes/__init__.py           |   3 +-
 doc/Operators.md                      |  32 +++-
 mlir/include/KunGpu/KunGpuOps.td      |  10 +-
 mlir/include/KunIr/KunIrOps.td        |  17 +-
 mlir/lib/KunGpu/KunGpuToLLVM.cpp      |   9 +-
 mlir/lib/KunIr/KunIrOps.cpp           |   9 +
 mlir/lib/KunIr/KunIrToKunGpu.cpp      |  10 +-
 mlir/lib/Python/IRBuilder.cpp         |  18 +-
 tests/test_runtime.py                 |   8 +-
 14 files changed, 381 insertions(+), 43 deletions(-)
 create mode 100644 KunQuant/passes/ExperimentalExpand.py

diff --git a/KunQuant/Driver.py b/KunQuant/Driver.py
index 513a75d..5e93469 100644
--- a/KunQuant/Driver.py
+++ b/KunQuant/Driver.py
@@ -29,6 +29,10 @@ def optimize(f: Function, options: dict)->Dict[str, int]:
     # optimize before decompose to let value ranges work
     special_optimize(f, options)
     decompose(f, options)
+    # Experimental: expand stateful ops (ExpMovingAvg / WindowedLinearRegression*)
+    # into Accumulator chains.  No-op on the CPU pipeline (gated on
+    # options["experimental_expand"]); currently enabled by the GPU backend.
+    experimental_expand(f, options)
     expr_fold(f, options)
     special_optimize(f, options)
     expr_fold(f, options)
diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 87c09f9..8253b18 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -159,6 +159,10 @@ def _gpu_pass_options(kcfg: KunCompilerConfig) -> dict:
     if kcfg.options:
         opts.update(kcfg.options)
     opts["no_skip_list"] = True
+    # Pipeline lowering doesn't know about ExpMovingAvg or the
+    # WindowedLinearRegression* family — turn on the Accumulator-based
+    # expansion pass instead.
+    opts["experimental_expand"] = True
     return opts
 
 
diff --git a/KunQuant/ops/MiscOp.py b/KunQuant/ops/MiscOp.py
index 01ea1af..ba6fa17 100644
--- a/KunQuant/ops/MiscOp.py
+++ b/KunQuant/ops/MiscOp.py
@@ -32,17 +32,32 @@ def generate_step_code(self, idx: str, time_idx: str, inputs: List[str], buf_nam
 class Accumulator(OpBase, GlobalStatefulProducerTrait, MayRequireWholeTime):
     '''
     Accumulator is a stateful op that accumulates the input value over time.
-    It can be used to compute running totals, moving averages, etc.'''
+    It can be used to compute running totals, moving averages, etc.
+
+    `init_val` is the initial scalar stored in the slot before the first
+    time step.  Pass a float (default 0) for a plain numeric init, or the
+    string "nan" for a NaN init (mirrors ConstantOp's "nan" handling).
+    '''
     def __init__(self, v: OpBase, name: str,
-                  is_whole_time_required: bool = False) -> None:
+                  is_whole_time_required: bool = False,
+                  init_val: Union[float, str] = 0) -> None:
+        if isinstance(init_val, str) and init_val != "nan":
+            raise RuntimeError(
+                f"Accumulator init_val str must be 'nan', got {init_val!r}")
         super().__init__([v],
                           [("name", name),
-                           ("whole_time", is_whole_time_required)])
+                           ("whole_time", is_whole_time_required),
+                           ("init_val", init_val)])
     def is_whole_time_required(self) -> bool:
         return self.attrs["whole_time"]
     def get_state_variable_name_prefix(self) -> str:
         return "accu_"
-    
+
+    def generate_init_code(self, idx: str, elem_type: str, simd_lanes: int, inputs: List[str], aligned: bool) -> str:
+        from KunQuant.passes.CodegenCpp import _float_value_to_float
+        init = _float_value_to_float(self.attrs["init_val"], elem_type)
+        return f"{self.get_func_or_class_full_name(elem_type, simd_lanes)} {self.get_state_variable_name_prefix()}{idx} {{ {init} }};"
+
     def generate_step_code(self, idx: str, time_idx: str, inputs: List[str]) -> str:
         return f"auto v{idx} = accu_{idx}.asValue();"
 
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index bbe6b50..0d67716 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -180,16 +180,20 @@ def _emit_simple(op: OpBase,
         return ir.window_loop_index(ts_1)
     if isinstance(op, Accumulator):
         # The Python op's `inputs[0]` is a keep-alive in the graph IR;
-        # it does NOT feed the slot.  Only the `name` attr matters at
-        # the MLIR level — same-name accumulators CSE to one slot.
-        return ir.accumulator(op.attrs["name"], ts_1)
+        # it does NOT feed the slot.  The `name` attr is informational;
+        # each op identifies a distinct slot (kunir.accumulator is not
+        # Pure, so MLIR CSE will not dedup two accumulators).
+        init_v = op.attrs["init_val"]
+        init_f = float("nan") if init_v == "nan" else float(init_v)
+        return ir.accumulator(op.attrs["name"], ts_1, init_f)
     if isinstance(op, SetAccumulator):
-        # Side-effect: returns no SSA value.  ReturnFirstValue is what
-        # keeps this op alive in the Python graph (see MiscOp.py).
-        ir.set_accumulator(val_map[op.inputs[0]],
-                            val_map[op.inputs[1]],
-                            val_map[op.inputs[2]])
-        return None
+        # Side-effecting (writes the slot) but also returns the slot's
+        # new value for the current step (`mask ? value : prev`), so
+        # downstream consumers can use the SetAccumulator's SSA result
+        # directly — matches the CPU C++ SetAccumulator semantics.
+        return ir.set_accumulator(val_map[op.inputs[0]],
+                                   val_map[op.inputs[1]],
+                                   val_map[op.inputs[2]])
     if isinstance(op, ReturnFirstValue):
         # In the Python graph IR, ReturnFirstValue's only job is to keep
         # side-effecting siblings (SetAccumulator etc.) reachable from a
diff --git a/KunQuant/passes/ExperimentalExpand.py b/KunQuant/passes/ExperimentalExpand.py
new file mode 100644
index 0000000..782318f
--- /dev/null
+++ b/KunQuant/passes/ExperimentalExpand.py
@@ -0,0 +1,255 @@
+"""Experimental stateful-op expansion pass (currently GPU-only).
+
+Gated on ``options["experimental_expand"]`` — when False (default), the
+pass returns immediately so the CPU pipeline is untouched.  Runs after
+the first ``decompose`` so user-facing composite ops (e.g.
+``WindowedLinearRegressionSlope``) have already been broken into
+``WindowedLinearRegression`` + per-extractor ``Impl`` ops.
+
+Replaces ops that the kunir codegen doesn't lower directly with
+``Accumulator + Select + SetAccumulator`` chains (and FBW reductions) it
+does support:
+
+* ``ExpMovingAvg(v, span)`` → an ``Accumulator(init_val="nan")`` carrying
+  the running EMA.  The NaN init doubles as the "not yet seeded" sentinel
+  — first non-NaN ``v`` is stored verbatim, subsequent non-NaN ``v`` uses
+  the pandas ``ewm(adjust=False, ignore_na=True)`` update.  An ``__init``
+  Input is not supported yet — the pass raises on encounter.
+
+* ``WindowedLinearRegression(v, window)`` → ``FastWindowedSum`` for the
+  running sum / sum-of-squares, plus a ``ForeachBackWindow`` +
+  ``WindowLoopIndex`` + ``ReduceAdd`` for the position-weighted sum_xy.
+  Intermediate ops are stashed in ``state[lin_op] : List[OpBase]`` so each
+  consumer Impl (``Slope``, ``RSqaure``, ``Resi``) can pick the entries it
+  needs and emit its final formula.
+"""
+
+from typing import Dict, List
+
+from KunQuant.Op import (
+    OpBase, Builder, ConstantOp, ForeachBackWindow, IterValue,
+    WindowedTempOutput, WindowLoopIndex,
+)
+from KunQuant.ops.ElewiseOp import Select, Equals, Not
+from KunQuant.ops.ReduceOp import ReduceAdd
+from KunQuant.ops.MiscOp import (
+    FastWindowedSum, Accumulator, SetAccumulator,
+    ExpMovingAvg, WindowedLinearRegression,
+    WindowedLinearRegressionSlopeImpl,
+    WindowedLinearRegressionRSqaureImpl,
+    WindowedLinearRegressionResiImpl,
+)
+from KunQuant.Stage import Function
+from .Util import kun_pass
+
+
+# ── EMA expansion ───────────────────────────────────────────────────
+
+def _expand_ema(op: ExpMovingAvg) -> OpBase:
+    """Build the Accumulator-based chain inside the current Builder.
+
+    The slot is initialised to NaN, which serves as the "not yet seeded"
+    sentinel: a NaN ``prev`` means we still need to seed with the first
+    non-NaN ``x``.  The SetAccumulator's mask is ``notnan_x``, so NaN
+    inputs leave the slot unchanged (pandas ignore_na=True).
+    """
+    if len(op.inputs) >= 2:
+        raise RuntimeError(
+            "experimental_expand: ExpMovingAvg with an `__init` Input is "
+            "not supported yet on the GPU backend")
+    span  = op.attrs["window"]
+    alpha = 2.0 / (span + 1)
+    x     = op.inputs[0]
+
+    # `is_whole_time_required=True` propagates the kernel's
+    # unreliable_count to the sentinel so the runtime collapses to a
+    # single chunk — EMA's per-stock state can't survive a chunk
+    # boundary reset.
+    prev    = Accumulator(x, f"ema_{span}", init_val="nan",
+                          is_whole_time_required=True)
+    notnan_x  = Equals(x, x)
+    prev_nan  = Not(Equals(prev, prev))
+
+    formula = x * alpha + prev * (1.0 - alpha)
+    #   prev is NaN (still warmup):
+    #     - x non-NaN → seed with x
+    #     - x NaN     → keep NaN (Select returns `x`)
+    #   prev is set:
+    #     - x non-NaN → standard formula
+    #     - x NaN     → carry prev unchanged
+    new_ema = Select(prev_nan, x, Select(notnan_x, formula, prev))
+    # mask = notnan_x: on NaN x we don't touch the slot (preserves
+    # both the NaN-sentinel and the carried prev).  SetAccumulator
+    # returns the slot's new value for this step (mask ? value : prev),
+    # which matches `new_ema` here — use it directly as the EMA result.
+    return SetAccumulator(prev, notnan_x, new_ema)
+
+
+# ── WindowedLinearRegression intermediate state ────────────────────
+
+# Slot indices into the per-op `state` list returned by `_expand_linreg`.
+# Consumers index by these constants for clarity.
+_LR_SUM_Y    = 0   # FastWindowedSum(v,    window)
+_LR_SUM_YY   = 1   # FastWindowedSum(v*v,  window)
+_LR_SUM_XY   = 2   # Σ_{i=0..window-1} i * v[t-window+1+i]
+_LR_SLOPE    = 3
+_LR_INTERCEPT = 4
+_LR_V        = 5   # original v (for the Resi consumer)
+
+
+def _expand_linreg(op: WindowedLinearRegression) -> List[OpBase]:
+    """Emit running sums + the closed-form slope/intercept for v
+    regressed on the integer position x = 0..window-1 within the window.
+
+    The x positions are treated as constants (i.e. no NaN-aware
+    re-indexing) — for an input with NaN entries the running sums become
+    NaN via the FastWindowedSum / FBW NaN propagation, and every consumer
+    formula carries that NaN through to its result.
+    """
+    window = op.attrs["window"]
+    v      = op.inputs[0]
+
+    # sum_y = rolling sum of v over the window; NaN until window full.
+    sum_y  = FastWindowedSum(v, window)
+    # sum_yy = rolling sum of v² — same pattern over a v*v intermediate.
+    sum_yy = FastWindowedSum(v * v, window)
+    # sum_xy = Σ idx * v where idx is the window position (0=oldest,
+    # window-1=newest).  Express via FBW + WindowLoopIndex + Mul +
+    # ReduceAdd; OOB reads (warmup) return NaN, so sum_xy is NaN until
+    # the window fills.
+    wtemp = WindowedTempOutput(v, window)
+    with ForeachBackWindow(wtemp, window) as each:
+        idx     = WindowLoopIndex(each)
+        val     = IterValue(each, wtemp)
+        contrib = idx * val
+    sum_xy = ReduceAdd(contrib)
+
+    # Compile-time constants for x:
+    #   sum_x  = Σ i  for i in [0, window)       = window*(window-1)/2
+    #   sum_xx = Σ i² for i in [0, window)       = window*(window-1)*(2*window-1)/6
+    # ⇒ denom = window*sum_xx - sum_x² = window²(window-1)(window+1)/12
+    n      = float(window)
+    sum_x  = n * (n - 1) / 2.0
+    denom  = (n * n) * (n - 1.0) * (n + 1.0) / 12.0   # constant; assume window > 1
+    slope     = (sum_xy * n - sum_y * sum_x) / denom
+    intercept = (sum_y - slope * sum_x) / n
+
+    state = [None] * 6
+    state[_LR_SUM_Y]     = sum_y
+    state[_LR_SUM_YY]    = sum_yy
+    state[_LR_SUM_XY]    = sum_xy
+    state[_LR_SLOPE]     = slope
+    state[_LR_INTERCEPT] = intercept
+    state[_LR_V]         = v
+    return state
+
+
+# ── Consumer formulas (one per Impl op) ─────────────────────────────
+
+def _expand_lr_slope(impl: WindowedLinearRegressionSlopeImpl,
+                     state: List[OpBase]) -> OpBase:
+    return state[_LR_SLOPE]
+
+
+def _expand_lr_rsquare(impl: WindowedLinearRegressionRSqaureImpl,
+                       state: List[OpBase], window: int) -> OpBase:
+    # SS_reg = slope² * (window*sum_xx - sum_x²) / window = slope² * denom / window
+    # SS_tot = sum_yy - sum_y²/window
+    # R²     = SS_reg / SS_tot
+    n     = float(window)
+    denom = (n * n) * (n - 1.0) * (n + 1.0) / 12.0
+    slope = state[_LR_SLOPE]
+    ss_reg = (slope * slope) * (denom / n)
+    ss_tot = state[_LR_SUM_YY] - (state[_LR_SUM_Y] * state[_LR_SUM_Y]) / n
+    return ss_reg / ss_tot
+
+
+def _expand_lr_resi(impl: WindowedLinearRegressionResiImpl,
+                    state: List[OpBase], window: int) -> OpBase:
+    # residual at the newest window position (x = window-1):
+    #   v_t - (slope * (window-1) + intercept)
+    pred = state[_LR_SLOPE] * float(window - 1) + state[_LR_INTERCEPT]
+    return state[_LR_V] - pred
+
+
+# ── Pass driver ─────────────────────────────────────────────────────
+
+def _experimental_expand_impl(
+    ops: List[OpBase], options: dict,
+) -> List[OpBase]:
+    # state[lin_op] = list of intermediate ops; consumers pick by index.
+    state: Dict[OpBase, List[OpBase]] = {}
+    replace_map: Dict[OpBase, OpBase] = {}
+    out: List[OpBase] = []
+    changed = False
+
+    for op in ops:
+        op.replace_inputs(replace_map)
+
+        if isinstance(op, ExpMovingAvg):
+            b = Builder(op.get_parent())
+            with b:
+                new_val = _expand_ema(op)
+            out.extend(b.ops)
+            replace_map[op] = new_val
+            changed = True
+            continue
+
+        if isinstance(op, WindowedLinearRegression):
+            b = Builder(op.get_parent())
+            with b:
+                lin_state = _expand_linreg(op)
+            out.extend(b.ops)
+            state[op] = lin_state
+            # The LinearRegression op produces a "state handle" Value
+            # consumed only by its Impl ops, which we lower below via
+            # `state[]` lookup — so we don't keep `op` in `out` and we
+            # don't enter it in `replace_map`.  Each Impl consumer reaches
+            # this same Python object (the `state` key) via its own `inputs[0]`.
+            changed = True
+            continue
+
+        if isinstance(op, WindowedLinearRegressionSlopeImpl):
+            lin_op = op.inputs[0]
+            b = Builder(op.get_parent())
+            with b:
+                new_val = _expand_lr_slope(op, state[lin_op])
+            out.extend(b.ops)
+            replace_map[op] = new_val
+            changed = True
+            continue
+
+        if isinstance(op, WindowedLinearRegressionRSqaureImpl):
+            lin_op = op.inputs[0]
+            window = lin_op.attrs["window"]
+            b = Builder(op.get_parent())
+            with b:
+                new_val = _expand_lr_rsquare(op, state[lin_op], window)
+            out.extend(b.ops)
+            replace_map[op] = new_val
+            changed = True
+            continue
+
+        if isinstance(op, WindowedLinearRegressionResiImpl):
+            lin_op = op.inputs[0]
+            window = lin_op.attrs["window"]
+            b = Builder(op.get_parent())
+            with b:
+                new_val = _expand_lr_resi(op, state[lin_op], window)
+            out.extend(b.ops)
+            replace_map[op] = new_val
+            changed = True
+            continue
+
+        out.append(op)
+
+    return out if changed else None
+
+
+@kun_pass
+def experimental_expand(f: Function, options: dict = {}):
+    if not options.get("experimental_expand", False):
+        return
+    newops = _experimental_expand_impl(f.ops, options)
+    if newops is not None:
+        f.set_ops(newops)
diff --git a/KunQuant/passes/__init__.py b/KunQuant/passes/__init__.py
index 9299a42..97cdf0d 100644
--- a/KunQuant/passes/__init__.py
+++ b/KunQuant/passes/__init__.py
@@ -6,4 +6,5 @@
 from .CodegenCpp import codegen_cpp
 from .InferWindow import infer_window
 from .InferWindow import infer_input_window
-from .MergeLoops import merge_loops
\ No newline at end of file
+from .MergeLoops import merge_loops
+from .ExperimentalExpand import experimental_expand
\ No newline at end of file
diff --git a/doc/Operators.md b/doc/Operators.md
index 70e69d2..888a900 100644
--- a/doc/Operators.md
+++ b/doc/Operators.md
@@ -456,16 +456,40 @@ class Accumulator(OpBase, GlobalStatefulProducerTrait, MayRequireWholeTime):
     '''
     Accumulator is a stateful op that accumulates the input value over time.
     It can be used to compute running totals, moving averages, etc.
-    Set `is_whole_time_required=True` if the accumulator's state can only
-    be reconstructed from the full time history.
+
+    The first positional input `v` is a graph-keepalive only — it does NOT
+    feed the slot.  The slot's value is governed by `init_val` (its initial
+    contents) and by paired `SetAccumulator` ops (which write the slot).
+
+    Parameters:
+        v: keepalive input (any OpBase in the time-step's value graph).
+        name: human-readable label.  Per-op uniqueness is NOT required —
+            each `Accumulator` op identifies a distinct slot, even when two
+            ops share a name (no CSE / dedup).
+        is_whole_time_required: set to True if the accumulator's state
+            can only be reconstructed from the full time history (forces
+            the runtime to collapse to a single chunk).
+        init_val: initial scalar stored in the slot before the first time
+            step.  Pass a Python float (default `0`) for a numeric init,
+            or the string `"nan"` for a NaN init.  NaN init is useful as
+            a "not-yet-seeded" sentinel for ops like EMA.
     '''
     def __init__(self, v: OpBase, name: str,
-                  is_whole_time_required: bool = False) -> None:
+                  is_whole_time_required: bool = False,
+                  init_val: Union[float, str] = 0) -> None:
         pass
 
 class SetAccumulator(OpBase):
     '''
-    Set the value of an Accumulator to a value, if mask is set. Otherwise, it does nothing.
+    Conditionally overwrite an Accumulator's slot.  When `mask` is true at
+    the current time step, stores `value` into the slot; otherwise the slot
+    is unchanged.
+
+    The op also returns the slot's new value for the current step — i.e.
+    `mask ? value : prev_accumulator`.  Downstream consumers can use the
+    SetAccumulator's SSA result directly as the freshly-written value
+    without re-reading the slot.  `accu` must be the result of an
+    `Accumulator` op.
     '''
     def __init__(self, accu: OpBase, mask: OpBase, value: OpBase) -> None:
         pass
diff --git a/mlir/include/KunGpu/KunGpuOps.td b/mlir/include/KunGpu/KunGpuOps.td
index 0e522fc..6495469 100644
--- a/mlir/include/KunGpu/KunGpuOps.td
+++ b/mlir/include/KunGpu/KunGpuOps.td
@@ -92,17 +92,19 @@ def KunGpu_TimeUbOp : KunGpu_Op<"time_ub", [Pure]> {
 // (or `value` for put) must match the ts element type.
 //===----------------------------------------------------------------------===//
 
-def KunGpu_AccumulatorOp : KunGpu_Op<"accumulator", [Pure]> {
+def KunGpu_AccumulatorOp : KunGpu_Op<"accumulator"> {
   let summary = "Allocate a single-slot per-thread accumulator (alloca)";
   let description = [{
     Allocates a per-thread single-slot register backing a `kunir.accumulator`.
     The result is a `ts<T, 1>` handle that ts.put / ts.get treat at offset 0
     only — there is no time dimension and no circular indexing.  The slot is
-    zero-initialised at allocation time.
+    initialised to `init_val` at allocation time (default 0.0).
 
-    Pure with a `name` StrAttr — same-name accumulators CSE to one slot.
+    NOT Pure: each op carries its own `init_val` and identifies a distinct
+    slot, so dedup'ing same-name accumulators would silently merge state.
   }];
-  let arguments = (ins StrAttr:$name);
+  let arguments = (ins StrAttr:$name,
+                       DefaultValuedAttr<F64Attr, "0.0">:$init_val);
   let results = (outs KunIr_AnyTs:$result);
   let assemblyFormat = "$name `:` type($result) attr-dict";
 }
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 2603679..44ea165 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -193,8 +193,9 @@ def KunIr_ConstantOp : KunIr_Op<"constant", [Pure]> {
 // load and store its current value.  The handle storage is owned by a
 // single LLVM alloca after kungpu-to-llvm lowering.
 //
-// `kunir.accumulator` is Pure with a `name` StrAttr — two accumulators
-// sharing the same name CSE to a single handle (single backing slot).
+// `kunir.accumulator` is NOT Pure: each op identifies a distinct slot
+// with its own `init_val`, so CSE-ing two accumulators to one slot would
+// silently merge their state.
 //
 // `kunir.set_accumulator` is NOT Pure: writing to the slot is a side
 // effect and must not be CSE'd or hoisted past dependent reads.  When the
@@ -202,9 +203,10 @@ def KunIr_ConstantOp : KunIr_Op<"constant", [Pure]> {
 // overwritten with `value`; otherwise it retains the previous value.
 //===----------------------------------------------------------------------===//
 
-def KunIr_AccumulatorOp : KunIr_Op<"accumulator", [Pure]> {
+def KunIr_AccumulatorOp : KunIr_Op<"accumulator"> {
   let summary = "Stateful single-slot scalar register (read via ts.get @0)";
-  let arguments = (ins StrAttr:$name);
+  let arguments = (ins StrAttr:$name,
+                       DefaultValuedAttr<F64Attr, "0.0">:$init_val);
   let results = (outs KunIr_AnyTs:$result);
   let hasVerifier = 1;
   let assemblyFormat = "$name `:` type($result) attr-dict";
@@ -217,13 +219,18 @@ def KunIr_SetAccumulatorOp : KunIr_Op<"set_accumulator"> {
     `mask` is true at the current time step, stores `value` into the
     accumulator slot; otherwise the slot is unchanged.  Side-effecting
     (NOT Pure): never CSE / dedup.
+
+    Returns the slot's new value for the current step — i.e.
+    `mask ? value : prev_accumulator`.  Downstream consumers can read
+    this directly without a second `ts.get` of the slot.
   }];
   let arguments = (ins KunIr_AnyTs:$acc,
                        KunIr_AnyTs:$mask,
                        KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
   let hasVerifier = 1;
   let assemblyFormat =
-    "$acc `,` $mask `,` $value `:` type($acc) `,` type($mask) `,` type($value) attr-dict";
+    "$acc `,` $mask `,` $value `:` type($acc) `,` type($mask) `,` type($value) `->` type($result) attr-dict";
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index f03b32b..ef2f833 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -527,7 +527,7 @@ struct AccumulatorPattern : OpConversionPattern<kungpu::AccumulatorOp> {
       return rewriter.notifyMatchFailure(
           op, "kungpu.accumulator must be inside a gpu.func");
 
-    // Alloca + zero-init at function entry so the slot is well-defined
+    // Alloca + store of `init_val` at function entry so the slot is well-defined
     // before the time loop begins.
     Value bufPtr;
     {
@@ -537,9 +537,10 @@ struct AccumulatorPattern : OpConversionPattern<kungpu::AccumulatorOp> {
       Value c1_i32 = LLVM::ConstantOp::create(
           rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
       bufPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, elemTy, c1_i32);
-      Value zero = LLVM::ConstantOp::create(
-          rewriter, loc, elemTy, rewriter.getZeroAttr(elemTy));
-      LLVM::StoreOp::create(rewriter, loc, zero, bufPtr);
+      double initVal = op.getInitVal().convertToDouble();
+      Value initCst = LLVM::ConstantOp::create(
+          rewriter, loc, elemTy, rewriter.getFloatAttr(elemTy, initVal));
+      LLVM::StoreOp::create(rewriter, loc, initCst, bufPtr);
     }
 
     // posPtr = null → ts.get / ts.put treat as accumulator (slot 0 only).
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 14248e1..9bc7c8a 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -271,6 +271,15 @@ LogicalResult SetAccumulatorOp::verify() {
       llvm::cast<IntegerType>(maskTy.getElementType()).getWidth() != 1)
     return emitOpError("mask element type must be i1, got '")
            << maskTy.getElementType() << "'";
+  auto resultTy = llvm::cast<TsType>(getResult().getType());
+  if (resultTy.getElementType() != accTy.getElementType())
+    return emitOpError("result element type '")
+           << resultTy.getElementType()
+           << "' must match accumulator element type '"
+           << accTy.getElementType() << "'";
+  if (resultTy.getMaxLookback() != 1)
+    return emitOpError("result maxLookback must be 1, got ")
+           << resultTy.getMaxLookback();
   return success();
 }
 
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index b4ed53c..339d35d 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -657,7 +657,8 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     // (via getScalar → kungpu.ts.get @ offset 0) resolve to the slot.
     if (auto acc = dyn_cast<kunir::AccumulatorOp>(op)) {
       auto ka = kungpu::AccumulatorOp::create(
-          b, ol, acc.getResult().getType(), acc.getNameAttr());
+          b, ol, acc.getResult().getType(), acc.getNameAttr(),
+          acc.getInitValAttr());
       outer.tsMap[acc.getResult()] = ka.getResult();
       return success();
     }
@@ -665,6 +666,8 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
     // kunir.set_accumulator → scf.if (mask) { kungpu.ts.put %acc, %value }
     // inside the outer time loop.  mask and value are loaded at offset 0
     // (current time step) via the standard scalarMap-cached getScalar.
+    // The op's SSA result = `mask ? value : prev_slot` — emitted as an
+    // arith.select and stashed in scalarMap for downstream consumers.
     if (auto sa = dyn_cast<kunir::SetAccumulatorOp>(op)) {
       auto accIt = outer.tsMap.find(sa.getAcc());
       if (accIt == outer.tsMap.end())
@@ -674,10 +677,15 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
                          outer.getScalar(sa.getMask(),  fb, ol));
       KUN_ASSIGN_OR_FAIL(Value valueScalar,
                          outer.getScalar(sa.getValue(), fb, ol));
+      KUN_ASSIGN_OR_FAIL(Value prevScalar,
+                         outer.getScalar(sa.getAcc(),   fb, ol));
+      Value newScalar = arith::SelectOp::create(
+          fb, ol, maskScalar, valueScalar, prevScalar);
       auto ifOp = scf::IfOp::create(fb, ol, /*resultTypes=*/TypeRange{},
                                          maskScalar, /*withElseRegion=*/false);
       OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
       TsPutOp::create(ib, ol, accIt->second, valueScalar);
+      outer.scalarMap[sa.getResult()] = newScalar;
       return success();
     }
 
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 4391919..8965d22 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -176,12 +176,14 @@ class IRBuilder {
   }
 
   // ── Accumulator / SetAccumulator ───────────────────────────────
-  Value accumulatorOp(std::string name, Type tsTy) {
+  Value accumulatorOp(std::string name, Type tsTy, double initVal) {
     return kunir::AccumulatorOp::create(b_, b_.getUnknownLoc(), tsTy,
-                                            b_.getStringAttr(name));
+                                            b_.getStringAttr(name),
+                                            b_.getF64FloatAttr(initVal));
   }
-  void setAccumulatorOp(Value acc, Value mask, Value value) {
-    kunir::SetAccumulatorOp::create(b_, b_.getUnknownLoc(), acc, mask, value);
+  Value setAccumulatorOp(Value acc, Value mask, Value value) {
+    return kunir::SetAccumulatorOp::create(
+        b_, b_.getUnknownLoc(), acc.getType(), acc, mask, value);
   }
 
   // ── Windowed buffer materialization ───────────────────────────────
@@ -388,13 +390,15 @@ void registerIRBuilder(nb::module_ &m) {
             "ts<T, 1>).  Pass float('nan') for NaN.")
 
       .def("accumulator", &IRBuilder::accumulatorOp,
-            nb::arg("name"), nb::arg("type"),
+            nb::arg("name"), nb::arg("type"), nb::arg("init_val") = 0.0,
             "Build a kunir.accumulator with the given name and ts<T, 1> "
-            "result type.  Same-name accumulators CSE to a single slot.")
+            "result type.  `init_val` is the initial scalar stored in the "
+            "slot before the first time step (pass float('nan') for NaN).")
       .def("set_accumulator", &IRBuilder::setAccumulatorOp,
             nb::arg("acc"), nb::arg("mask"), nb::arg("value"),
             "Conditionally store `value` into `acc` when `mask` is true. "
-            "Side-effecting; returns no SSA value.")
+            "Side-effecting; returns the slot's new value for the current "
+            "step (`mask ? value : prev_accumulator`).")
 
       .def("select", &IRBuilder::selectOp,
             nb::arg("cond"), nb::arg("true_value"), nb::arg("false_value"))
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index e373528..f4194d7 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -158,8 +158,7 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
     "test_corrwith",
     "test_aggregrate",
     "test_runtime",
-    "test_ema",                # ExpMovingAvg not in CodegenMLIR
-    "test_ema_init",           # same
+    "test_ema_init",           # __init Input not supported yet
     "test_aligned",            # CPU-only shape-error check
     "test_quantile",           # SkipList
     "test_stream_double",
@@ -185,6 +184,7 @@ def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
     "test_large_rank",      # TsRank/TsArgMin/Max via naive FBW (no_skip_list)
     "test_argmin",          # TsArgMin/TsRank/WindowedMin small-window
     "test_max_drawdown",    # WindowedMaxDrawdown (uses WindowLoopIndex)
+    "test_ema",             # ExpMovingAvg (expanded by experimental_expand)
 }
 
 
@@ -504,8 +504,8 @@ def test_ema(lib):
     assert(modu)
     inp = np.random.rand(20, 24).astype("float32")
     inp[5,:] = np.nan
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 20)
     output = out["ou2"]
     df = pd.DataFrame(inp)
     expected = RefExpMovingAvg(df)

From 55f9c7a9bec6669bc85615ce8ade41baa14a7384 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 18 May 2026 23:23:48 -0700
Subject: [PATCH 37/59] alpha158. Fix Output(Temp(...))

---
 KunQuant/jit/cuda.py                  |  30 ++---
 KunQuant/passes/CodegenMLIR.py        |   8 ++
 KunQuant/passes/ExperimentalExpand.py |   5 +-
 KunQuant/passes/TempWindowElim.py     |  17 +++
 cpp/Kun/Ops.hpp                       |   7 +-
 mlir/include/KunCuda/Runtime.h        |  11 ++
 mlir/include/KunIr/KunIrOps.td        |  19 +++
 mlir/lib/KunIr/KunIrOps.cpp           |  17 +++
 mlir/lib/KunIr/KunIrToKunGpu.cpp      |  64 +++++++++-
 mlir/lib/Python/IRBuilder.cpp         |  16 +++
 mlir/lib/Python/MlirBinding.cpp       |  17 ++-
 tests/test.py                         |  47 +++++++-
 tests/test_alpha158.py                | 164 +++++++++++++++++++++-----
 13 files changed, 368 insertions(+), 54 deletions(-)

diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 8253b18..932618a 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -207,18 +207,16 @@ def _graph_io_names(f: Function):
 
 
 def _run_full_pipeline(f: Function, kcfg: KunCompilerConfig):
-    """Same pass pipeline the CPU `compileit` runs:
-
-        optimize  →  do_partition  →  post_optimize
-
-    Returns the list of post-partition Functions that the translator
-    should walk (one kunir.func per Function).  Mutates `f` in place.
+    """Run optimize → do_partition → post_optimize, mutating `f`.
+    Returns `(impl, global_unreliable)`: the post-partition Functions
+    and a pre-partition `infer_window` snapshot keyed by Output name.
     """
     options = _gpu_pass_options(kcfg)
     optimize(f, options)
+    global_unreliable = infer_window(f, options)
     _mainf, impl = do_partition(f, kcfg.partition_factor, options)
     post_optimize(impl, options)
-    return impl
+    return impl, global_unreliable
 
 
 def _translate_partitions(impl, kcfg: KunCompilerConfig,
@@ -246,15 +244,10 @@ def _translate_partitions(impl, kcfg: KunCompilerConfig,
     dtype = _to_dtype_token(kcfg.dtype)
     externals = []
     for sub in impl:
-        # Per-partition warmup: max windowed-chain depth from any input
-        # to any output of THIS partition.  Earlier partitions have already
-        # written their (post-warmup) values into the shared device buffers
-        # by the time this kernel runs, so we don't accumulate their
-        # unreliable counts here.  infer_window walks back to Input ops
-        # of the partition; cross-partition deps stop at those Inputs.
-        # If any op in this partition requires the whole time history,
-        # override the inferred warmup with the sentinel so the runtime
-        # collapses this kernel to a single chunk.
+        # Per-kernel warmup is partition-local: the runtime serialises
+        # kernel launches so an upstream kernel's reliable writes are
+        # already in place by the time a downstream kernel reads.  Each
+        # kernel's chunk grid only needs to cover its own local warmup.
         if any(isinstance(op, MayRequireWholeTime)
                 and op.is_whole_time_required()
                 for op in sub.ops):
@@ -288,7 +281,7 @@ def compile_func(f: Function, kcfg: KunCompilerConfig,
     toolkit_path = find_cuda_toolkit(ccfg.toolkit_path)
 
     graph_inputs, graph_outputs = _graph_io_names(f)
-    impl = _run_full_pipeline(f, kcfg)
+    impl, global_unreliable = _run_full_pipeline(f, kcfg)
     mod, externals = _translate_partitions(impl, kcfg, ccfg)
 
     return KunMLIR.compile(
@@ -305,6 +298,7 @@ def compile_func(f: Function, kcfg: KunCompilerConfig,
         # default to 1 — but the cs_rank launch uses it to size
         # blockDim, so feed the config value through.
         warps_per_cta=ccfg.warps_per_cta,
+        output_unreliable=global_unreliable,
     )
 
 
@@ -369,6 +363,6 @@ def to_mlir(f: Function, kcfg: KunCompilerConfig,
     as `compile_func`)."""
     _validate_kun_cfg(kcfg)
     _graph_io_names(f)              # raises if no Input / Output ops
-    impl = _run_full_pipeline(f, kcfg)
+    impl, _global_unreliable = _run_full_pipeline(f, kcfg)
     mod, _externals = _translate_partitions(impl, kcfg, ccfg)
     return mod
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 0d67716..daf420b 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -29,6 +29,7 @@
 from KunQuant.Op import (
     OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
     WindowLoopIndex, ReductionOp, SimpleCrossSectionalOp, ConstantOp,
+    WindowedTrait,
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
@@ -341,6 +342,13 @@ def translate_function(f: Function, target: TargetSpec,
         if isinstance(op, Input):
             continue                      # already mapped from func_args
         if isinstance(op, Output):
+            # An Output may also be read as a windowed source within the
+            # same partition; emit a kunir.output_ref so downstream sees
+            # its gmem buffer as a ts handle.
+            if any(isinstance(u, WindowedTrait)
+                    for u in f.op_to_id[op].uses):
+                val_map[op] = ir.output_ref(op.attrs["name"],
+                                              val_map[op.inputs[0]])
             continue                      # handled at the end via Return
         if isinstance(op, ForeachBackWindow):
             _emit_loop(op, ir, val_map, ts_1,
diff --git a/KunQuant/passes/ExperimentalExpand.py b/KunQuant/passes/ExperimentalExpand.py
index 782318f..1874d32 100644
--- a/KunQuant/passes/ExperimentalExpand.py
+++ b/KunQuant/passes/ExperimentalExpand.py
@@ -110,9 +110,10 @@ def _expand_linreg(op: WindowedLinearRegression) -> List[OpBase]:
     v      = op.inputs[0]
 
     # sum_y = rolling sum of v over the window; NaN until window full.
-    sum_y  = FastWindowedSum(v, window)
+    # FastWindowedSum requires a WindowedDataSourceOp input sized window+1.
+    sum_y  = FastWindowedSum(WindowedTempOutput(v, window + 1), window)
     # sum_yy = rolling sum of v² — same pattern over a v*v intermediate.
-    sum_yy = FastWindowedSum(v * v, window)
+    sum_yy = FastWindowedSum(WindowedTempOutput(v * v, window + 1), window)
     # sum_xy = Σ idx * v where idx is the window position (0=oldest,
     # window-1=newest).  Express via FBW + WindowLoopIndex + Mul +
     # ReduceAdd; OOB reads (warmup) return NaN, so sum_xy is NaN until
diff --git a/KunQuant/passes/TempWindowElim.py b/KunQuant/passes/TempWindowElim.py
index 7646e21..324fc93 100644
--- a/KunQuant/passes/TempWindowElim.py
+++ b/KunQuant/passes/TempWindowElim.py
@@ -37,7 +37,24 @@ def for_each_op(op: OpBase, f: Function, replace_map: dict) -> Tuple[OpBase, OpB
         return (None, traverse_replace_map(max_window_op, replace_map))
     return (op, None)
 
+def _unwrap_output_wto(ops: List[OpBase], f: Function) -> bool:
+    """Rewrite Output(WindowedTempOutput(x)) → Output(x)."""
+    changed = False
+    for op in ops:
+        if not isinstance(op, Output):
+            continue
+        src = op.inputs[0]
+        if not isinstance(src, WindowedTempOutput):
+            continue
+        while isinstance(src, WindowedTempOutput):
+            src = src.inputs[0]
+        f.op_to_id[src].uses[op] = 1
+        op.inputs[0] = src
+        changed = True
+    return changed
+
 def temp_window_elim_impl(ops: List[OpBase], f: Function) -> List[OpBase]:
+    _unwrap_output_wto(ops, f)
     replace_map = dict()
     out = []
     changed = False
diff --git a/cpp/Kun/Ops.hpp b/cpp/Kun/Ops.hpp
index 457a2da..ac802cb 100644
--- a/cpp/Kun/Ops.hpp
+++ b/cpp/Kun/Ops.hpp
@@ -301,7 +301,12 @@ template <typename T, int stride>
 struct Accumulator {
     using simd_t = kun_simd::vec<T, stride>;
     using float_mask_t = typename simd_t::Masktype;
-    simd_t v = 0;
+    simd_t v;
+    // Default-init to 0 for backward compat with existing Accumulator()
+    // call sites; the codegen emits Accumulator{init_val} for non-zero
+    // inits, and that brace-init binds to the Accumulator(T) overload.
+    Accumulator() : v(0) {}
+    Accumulator(T init) : v(init) {}
     struct Value {
         simd_t v;
         Accumulator& acc;
diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index 8f55628..eb8eee7 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -35,6 +35,7 @@
 #pragma once
 
 #include <cstdint>
+#include <map>
 #include <memory>
 #include <string>
 #include <utility>
@@ -124,6 +125,13 @@ struct ExecutableData {
   std::vector<KernelMeta> kernels;  ///< unordered set; runtime topo-sorts
   std::vector<std::string> graphInputs;
   std::vector<std::string> graphOutputs;
+  /// Per-graph-output warmup depth over the full dependency chain
+  /// (across partitions).  Exposed via `Executable::outputUnreliable()`
+  /// (Python: `getOutputUnreliableCount`) so callers know how many
+  /// leading time steps of each Output buffer to skip.  Populated by
+  /// the Python frontend from its pre-partition `infer_window`
+  /// snapshot; empty when not supplied.
+  std::map<std::string, int64_t> outputUnreliable;
 };
 
 //===----------------------------------------------------------------------===//
@@ -168,6 +176,9 @@ class Executable {
   int64_t vectorSize()  const noexcept { return data_.vectorSize; }
   Datatype dtype()      const noexcept { return data_.dtype; }
   size_t  numKernels()  const noexcept { return data_.kernels.size(); }
+  const std::map<std::string, int64_t> &outputUnreliable() const noexcept {
+    return data_.outputUnreliable;
+  }
 
   // ── Accessors (runtime-resolved plan) ─────────────────────────────
   // Defined out-of-line so the header doesn't need GraphPlan's layout.
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 44ea165..3a57405 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -250,6 +250,25 @@ def KunIr_SetAccumulatorOp : KunIr_Op<"set_accumulator"> {
 // with a specific maxLookback.  The element types must match.
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// OutputRef op
+//
+// Expose a graph-output buffer as a ts handle so the same kernel can
+// both write to it and read trailing values from it.  `$value` is the
+// scalar to write at the current step.  Lowering hoists the write to
+// the op site so reads at offset 0 see the just-written scalar.  At
+// most one output_ref per name.
+//===----------------------------------------------------------------------===//
+
+def KunIr_OutputRefOp : KunIr_Op<"output_ref", [Pure]> {
+  let summary = "ts handle to a graph-output buffer";
+  let arguments = (ins StrAttr:$name, KunIr_AnyTs:$value);
+  let results   = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$name `,` $value `:` type($value) `->` type($result) attr-dict";
+}
+
 def KunIr_WindowedOutputOp : KunIr_Op<"windowed_output", [Pure]> {
   let summary = "Store a segment of a time-series stream with fixed lookback";
   let description = [{
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 9bc7c8a..a713176 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -128,6 +128,23 @@ LogicalResult SelectOp::inferReturnTypes(
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// OutputRefOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult OutputRefOp::verify() {
+  auto valTy = llvm::cast<TsType>(getValue().getType());
+  auto resTy = llvm::cast<TsType>(getResult().getType());
+  if (valTy.getElementType() != resTy.getElementType())
+    return emitOpError("result element type '")
+           << resTy.getElementType()
+           << "' must match value element type '"
+           << valTy.getElementType() << "'";
+  if (getName().empty())
+    return emitOpError("output name must be non-empty");
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // WindowedOutputOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index 339d35d..f0e9d4f 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -30,6 +30,9 @@
 #include "mlir/Pass/Pass.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+
+#include <limits>
 
 using namespace mlir;
 using namespace kunir;
@@ -384,10 +387,16 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
     if (isa<TsType>(ty)) tsRetIdx.push_back(i);
 
+  // Output buffer args are dense streams.  Type them as `ts<T, inf>` so
+  // a kunir.output_ref can expose them as windowed read sources without
+  // tripping windowed-input verifiers.
   SmallVector<Value> outParams;
   for (unsigned i : tsRetIdx) {
-    outParams.push_back(entry.addArgument(oldFT.getResult(i), loc));
-    newArgTys.push_back(oldFT.getResult(i));
+    auto origTy = llvm::cast<TsType>(oldFT.getResult(i));
+    auto infTy  = TsType::get(ctx, origTy.getElementType(),
+                                 std::numeric_limits<uint64_t>::max());
+    outParams.push_back(entry.addArgument(infTy, loc));
+    newArgTys.push_back(infTy);
   }
   SmallVector<Type> newRetTys;
   for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
@@ -395,6 +404,17 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
   funcOp.setFunctionTypeAttr(
       TypeAttr::get(FunctionType::get(ctx, newArgTys, newRetTys)));
 
+  // For each ts output: false = pending normal return-time write,
+  // true = already written by an output_ref (skip at return time).
+  // A missing entry means the name isn't a ts output of this func.
+  llvm::StringMap<bool> outNameToIsTakenOver;
+  if (auto outNamesAttr = funcOp.getOutputNames()) {
+    for (unsigned i : tsRetIdx) {
+      auto name = llvm::cast<StringAttr>(outNamesAttr[i]).getValue();
+      outNameToIsTakenOver[name] = false;
+    }
+  }
+
 
   // ------------------------------------------------------------------
   // 2. Snapshot original ops and find the original return.
@@ -689,6 +709,36 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       return success();
     }
 
+    // output_ref → hoist the write here and register the output buffer
+    // as a ts handle so downstream reads use the same buffer.
+    if (auto oref = dyn_cast<kunir::OutputRefOp>(op)) {
+      auto name = oref.getName();
+      auto it = outNameToIsTakenOver.find(name);
+      if (it == outNameToIsTakenOver.end())
+        return op.emitError("kunir-to-kungpu: output_ref references "
+                            "unknown output '") << name << "'";
+      if (it->second)
+        return op.emitError("kunir-to-kungpu: duplicate output_ref for "
+                            "output '") << name << "'";
+      it->second = true;
+      // Resolve the matching gpu.func output arg: outParams has one
+      // entry per ts output in `tsRetIdx` order, while the output-names
+      // attr is indexed by the original result index `tsRetIdx[k]`.
+      auto outNamesAttr = funcOp.getOutputNames();
+      Value buf;
+      for (auto [k, i] : llvm::enumerate(tsRetIdx)) {
+        if (llvm::cast<StringAttr>(outNamesAttr[i]).getValue() == name) {
+          buf = outParams[k];
+          break;
+        }
+      }
+      KUN_ASSIGN_OR_FAIL(Value valueScalar,
+                         outer.getScalar(oref.getValue(), fb, ol));
+      TsPutOp::create(fb, ol, buf, valueScalar);
+      outer.tsMap[oref.getResult()] = buf;
+      return success();
+    }
+
     // fast_windowed_sum → preserved as a kunir op with scalar result and
     // ts-handle input.  The kungpu-to-llvm pass owns the actual lowering
     // (per-thread state allocas + the Kahan-corrected step).
@@ -713,8 +763,16 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
 
   // ------------------------------------------------------------------
   // 6. Emit ts.put for each ts return value, then close the outer for.
+  //    Outputs already written by an output_ref are skipped.
   // ------------------------------------------------------------------
-  for (auto [outParam, rv] : llvm::zip(outParams, tsRetVals)) {
+  auto outNamesAttr = funcOp.getOutputNames();
+  for (auto [k, pair] : llvm::enumerate(llvm::zip(outParams, tsRetVals))) {
+    auto [outParam, rv] = pair;
+    if (outNamesAttr) {
+      auto name = llvm::cast<StringAttr>(outNamesAttr[tsRetIdx[k]]).getValue();
+      auto it = outNameToIsTakenOver.find(name);
+      if (it != outNameToIsTakenOver.end() && it->second) continue;
+    }
     auto it = outer.scalarMap.find(rv);
     assert(it != outer.scalarMap.end() &&
            "ts return value not materialised as a scalar");
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
index 8965d22..dcd4651 100644
--- a/mlir/lib/Python/IRBuilder.cpp
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -3,6 +3,8 @@
 #include "IRBuilder.h"
 #include "PyModule.h"
 
+#include <limits>
+
 #include <nanobind/stl/string.h>
 #include <nanobind/stl/unique_ptr.h>
 #include <nanobind/stl/vector.h>
@@ -186,6 +188,15 @@ class IRBuilder {
         b_, b_.getUnknownLoc(), acc.getType(), acc, mask, value);
   }
 
+  // ── Graph-output buffer as ts handle ─────────────────────────────
+  Value outputRefOp(std::string name, Value value) {
+    auto vTs = llvm::cast<kunir::TsType>(value.getType());
+    auto resTy = kunir::TsType::get(pm_->ctx.get(), vTs.getElementType(),
+                                       std::numeric_limits<uint64_t>::max());
+    return kunir::OutputRefOp::create(b_, b_.getUnknownLoc(), resTy,
+                                          b_.getStringAttr(name), value);
+  }
+
   // ── Windowed buffer materialization ───────────────────────────────
   Value windowedOutputOp(Value x, int64_t length) {
     auto inTs = llvm::cast<kunir::TsType>(x.getType());
@@ -407,6 +418,11 @@ void registerIRBuilder(nb::module_ &m) {
       .def("windowed_output", &IRBuilder::windowedOutputOp,
             nb::arg("x"), nb::arg("length"))
 
+      .def("output_ref", &IRBuilder::outputRefOp,
+            nb::arg("name"), nb::arg("value"),
+            "ts handle to a graph-output buffer.  Downstream reads use "
+            "the same buffer the kernel writes `value` into.")
+
       // Back-reference + Fast windowed sum
       .def("back_ref",          &IRBuilder::backRefOp,
             nb::arg("x"), nb::arg("window"))
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 16e37bb..e1b8067 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -16,6 +16,7 @@
 
 #include <nanobind/nanobind.h>
 #include <nanobind/ndarray.h>
+#include <nanobind/stl/map.h>
 #include <nanobind/stl/string.h>
 #include <nanobind/stl/vector.h>
 #include <nanobind/stl/unique_ptr.h>
@@ -393,7 +394,8 @@ pyCompile(PyModule &pm,
             const std::string &targetFeatures, unsigned optLevel,
             const std::string &toolkitPath,
             nb::list externalKernels,
-            int warpsPerCta) {
+            int warpsPerCta,
+            nb::dict outputUnreliable) {
   if (graphInputs.empty())
     throw std::runtime_error(
         "KunMLIR.compile: graph_inputs cannot be empty");
@@ -443,6 +445,11 @@ pyCompile(PyModule &pm,
   // handing off to Executable's ctor (which validates + plans).
   data.graphInputs  = graphInputs;
   data.graphOutputs = graphOutputs;
+  for (auto item : outputUnreliable) {
+    auto name = nb::cast<std::string>(item.first);
+    auto val  = nb::cast<int64_t>(item.second);
+    data.outputUnreliable[name] = val;
+  }
   return std::make_unique<kun_cuda::Executable>(std::move(data));
 }
 
@@ -510,7 +517,12 @@ NB_MODULE(KunMLIR, m) {
             [](const kun_cuda::Executable &e) {
               const auto &b = e.data().cubin;
               return nb::bytes(b.data(), b.size());
-            });
+            })
+      .def("getOutputUnreliableCount",
+            &kun_cuda::Executable::outputUnreliable,
+            nb::rv_policy::reference_internal,
+            "Return {output_name: unreliable_count} — leading time steps "
+            "of each graph output to drop.");
 
   // ── Executor ────────────────────────────────────────────────────────
   // Mirrors the CPU `kun::Executor` shape: an opaque object that wraps a
@@ -645,6 +657,7 @@ NB_MODULE(KunMLIR, m) {
          nb::arg("toolkit_path")   = "",
          nb::arg("external_kernels") = nb::list(),
          nb::arg("warps_per_cta")    = 0,
+         nb::arg("output_unreliable") = nb::dict(),
          "Compile a kunir module all the way to a loaded Executable.\n"
          "\n"
          "Pipeline: kunir → LLVM dialect → upstream `gpu-module-to-binary`\n"
diff --git a/tests/test.py b/tests/test.py
index f545f47..2dc4480 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -166,7 +166,52 @@ def check_tempwindow_elim():
 v5 = ForeachBackWindow@{window:10}(v2)
 v6 = ReduceAdd@(v5)
 v7 = Output@{name:}(v4)
-v8 = Output@{name:}(v6)''')        
+v8 = Output@{name:}(v6)''')
+
+    # case 4, Output wraps a WindowedTempOutput directly.  The pre-pass
+    # peels it off; the resulting Output(Mul) then triggers the existing
+    # WindowedTempOutput → Output fold, leaving the loop reading the
+    # Output as a windowed source.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 10)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 10))
+        Output(wto, "xport")
+        Output(v1, "reduced")
+    f = Function(builder.ops)
+    temp_window_elim(f)
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = Output@{name:xport}(v1)
+v3 = ForeachBackWindow@{window:10}(v2)
+v4 = ReduceAdd@(v3)
+v5 = Output@{name:reduced}(v4)''')
+
+    # case 5, a WindowedTempOutput shared by both an Output and multiple
+    # windowed consumers in the same function.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 31)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 30))
+        v2 = ReduceAdd(ForeachBackWindow(wto, 20))
+        Output(wto, "xport")
+        Output(v1, "r30")
+        Output(v2, "r20")
+    f = Function(builder.ops)
+    temp_window_elim(f)
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = Output@{name:xport}(v1)
+v3 = ForeachBackWindow@{window:30}(v2)
+v4 = ReduceAdd@(v3)
+v5 = ForeachBackWindow@{window:20}(v2)
+v6 = ReduceAdd@(v5)
+v7 = Output@{name:r30}(v4)
+v8 = Output@{name:r20}(v6)''')
 
 def check_window():
     # case 1, temp window on input
diff --git a/tests/test_alpha158.py b/tests/test_alpha158.py
index 943df2a..f5715f4 100644
--- a/tests/test_alpha158.py
+++ b/tests/test_alpha158.py
@@ -14,7 +14,36 @@
 
 isx86 = cpu_arch != "aarch64"
 
-def check_alpha158(avx512, keep, tempdir):
+
+# Factor families the GPU backend can't compile yet (the underlying op
+# has no kunir lowering).  We filter their Output ops out of the Function
+# before compileit on the GPU path — the rest of alpha158 compiles fine.
+#   QTLU / QTLD → WindowedQuantile → SkipList (CPU-only).
+#   CORR / CORD → SetInfOrNanToValue wrapper not in CodegenMLIR's dispatch.
+_GPU_SKIP_FACTOR_PREFIXES = ("QTLU", "QTLD", "CORR", "CORD")
+
+
+def _filter_outputs_for_gpu(f: Function) -> None:
+    """Remove Output ops whose name starts with a `_GPU_SKIP_FACTOR_PREFIXES`
+    entry.  Mutates `f` in place via `set_ops`.  The dropped intermediate
+    compute ops are GC'd as part of the downstream optimization pipeline
+    (anything with no remaining user is dead)."""
+    kept = []
+    dropped = []
+    for op in f.ops:
+        if isinstance(op, Output):
+            name = op.attrs.get("name", "")
+            if any(name.startswith(p) for p in _GPU_SKIP_FACTOR_PREFIXES):
+                dropped.append(name)
+                continue
+        kept.append(op)
+    if dropped:
+        print(f"[gpu] dropping {len(dropped)} unsupported outputs: "
+              f"{sorted(set(n.rstrip('0123456789') for n in dropped))}")
+    f.set_ops(kept)
+
+
+def check_alpha158(avx512, keep, tempdir, gpu_arch=""):
     builder = Builder()
     with builder:
         pack_158 = AllData(low=Input("low"), high=Input("high"), close=Input(
@@ -38,6 +67,17 @@ def check_alpha158(avx512, keep, tempdir):
             Output(v, k)
     print("Total names: ", len(names))
     f = Function(builder.ops)
+    if gpu_arch:
+        _filter_outputs_for_gpu(f)
+        from KunQuant.jit import cuda as _cuda_jit
+        target = [("alpha158", f, KunCompilerConfig(
+            dtype='double', blocking_len=1, partition_factor=2,
+            output_layout="TS", input_layout="TS",
+            options={"opt_reduce": True, "fast_log": True,
+                     'no_fast_stat': 'no_warn'}))]
+        ccfg = _cuda_jit.CudaCompilerConfig(gpu_arch=gpu_arch)
+        return _cuda_jit.compileit(target, "testalpha158", ccfg)
+
     if avx512:
         simd_len = 8
     elif isx86:
@@ -45,7 +85,8 @@ def check_alpha158(avx512, keep, tempdir):
     else:
         simd_len = 2
     target = [("alpha158", f, KunCompilerConfig(dtype='double', blocking_len=simd_len, partition_factor=4,
-               output_layout="TS", input_layout="TS", options={"opt_reduce": True, "fast_log": True}))]
+               output_layout="TS", input_layout="TS", options={"opt_reduce": True, "fast_log": True,
+                                                                'no_fast_stat': 'no_warn'}))]
     if avx512:
         machine = cfake.X64CPUFlags(avx512=True, avx512dq=True, avx512vl=True)
     else:
@@ -65,28 +106,68 @@ def ST_TS(data: np.ndarray) -> np.ndarray:
     return np.ascontiguousarray(data.transpose()).astype('float64')
 
 
-def test(lib: kr.Library, inputs: Dict[str, np.ndarray], ref: Dict[str, np.ndarray]):
-    rtol = 1e-4
-    atol = 1e-5
-    modu = lib.getModule("alpha158")
-    start_window = modu.getOutputUnreliableCount()
-    num_stock = 8
-    num_time = 260
-    outnames = modu.getOutputNames()
-    print("Total num alphas", len(outnames))
-    executor = kr.createMultiThreadExecutor(8)
-    my_input = {"high": ST_TS(inputs['dhigh']), "low": ST_TS(inputs['dlow']), "close": ST_TS(inputs['dclose']),
-                "open": ST_TS(inputs['dopen']), "volume": ST_TS(inputs['dvol']), "amount": ST_TS(inputs['damount'])}
-    outbuffers = dict()
-    # Factors, Time, Stock
-    sharedbuf = np.empty((len(outnames), num_time, num_stock), dtype="float64")
-    sharedbuf[:] = np.nan
-    for idx, name in enumerate(outnames):
-        outbuffers[name] = sharedbuf[idx]
-    start = time.time()
-    out = kr.runGraph(executor, modu, my_input, 0, num_time, outbuffers)
-    end = time.time()
-    print(f"Exec takes: {end-start:.6f} seconds")
+# ── Backend shims ───────────────────────────────────────────────────
+#
+# CPU and GPU have the same conceptual flow (prepare → execute → fetch),
+# they differ only in the runtime calls and where the buffers live.
+# Wrap each backend in a tiny object exposing the three methods so the
+# `test()` body stays single-source.
+
+class _CpuBackend:
+    def __init__(self, lib: kr.Library, modname: str):
+        self.modu = lib.getModule(modname)
+        self.start_window = self.modu.getOutputUnreliableCount()
+        self.outnames = self.modu.getOutputNames()
+        self.executor = kr.createMultiThreadExecutor(8)
+        # Pre-allocated NaN-filled output buffers; the CPU runtime writes
+        # in place and we hand the same dict back to `_compare`.
+        self._outbuffers = {}
+        sharedbuf = np.empty((len(self.outnames), num_time, num_stock),
+                              dtype="float64")
+        sharedbuf[:] = np.nan
+        for idx, name in enumerate(self.outnames):
+            self._outbuffers[name] = sharedbuf[idx]
+
+    def prepare_input(self, host_input):
+        return host_input
+
+    def execute(self, inputs):
+        kr.runGraph(self.executor, self.modu, inputs, 0, num_time,
+                     self._outbuffers)
+
+    def fetch_output(self):
+        return self._outbuffers
+
+
+class _GpuBackend:
+    def __init__(self, lib, modname: str):
+        import cupy as cp
+        from KunQuant.jit import KunMLIR as _kr_mlir
+        self._cp = cp
+        self.modu = lib.getModule(modname)
+        self.start_window = self.modu.getOutputUnreliableCount()
+        self.outnames = self.modu.output_names
+        self.executor = _kr_mlir.Executor()
+        self._raw = None
+
+    def prepare_input(self, host_input):
+        return {k: self._cp.asarray(v) for k, v in host_input.items()}
+
+    def execute(self, inputs):
+        self._raw = self.executor.runGraph(self.modu, inputs)
+        self.executor.synchronize()
+
+    def fetch_output(self):
+        cp = self._cp
+        out = {}
+        for k in self.outnames:
+            v = self._raw[k]
+            arr = v if isinstance(v, cp.ndarray) else cp.from_dlpack(v)
+            out[k] = cp.asnumpy(arr)
+        return out
+
+
+def _compare(outbuffers, ref, start_window, rtol, atol):
     for k, v in outbuffers.items():
         s = start_window[k]
         if not np.allclose(v[s:], ref[k][s:], rtol=rtol, atol=atol, equal_nan=True):
@@ -102,6 +183,22 @@ def test(lib: kr.Library, inputs: Dict[str, np.ndarray], ref: Dict[str, np.ndarr
                     exit(1)
 
 
+def test(backend, inputs: Dict[str, np.ndarray],
+          ref: Dict[str, np.ndarray]) -> None:
+    rtol = 1e-4
+    atol = 1e-5
+    print("Total num alphas", len(backend.outnames))
+    host_input = {"high": ST_TS(inputs['dhigh']), "low": ST_TS(inputs['dlow']),
+                  "close": ST_TS(inputs['dclose']), "open": ST_TS(inputs['dopen']),
+                  "volume": ST_TS(inputs['dvol']), "amount": ST_TS(inputs['damount'])}
+    be_input = backend.prepare_input(host_input)
+    start = time.time()
+    backend.execute(be_input)
+    end = time.time()
+    print(f"Exec takes: {end-start:.6f} seconds")
+    _compare(backend.fetch_output(), ref, backend.start_window, rtol, atol)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         prog="Run and check alpha158 again pre-computed result")
@@ -110,15 +207,28 @@ def test(lib: kr.Library, inputs: Dict[str, np.ndarray], ref: Dict[str, np.ndarr
     parser.add_argument("--ref", required=True, type=str,
                         help="The path to the reference output npz file")
     parser.add_argument("--action", required=True, type=str,
-                        help="The path to the reference output npz file")
+                        help="One of: compile_avx512, run_avx512, run_native, run_gpu")
+    parser.add_argument("--gpu-arch", default="sm_80", type=str,
+                        help="GPU compute capability for --action=run_gpu (e.g. sm_80)")
     args = parser.parse_args()
     if args.action == "compile_avx512":
         check_alpha158(True, True, "./build")
         exit(0)
     elif args.action == "run_avx512":
         lib = kr.Library.load(os.path.join("./build/testalpha158", "testalpha158.so"))
+        inp, ref = load(args.inputs, args.ref)
+        test(_CpuBackend(lib, "alpha158"), inp, ref)
+    elif args.action == "run_gpu":
+        # Touch the cupy allocator before compileit so the primary CUDA
+        # context exists when KunMLIR.compile inherits it.
+        import cupy as cp
+        cp.cuda.Device(0).use()
+        cp.zeros((1,), dtype=cp.float64)
+        lib = check_alpha158(False, False, None, gpu_arch=args.gpu_arch)
+        inp, ref = load(args.inputs, args.ref)
+        test(_GpuBackend(lib, "alpha158"), inp, ref)
     else:
         lib = check_alpha158(False, False, None)
-    inp, ref = load(args.inputs, args.ref)
-    test(lib, inp, ref)
+        inp, ref = load(args.inputs, args.ref)
+        test(_CpuBackend(lib, "alpha158"), inp, ref)
     print("done")

From b73aa917fe5dce2a4f7570ffe7c0cdd3fb36dcf0 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 19 May 2026 01:42:56 -0700
Subject: [PATCH 38/59] optimize partitioner: avoid Input -> WindowedTempOutput
 -> Output partitioning

---
 KunQuant/passes/ExperimentalExpand.py | 27 ++++++++++++-
 KunQuant/passes/Partitioner.py        | 58 +++++++++++++++++++++++----
 mlir/include/KunIr/KunIrOps.td        |  2 +-
 mlir/lib/KunIr/KunIrOps.cpp           | 25 ++++++++++--
 mlir/lib/KunIr/KunIrToKunGpu.cpp      | 17 ++++++--
 tests/test2.py                        | 42 ++++++++++++++++++-
 tests/test_alpha158.py                |  3 +-
 7 files changed, 153 insertions(+), 21 deletions(-)

diff --git a/KunQuant/passes/ExperimentalExpand.py b/KunQuant/passes/ExperimentalExpand.py
index 1874d32..dac145f 100644
--- a/KunQuant/passes/ExperimentalExpand.py
+++ b/KunQuant/passes/ExperimentalExpand.py
@@ -22,6 +22,9 @@
   Intermediate ops are stashed in ``state[lin_op] : List[OpBase]`` so each
   consumer Impl (``Slope``, ``RSqaure``, ``Resi``) can pick the entries it
   needs and emit its final formula.
+
+* ``SetInfOrNanToValue(a, value)`` → ``Select(isnan(a - a), value, a)``
+  (mirrors the C++ implementation; ``a - a`` is NaN for both NaN and ±Inf).
 """
 
 from typing import Dict, List
@@ -30,7 +33,7 @@
     OpBase, Builder, ConstantOp, ForeachBackWindow, IterValue,
     WindowedTempOutput, WindowLoopIndex,
 )
-from KunQuant.ops.ElewiseOp import Select, Equals, Not
+from KunQuant.ops.ElewiseOp import Select, Equals, Not, SetInfOrNanToValue
 from KunQuant.ops.ReduceOp import ReduceAdd
 from KunQuant.ops.MiscOp import (
     FastWindowedSum, Accumulator, SetAccumulator,
@@ -173,6 +176,19 @@ def _expand_lr_resi(impl: WindowedLinearRegressionResiImpl,
     return state[_LR_V] - pred
 
 
+# ── SetInfOrNanToValue expansion ────────────────────────────────────
+
+def _expand_set_inf_or_nan(op: SetInfOrNanToValue) -> OpBase:
+    # Mirrors the C++ implementation in cpp/Kun/Ops.hpp:
+    # `mask = isnan(a - a); return select(mask, v, a)`.
+    # `a - a` is 0 for finite `a` and NaN for NaN/±Inf (Inf-Inf == NaN),
+    # so isnan-of-diff catches both NaN and Inf in one shot.
+    a = op.inputs[0]
+    diff = a - a
+    mask = Not(Equals(diff, diff))
+    return Select(mask, ConstantOp(op.attrs["value"]), a)
+
+
 # ── Pass driver ─────────────────────────────────────────────────────
 
 def _experimental_expand_impl(
@@ -242,6 +258,15 @@ def _experimental_expand_impl(
             changed = True
             continue
 
+        if isinstance(op, SetInfOrNanToValue):
+            b = Builder(op.get_parent())
+            with b:
+                new_val = _expand_set_inf_or_nan(op)
+            out.extend(b.ops)
+            replace_map[op] = new_val
+            changed = True
+            continue
+
         out.append(op)
 
     return out if changed else None
diff --git a/KunQuant/passes/Partitioner.py b/KunQuant/passes/Partitioner.py
index 7f5c82f..c6534cd 100644
--- a/KunQuant/passes/Partitioner.py
+++ b/KunQuant/passes/Partitioner.py
@@ -1,4 +1,4 @@
-from KunQuant.Op import OpBase, Output, Input, CrossSectionalOp, GraphSourceTrait, ConstantOp, ReductionOp, BoolOpTrait, GlobalStatefulProducerTrait, StateConsumerTrait
+from KunQuant.Op import OpBase, Output, Input, CrossSectionalOp, GraphSourceTrait, ConstantOp, ReductionOp, BoolOpTrait, GlobalStatefulProducerTrait, StateConsumerTrait, WindowedTempOutput
 from KunQuant.ops.MiscOp import ReturnFirstValue
 from KunQuant.Stage import Function, OpInfo
 from KunQuant.ops import GenericPartition
@@ -289,10 +289,34 @@ def add_to_naming_table(v: str) -> str:
                 # input is shared by all ops
                 assert(op not in op_lookup_table)
                 op_lookup_table[op] = p
+    # Map output-name → producer partition.  Tracks the partition that
+    # owns the Output op for each cross-partition / graph name.  Used by
+    # the WTO(Input) peel below to record the real upstream dependency
+    # after dereferencing the WTO wrapper.
+    name_to_output_partition: Dict[str, _Partition] = {}
+    for op, owner in op_lookup_table.items():
+        if isinstance(op, Output):
+            name_to_output_partition[op.attrs["name"]] = owner
     hash_cache: Dict['OpBase', int] = dict()
     for p in partitions:
         name_to_input = dict()
         depending : typing.OrderedDict[_Partition, None] = OrderedDict()
+
+        def get_local_input(out_name: str, prefer: OpBase = None) -> OpBase:
+            """Return p's local `Input(out_name)`, creating it if needed.
+            If `prefer` is given and already lives in `p.ops`, reuse it
+            instead of allocating a new Input."""
+            inop = name_to_input.get(out_name)
+            if inop is not None:
+                return inop
+            if prefer is not None and prefer in p.ops:
+                inop = prefer
+            else:
+                inop = Input(out_name)
+                p.add(None, inop)
+            name_to_input[out_name] = inop
+            return inop
+
         # for each op in partition
         for op in list(p.ops):
             for idx, inp in enumerate(op.inputs):
@@ -300,6 +324,29 @@ def add_to_naming_table(v: str) -> str:
                     # if the partition depends on an op of another partition
                     if inp.get_parent():
                         raise RuntimeError("Bad cross partition op: " + str(inp) + "\ncur op=" + str(op))
+                    # If the input of an op is a WindowedTempOutput wrapping a partition Input, peel it off.
+                    # original: Op(WindowedTempOutput(Input("xxx")))
+                    # peeled: Op(Input("xxx"))
+                    # Note that the WindowedTempOutput should be in another partition,
+                    # which has been processed already in the parent loop `for p in partitions`.
+                    # Input("xxx") should be an input of that partition
+                    orig_inp = inp
+                    while isinstance(inp, WindowedTempOutput) and \
+                            isinstance(inp.inputs[0], Input):
+                        inp = inp.inputs[0]
+                    # if Op(WindowedTempOutput(Input("xxx"))) pattern is found ...
+                    if inp is not orig_inp:
+                        # orig_inp is the WindowedTempOutput
+                        orig_info = f.op_to_id[orig_inp]
+                        if op in orig_info.uses:
+                            del orig_info.uses[op]
+                        # inp is the Input
+                        out_name = inp.attrs["name"]
+                        producer = name_to_output_partition.get(out_name)
+                        if producer is not None and producer != p:
+                            depending[producer] = None
+                        op.inputs[idx] = get_local_input(out_name, prefer=inp)
+                        continue
                     inp_info = f.op_to_id[inp]
                     if isinstance(inp, ConstantOp):
                         if op in inp_info.uses:
@@ -318,6 +365,7 @@ def add_to_naming_table(v: str) -> str:
                             inp_partition = op_lookup_table[inp]
                             inp_partition.add(None, outop)
                             op_lookup_table[outop] = inp_partition
+                            name_to_output_partition[out_name] = inp_partition
                         else:
                             out_name = outop.attrs["name"]
                             inp_partition = op_lookup_table[outop]
@@ -327,13 +375,7 @@ def add_to_naming_table(v: str) -> str:
                         out_name = inp.attrs["name"]
                     if op in inp_info.uses:
                         del inp_info.uses[op]
-                    
-                    inop = name_to_input.get(out_name, None)
-                    if not inop:
-                        inop = Input(out_name)
-                        p.add(None, inop)
-                        name_to_input[out_name] = inop
-                    op.inputs[idx] = inop
+                    op.inputs[idx] = get_local_input(out_name)
         p.depending = depending
         p.stage_op = GenericPartition([], None)
     
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
index 3a57405..7739cde 100644
--- a/mlir/include/KunIr/KunIrOps.td
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -403,7 +403,7 @@ def KunIr_ReduceArgMaxOp : KunIr_Op<"reduce_argmax",
   let assemblyFormat = "$value `:` type($value) attr-dict";
 }
 def KunIr_ReduceRankOp : KunIr_Op<"reduce_rank",
-    [Pure, SameOperandsAndResultType]> {
+    [Pure, AllTypesMatch<["value", "result"]>]> {
   let summary = "Per-window cross-sectional rank of `current` against the "
                   "iterated window values: less_count + (eq_count + 1) / 2.";
   let arguments = (ins KunIr_AnyTs:$value, KunIr_AnyTs:$current);
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index a713176..61b7ce7 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -191,7 +191,14 @@ LogicalResult ReduceMaxOp::verify() { return verifyInsideForEachBackWindow(*this
 LogicalResult ReduceMinOp::verify() { return verifyInsideForEachBackWindow(*this); }
 LogicalResult ReduceArgMinOp::verify() { return verifyInsideForEachBackWindow(*this); }
 LogicalResult ReduceArgMaxOp::verify() { return verifyInsideForEachBackWindow(*this); }
-LogicalResult ReduceRankOp::verify()   { return verifyInsideForEachBackWindow(*this); }
+LogicalResult ReduceRankOp::verify() {
+  if (failed(verifyInsideForEachBackWindow(*this))) return failure();
+  auto vT = llvm::cast<TsType>(getValue().getType());
+  auto cT = llvm::cast<TsType>(getCurrent().getType());
+  if (cT.getElementType() != vT.getElementType())
+    return emitOpError("current element type must match value element type");
+  return success();
+}
 LogicalResult WindowLoopIndexOp::verify() {
   return verifyInsideForEachBackWindow(*this);
 }
@@ -879,9 +886,19 @@ LogicalResult ReturnOp::verify() {
 
   for (auto [i, opType, resType] :
        llvm::enumerate(getOperandTypes(), resultTypes)) {
-    if (opType != resType)
-      return emitOpError("operand #") << i << " type '" << opType
-             << "' does not match function result type '" << resType << "'";
+    if (opType == resType) continue;
+    // ts<T, inf> operand → ts<T, 1> result is allowed when a graph output
+    // passes a ts handle straight through (function arg / output_ref).  The
+    // lowering scalarizes via ts.get @ offset 0 before ts.put.
+    auto opTs  = llvm::dyn_cast<TsType>(opType);
+    auto resTs = llvm::dyn_cast<TsType>(resType);
+    if (opTs && resTs &&
+        opTs.getElementType() == resTs.getElementType() &&
+        opTs.getMaxLookback() == kInfLookback &&
+        resTs.getMaxLookback() == 1)
+      continue;
+    return emitOpError("operand #") << i << " type '" << opType
+           << "' does not match function result type '" << resType << "'";
   }
   return success();
 }
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
index f0e9d4f..e669684 100644
--- a/mlir/lib/KunIr/KunIrToKunGpu.cpp
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -773,10 +773,19 @@ void LowerKunIrToKunGpuPass::runOnOperation() {
       auto it = outNameToIsTakenOver.find(name);
       if (it != outNameToIsTakenOver.end() && it->second) continue;
     }
-    auto it = outer.scalarMap.find(rv);
-    assert(it != outer.scalarMap.end() &&
-           "ts return value not materialised as a scalar");
-    TsPutOp::create(fb, loc, outParam, it->second);
+    // Most return values are scalars produced inside the time loop.
+    // A ts<T, inf> operand (graph-input passthrough) is also accepted;
+    // resolve it to a scalar via ts.get @ 0.
+    Value scalarVal;
+    auto sit = outer.scalarMap.find(rv);
+    if (sit != outer.scalarMap.end()) {
+      scalarVal = sit->second;
+    } else {
+      auto res = outer.getScalar(rv, fb, loc);
+      if (failed(res)) return signalPassFailure();
+      scalarVal = *res;
+    }
+    TsPutOp::create(fb, loc, outParam, scalarVal);
   }
   scf::YieldOp::create(fb, loc);
 
diff --git a/tests/test2.py b/tests/test2.py
index 24816da..599f1b3 100644
--- a/tests/test2.py
+++ b/tests/test2.py
@@ -3,6 +3,7 @@
 from KunQuant.ops import *
 import KunQuant.passes
 from KunQuant.passes import *
+from KunQuant.Driver import post_optimize
 
 def optimize(f: Function):
     decompose(f)
@@ -115,6 +116,45 @@ def test_partition_rank_out():
 v3 = Output@{name:out2}(v2)''']
     check_partition(f, exp1, exp2)
 
+def test_partition_wto_input_peel():
+    # WTO whose underlying value gets pulled cross-partition.  Many
+    # AddConst-Output pairs split the producing partition off from the
+    # FBS consumers; without the WTO(Input) peel in the partitioner, the
+    # FBS-side partition rewires WTO.inputs[0] to a local synthetic
+    # Input and post-partition `temp_window_elim` folds WTO(Input) →
+    # Input, leaving a degenerate `Output(Input)` passthrough.
+    # original IR:
+    # partition 1:
+    #   a = Input("xxx")  # partition temp input
+    #   b = WindowedTempOutput(a)
+    #   c = use(b)
+    # partition 2:
+    #   d = use(b)   # cross partition op
+    # Without peeling, partition 2 imports the WindowedTempOutput as a cross-partition op,
+    # so the WindowedTempOutput gets wired to an Output op of partition 1, which is bad for performance.
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        b = Input("b")
+        x = Mul(a, b)
+        for i in range(8):
+            Output(AddConst(x, float(i)), f"add_{i}")
+        wt = WindowedTempOutput(x, 30)
+        for i in range(5):
+            Output(FastWindowedSum(wt, 5 + i * 4), f"fbs_{i}")
+    f = Function(builder.ops)
+    optimize(f)
+    _, impl = do_partition(f, 1)
+    post_optimize(impl, {})
+    for sub in impl:
+        for op in sub.ops:
+            if isinstance(op, Output) and isinstance(op.inputs[0], Input):
+                raise RuntimeError(
+                    f"partitioner left Output(Input) passthrough in "
+                    f"partition {sub.name!r}: {op}")
+
+
 test_partition1()
 test_partition_cylic()
-test_partition_rank_out()
\ No newline at end of file
+test_partition_rank_out()
+test_partition_wto_input_peel()
\ No newline at end of file
diff --git a/tests/test_alpha158.py b/tests/test_alpha158.py
index f5715f4..0a69470 100644
--- a/tests/test_alpha158.py
+++ b/tests/test_alpha158.py
@@ -19,8 +19,7 @@
 # has no kunir lowering).  We filter their Output ops out of the Function
 # before compileit on the GPU path — the rest of alpha158 compiles fine.
 #   QTLU / QTLD → WindowedQuantile → SkipList (CPU-only).
-#   CORR / CORD → SetInfOrNanToValue wrapper not in CodegenMLIR's dispatch.
-_GPU_SKIP_FACTOR_PREFIXES = ("QTLU", "QTLD", "CORR", "CORD")
+_GPU_SKIP_FACTOR_PREFIXES = ("QTLU", "QTLD")
 
 
 def _filter_outputs_for_gpu(f: Function) -> None:

From 9eef8c1577bb0fb19cac448133d5963b90f9fc34 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 19 May 2026 01:48:00 -0700
Subject: [PATCH 39/59] is_whole_time_required for talib

---
 KunQuant/predefined/talib.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/KunQuant/predefined/talib.py b/KunQuant/predefined/talib.py
index 688ec2b..b10869c 100644
--- a/KunQuant/predefined/talib.py
+++ b/KunQuant/predefined/talib.py
@@ -57,7 +57,7 @@ def decompose(self, options: dict) -> List[OpBase]:
             tr = TRANGE(high, low, close)
 
             mask_true = Equals(ConstantOp(0), ConstantOp(0))
-            cnt_acc = Accumulator(high, f"atr_cnt_{window}")
+            cnt_acc = Accumulator(high, f"atr_cnt_{window}", is_whole_time_required=True)
             prev_cnt = cnt_acc
             new_cnt = prev_cnt + 1
             set_cnt = SetAccumulator(cnt_acc, mask_true, new_cnt)
@@ -102,15 +102,15 @@ def decompose(self, options: dict) -> List[OpBase]:
 
             mask_true = Equals(ConstantOp(0), ConstantOp(0))
 
-            cnt_acc = Accumulator(high, "sar_cnt")
+            cnt_acc = Accumulator(high, "sar_cnt", is_whole_time_required=True)
             prev_cnt = cnt_acc
             set_cnt = SetAccumulator(cnt_acc, mask_true, prev_cnt + 1)
             is_bar_0 = Equals(prev_cnt, ConstantOp(0))
             is_bar_1 = Equals(prev_cnt, ConstantOp(1))
 
-            sar_acc = Accumulator(high, "sar_value")
-            ep_acc = Accumulator(high, "sar_ep")
-            af_acc = Accumulator(high, "sar_af")
+            sar_acc = Accumulator(high, "sar_value", is_whole_time_required=True)
+            ep_acc = Accumulator(high, "sar_ep", is_whole_time_required=True)
+            af_acc = Accumulator(high, "sar_af", is_whole_time_required=True)
 
             prev_sar = sar_acc
             prev_ep = ep_acc

From 50b1f5ed997acb2de5f5c9c5fd7d1cd9da3907b8 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 21 May 2026 02:11:18 -0700
Subject: [PATCH 40/59] alpha101, fix time slice temp_window/output (should not
 use output)

---
 .gitignore                            |   3 +-
 KunQuant/jit/cuda.py                  |   9 +-
 KunQuant/passes/CodegenMLIR.py        |  18 +--
 KunQuant/passes/ExperimentalExpand.py | 125 +++++++++++----------
 KunQuant/passes/TempWindowElim.py     |  30 +++--
 doc/Customize.md                      |   3 +-
 mlir/lib/Python/MlirBinding.cpp       |   5 +
 mlir/test/python/test_kun_to_cuda.py  |  74 +++++++++++++
 tests/KunTestUtil/ref_alpha101.py     |   5 +-
 tests/test.py                         |  73 +++++++++++-
 tests/test_alpha101.py                | 154 ++++++++++++++++++++++----
 11 files changed, 398 insertions(+), 101 deletions(-)

diff --git a/.gitignore b/.gitignore
index 06b9740..e0dcd8f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ KunQuant.egg-info/*
 dist/*
 *.pyd
 *.dll
-*.lib
\ No newline at end of file
+*.lib
+.codex
\ No newline at end of file
diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index 932618a..eda0d05 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -147,17 +147,22 @@ def _gpu_pass_options(kcfg: KunCompilerConfig) -> dict:
 
     `blocking_len` is needed by some decompose paths (it's also the
     skip-list / naive cost-model knob).  `kcfg.options` flows through
-    verbatim — including `no_fast_stat`, `opt_reduce`, `fast_log`,
-    all of which the GPU lowering now supports.
+    first — including `no_fast_stat`, `opt_reduce`, `fast_log`, all of
+    which the GPU lowering now supports.
 
     `no_skip_list=True` is forced unconditionally and overrides any
     user-provided value: the kunir codegen has no lowering for
     `SkipList*` ops, so the naive `ForeachBackWindow + Reduce*` path
     is the only one that lowers on GPU.
+
+    `may_slice_time=True` is the safe GPU default because the runtime can
+    split a single graph launch into multiple time chunks.  Users who
+    guarantee single-chunk launches may explicitly set it to False.
     """
     opts: dict = {"blocking_len": _resolve_vector_size(kcfg)}
     if kcfg.options:
         opts.update(kcfg.options)
+    opts.setdefault("may_slice_time", True)
     opts["no_skip_list"] = True
     # Pipeline lowering doesn't know about ExpMovingAvg or the
     # WindowedLinearRegression* family — turn on the Accumulator-based
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index daf420b..e710e08 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -29,11 +29,12 @@
 from KunQuant.Op import (
     OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
     WindowLoopIndex, ReductionOp, SimpleCrossSectionalOp, ConstantOp,
-    WindowedTrait,
+    WindowedTrait, Rank,
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
     AddConst, SubConst, MulConst, DivConst,
+    GreaterThanConst, LessThanConst,
     GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
     And, Or, Not, Select,
 )
@@ -63,6 +64,7 @@
 # means `v - x`, where for plain SubConst it would mean `x - v`).
 _BINARY_CONST = {
     AddConst: "add", SubConst: "sub", MulConst: "mul", DivConst: "div",
+    GreaterThanConst: "gt", LessThanConst: "lt",
 }
 _UNARY = {
     Abs: "abs", Log: "log", Exp: "exp", Sqrt: "sqrt", Sign: "sign",
@@ -247,20 +249,20 @@ def _maybe_external_partition(f: Function, dtype: str) -> Optional[dict]:
 
     Detection mirrors CodegenCpp's "simple cross-sectional fast path"
     (CodegenCpp.codegen_cpp's `len(f.ops) == 3` check): a partition
-    whose only compute op is a `SimpleCrossSectionalOp` (Rank, Scale,
-    …).  The partitioner places every CrossSectionalOp into its own
+    whose only compute op is a supported `SimpleCrossSectionalOp`
+    (currently Rank).  The partitioner places every CrossSectionalOp into its own
     partition without other compute, so this shape is what we get.
 
-    The `kind` string is `cs_<lowercased class name>_f{32,64}`, e.g.
-    `cs_rank_f32`, `cs_scale_f64`.  The C++ binding maps it to a
-    `KernelKind` enum; unknown kinds raise there with a clear error,
-    so adding a new SimpleCrossSectionalOp on the Python side does
-    not silently succeed without a matching bundled PTX kernel.
+    The `kind` string is `cs_rank_f{32,64}`.  Do not fabricate kinds for
+    cross-sectional ops unless the C++ runtime has a matching bundled
+    external kernel.
     """
     compute = [op for op in f.ops
                 if not isinstance(op, (Input, Output))]
     if len(compute) != 1 or not isinstance(compute[0], SimpleCrossSectionalOp):
         return None
+    if not isinstance(compute[0], Rank):
+        return None
     inputs  = [op for op in f.ops if isinstance(op, Input)]
     outputs = [op for op in f.ops if isinstance(op, Output)]
     if len(inputs) != 1 or len(outputs) != 1:
diff --git a/KunQuant/passes/ExperimentalExpand.py b/KunQuant/passes/ExperimentalExpand.py
index dac145f..f0af28d 100644
--- a/KunQuant/passes/ExperimentalExpand.py
+++ b/KunQuant/passes/ExperimentalExpand.py
@@ -25,16 +25,19 @@
 
 * ``SetInfOrNanToValue(a, value)`` → ``Select(isnan(a - a), value, a)``
   (mirrors the C++ implementation; ``a - a`` is NaN for both NaN and ±Inf).
+
+* ``ReduceDecayLinear(v, window)`` → ``ReduceAdd(v * weight)`` where
+  ``weight = (WindowLoopIndex + 1) / (window * (window + 1) / 2)``.
 """
 
-from typing import Dict, List
+from typing import Callable, Dict, List, Optional, Tuple, Type
 
 from KunQuant.Op import (
     OpBase, Builder, ConstantOp, ForeachBackWindow, IterValue,
     WindowedTempOutput, WindowLoopIndex,
 )
 from KunQuant.ops.ElewiseOp import Select, Equals, Not, SetInfOrNanToValue
-from KunQuant.ops.ReduceOp import ReduceAdd
+from KunQuant.ops.ReduceOp import ReduceAdd, ReduceDecayLinear
 from KunQuant.ops.MiscOp import (
     FastWindowedSum, Accumulator, SetAccumulator,
     ExpMovingAvg, WindowedLinearRegression,
@@ -150,30 +153,38 @@ def _expand_linreg(op: WindowedLinearRegression) -> List[OpBase]:
 
 # ── Consumer formulas (one per Impl op) ─────────────────────────────
 
-def _expand_lr_slope(impl: WindowedLinearRegressionSlopeImpl,
-                     state: List[OpBase]) -> OpBase:
-    return state[_LR_SLOPE]
+def _expand_lr_slope(op: OpBase,
+                     state: Dict[OpBase, List[OpBase]]) -> OpBase:
+    return state[op.inputs[0]][_LR_SLOPE]
 
 
-def _expand_lr_rsquare(impl: WindowedLinearRegressionRSqaureImpl,
-                       state: List[OpBase], window: int) -> OpBase:
+def _expand_lr_rsquare(op: OpBase,
+                       state: Dict[OpBase, List[OpBase]]) -> OpBase:
+    lin_op = op.inputs[0]
+    lr_state = state[lin_op]
     # SS_reg = slope² * (window*sum_xx - sum_x²) / window = slope² * denom / window
     # SS_tot = sum_yy - sum_y²/window
     # R²     = SS_reg / SS_tot
-    n     = float(window)
+    n     = float(lin_op.attrs["window"])
     denom = (n * n) * (n - 1.0) * (n + 1.0) / 12.0
-    slope = state[_LR_SLOPE]
+    slope = lr_state[_LR_SLOPE]
     ss_reg = (slope * slope) * (denom / n)
-    ss_tot = state[_LR_SUM_YY] - (state[_LR_SUM_Y] * state[_LR_SUM_Y]) / n
+    ss_tot = (
+        lr_state[_LR_SUM_YY] -
+        (lr_state[_LR_SUM_Y] * lr_state[_LR_SUM_Y]) / n)
     return ss_reg / ss_tot
 
 
-def _expand_lr_resi(impl: WindowedLinearRegressionResiImpl,
-                    state: List[OpBase], window: int) -> OpBase:
+def _expand_lr_resi(op: OpBase,
+                    state: Dict[OpBase, List[OpBase]]) -> OpBase:
+    lin_op = op.inputs[0]
+    lr_state = state[lin_op]
     # residual at the newest window position (x = window-1):
     #   v_t - (slope * (window-1) + intercept)
-    pred = state[_LR_SLOPE] * float(window - 1) + state[_LR_INTERCEPT]
-    return state[_LR_V] - pred
+    pred = (
+        lr_state[_LR_SLOPE] * float(lin_op.attrs["window"] - 1) +
+        lr_state[_LR_INTERCEPT])
+    return lr_state[_LR_V] - pred
 
 
 # ── SetInfOrNanToValue expansion ────────────────────────────────────
@@ -189,6 +200,46 @@ def _expand_set_inf_or_nan(op: SetInfOrNanToValue) -> OpBase:
     return Select(mask, ConstantOp(op.attrs["value"]), a)
 
 
+# ── DecayLinear reduction expansion ─────────────────────────────────
+
+def _expand_decay_linear(op: ReduceDecayLinear) -> OpBase:
+    if len(op.inputs) != 1:
+        raise RuntimeError(
+            f"experimental_expand: ReduceDecayLinear expects one input "
+            f"(op = {op})")
+    window = int(op.attrs["window"])
+    denom = (1.0 + window) * window / 2.0
+    loop = op.get_loop()
+    with loop:
+        idx = WindowLoopIndex(loop)
+        weight = (idx + 1.0) * (1.0 / denom)
+        contrib = op.inputs[0] * weight
+    return ReduceAdd(contrib)
+
+
+# ── Dispatch table helpers ──────────────────────────────────────────
+
+ExpandFunc = Callable[[OpBase, Dict[OpBase, List[OpBase]]], OpBase]
+ExpandRule = Tuple[Type[OpBase], ExpandFunc]
+
+
+_EXPAND_RULES: List[ExpandRule] = [
+    (ExpMovingAvg, lambda op, state: _expand_ema(op)),
+    (WindowedLinearRegressionSlopeImpl, _expand_lr_slope),
+    (WindowedLinearRegressionRSqaureImpl, _expand_lr_rsquare),
+    (WindowedLinearRegressionResiImpl, _expand_lr_resi),
+    (SetInfOrNanToValue, lambda op, state: _expand_set_inf_or_nan(op)),
+    (ReduceDecayLinear, lambda op, state: _expand_decay_linear(op)),
+]
+
+
+def _find_expand_rule(op: OpBase) -> Optional[ExpandFunc]:
+    for op_type, expand in _EXPAND_RULES:
+        if isinstance(op, op_type):
+            return expand
+    return None
+
+
 # ── Pass driver ─────────────────────────────────────────────────────
 
 def _experimental_expand_impl(
@@ -203,15 +254,6 @@ def _experimental_expand_impl(
     for op in ops:
         op.replace_inputs(replace_map)
 
-        if isinstance(op, ExpMovingAvg):
-            b = Builder(op.get_parent())
-            with b:
-                new_val = _expand_ema(op)
-            out.extend(b.ops)
-            replace_map[op] = new_val
-            changed = True
-            continue
-
         if isinstance(op, WindowedLinearRegression):
             b = Builder(op.get_parent())
             with b:
@@ -226,42 +268,11 @@ def _experimental_expand_impl(
             changed = True
             continue
 
-        if isinstance(op, WindowedLinearRegressionSlopeImpl):
-            lin_op = op.inputs[0]
-            b = Builder(op.get_parent())
-            with b:
-                new_val = _expand_lr_slope(op, state[lin_op])
-            out.extend(b.ops)
-            replace_map[op] = new_val
-            changed = True
-            continue
-
-        if isinstance(op, WindowedLinearRegressionRSqaureImpl):
-            lin_op = op.inputs[0]
-            window = lin_op.attrs["window"]
-            b = Builder(op.get_parent())
-            with b:
-                new_val = _expand_lr_rsquare(op, state[lin_op], window)
-            out.extend(b.ops)
-            replace_map[op] = new_val
-            changed = True
-            continue
-
-        if isinstance(op, WindowedLinearRegressionResiImpl):
-            lin_op = op.inputs[0]
-            window = lin_op.attrs["window"]
-            b = Builder(op.get_parent())
-            with b:
-                new_val = _expand_lr_resi(op, state[lin_op], window)
-            out.extend(b.ops)
-            replace_map[op] = new_val
-            changed = True
-            continue
-
-        if isinstance(op, SetInfOrNanToValue):
+        expand = _find_expand_rule(op)
+        if expand is not None:
             b = Builder(op.get_parent())
             with b:
-                new_val = _expand_set_inf_or_nan(op)
+                new_val = expand(op, state)
             out.extend(b.ops)
             replace_map[op] = new_val
             changed = True
diff --git a/KunQuant/passes/TempWindowElim.py b/KunQuant/passes/TempWindowElim.py
index 324fc93..bf51eb2 100644
--- a/KunQuant/passes/TempWindowElim.py
+++ b/KunQuant/passes/TempWindowElim.py
@@ -1,5 +1,8 @@
 from KunQuant.passes.Util import kun_pass
-from KunQuant.Op import OpBase, WindowedTempOutput, Input, Output, traverse_replace_map
+from KunQuant.Op import (
+    OpBase, WindowedTempOutput, Input, Output, WindowedTrait,
+    traverse_replace_map,
+)
 from KunQuant.Stage import Function
 from typing import List, Dict, Tuple
 
@@ -9,13 +12,18 @@ def _get_temp_out_with_window(op: OpBase, window: int):
     w = op.attrs["window"]
     return w >= window, w
 
-def for_each_op(op: OpBase, f: Function, replace_map: dict) -> Tuple[OpBase, OpBase]:
+def for_each_op(op: OpBase, f: Function, replace_map: dict, may_slice_time: bool) -> Tuple[OpBase, OpBase]:
     if not isinstance(op, WindowedTempOutput):
         return (op, None)
     inp = op.inputs[0]
     # temp window on input, simply eliminate it
     if isinstance(inp, Input):
         return (None, inp)
+    # If nobody consumes this as a windowed source, the temp window is just
+    # the current input value and can be folded away.
+    if not any(isinstance(user, WindowedTrait)
+               for user in f.op_to_id[op].uses):
+        return (None, inp)
     # check if the input of WindowedTempOutput is used in Output or other WindowedTempOutput
     inp_info = f.op_to_id[inp]
     window = op.attrs["window"]
@@ -24,8 +32,10 @@ def for_each_op(op: OpBase, f: Function, replace_map: dict) -> Tuple[OpBase, OpB
     for user, _ in inp_info.uses.items():
         if user == op:
             continue
-        # if the user is used by Output, return the output
-        if isinstance(user, Output):
+        # If the input also feeds an Output, reuse that Output as the window
+        # source, unless the runtime may slice time: reading history from an
+        # output buffer can race across chunks, so keep the local temp window.
+        if not may_slice_time and isinstance(user, Output):
             return (None, traverse_replace_map(user, replace_map))
         # select the max window op with the larger id
         checked, w = _get_temp_out_with_window(user, window)
@@ -46,15 +56,19 @@ def _unwrap_output_wto(ops: List[OpBase], f: Function) -> bool:
         src = op.inputs[0]
         if not isinstance(src, WindowedTempOutput):
             continue
+        old_src = src
         while isinstance(src, WindowedTempOutput):
             src = src.inputs[0]
+        if op in f.op_to_id[old_src].uses:
+            del f.op_to_id[old_src].uses[op]
         f.op_to_id[src].uses[op] = 1
         op.inputs[0] = src
         changed = True
     return changed
 
-def temp_window_elim_impl(ops: List[OpBase], f: Function) -> List[OpBase]:
+def temp_window_elim_impl(ops: List[OpBase], f: Function, options: dict) -> List[OpBase]:
     _unwrap_output_wto(ops, f)
+    may_slice_time = options.get("may_slice_time", False)
     replace_map = dict()
     out = []
     changed = False
@@ -62,7 +76,7 @@ def temp_window_elim_impl(ops: List[OpBase], f: Function) -> List[OpBase]:
         if op in replace_map:
             continue
         op.replace_inputs(replace_map)
-        normal, replacer = for_each_op(op, f, replace_map)
+        normal, replacer = for_each_op(op, f, replace_map, may_slice_time)
         if normal is not None:
             out.append(op)
         else:
@@ -74,7 +88,7 @@ def temp_window_elim_impl(ops: List[OpBase], f: Function) -> List[OpBase]:
 
 @kun_pass
 def temp_window_elim(f: Function, options: dict = {}):
-    newops = temp_window_elim_impl(f.ops, f)
+    newops = temp_window_elim_impl(f.ops, f, options)
     if newops is not None:
         newops = Function.topo_sort_ops(newops)
-        f.set_ops(newops)
\ No newline at end of file
+        f.set_ops(newops)
diff --git a/doc/Customize.md b/doc/Customize.md
index 5b4a8f5..f8aca63 100644
--- a/doc/Customize.md
+++ b/doc/Customize.md
@@ -138,6 +138,7 @@ The `CppCompilerConfig` controls how KunQuant calls the C++ compiler. To choose
 | fast_log | Use KunQuant's implementation of math log function instead of `std::log` |  bool  |  True  |
 | no_fast_stat | Disable fast rolling algorithm for statistics functions like stddev/corr/etc. Setting this flag to True may help to get better precision with the cost of performance. KunQuant will warn the precision issue if `options['no_fast_stat']==False`. To disable the warning and set no_fast_stat to False, set `options['no_fast_stat']=='no_warn'` |  bool or Literal\["no_warn"\]  |  If dtype is float or in stream mode, True. Otherwise, False |
 | no_skip_list | Disable the skip-list decompose path for large-window WindowedMin/WindowedMax/TsArgMin/TsArgMax/TsRank, falling back to the naive `ForeachBackWindow + Reduce*` lowering regardless of window/blocking_len cost.  `WindowedQuantile` has no non-skip-list path and will raise when this is set.  Set automatically by the GPU backend (`KunQuant.jit.cuda`) because the kunir codegen does not lower `SkipList*` ops. |  bool  |  False (CPU); forced True on GPU |
+| may_slice_time | Tell optimization passes that the runtime may execute one graph over multiple time chunks in parallel. When this is True, `TempWindowElim` keeps a local temp window instead of reading history back from an `Output` buffer; temp windows on `Input`s and merging into a larger temp window are still optimized. |  bool  |  False; GPU backend defaults to True unless explicitly set |
 
 ## Specifing Memory layouts and data types and enabling AVX512
 
@@ -186,4 +187,4 @@ There are some configurable options of function `compileit(...)` above that may
  * Input and output memory layout: `compileit(input_layout=?, output_layout=?)`. This affects how data are arranged in memory. Usually `STs` layout is faster than `TS` but may require some additional memory movement when you call the factor library.
  * Partition factor: `compileit(partition_factor=some_int)`. A larger Partition factor will put more computations in a single generated function in C++. Enlarging Partition factor may reduce the overhead of thread-scheduling and eliminate some of the temp buffers. However, if the factor is too high, the generated C++ code will suffer from register-spilling.
  * Blocking len: `compileit(blocking_len=some_int)`. It selects AVX2 or AVX512 instruction sets. Using AVX512 might have some slight performance gain over AVX2.
- * Unaligned stock number: `compileit(allow_unaligned=some_bool)`. By default `True`. When `allow_unaligned` is set to false, the generated C++ code will assume the number of stocks to be aligned with the SIMD length (e.g., 8 float32 on AVX2). This will slightly improve the performance.
\ No newline at end of file
+ * Unaligned stock number: `compileit(allow_unaligned=some_bool)`. By default `True`. When `allow_unaligned` is set to false, the generated C++ code will assume the number of stocks to be aligned with the SIMD length (e.g., 8 float32 on AVX2). This will slightly improve the performance.
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index e1b8067..f4099f0 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -491,6 +491,11 @@ NB_MODULE(KunMLIR, m) {
       .def_prop_ro("output_names",  &kun_cuda::Executable::graphOutputs,
             "Graph-level output names — match this against the keys of the "
             "args dict you pass to launch().")
+      .def("getOutputNames",
+            [](const kun_cuda::Executable &e) {
+              return e.graphOutputs();
+            },
+            "CPU-runtime-compatible alias for `output_names`.")
       .def_prop_ro("warps_per_cta", &kun_cuda::Executable::warpsPerCta)
       .def_prop_ro("vector_size",   &kun_cuda::Executable::vectorSize)
       .def_prop_ro("num_kernels",
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index a73d08b..6831e07 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -112,6 +112,23 @@ def build_func_backref(N: int) -> Function:
     return Function(builder.ops, name="backref_kernel")
 
 
+def build_func_output_backref(N: int, delay: int) -> Function:
+    """raw = WindowedSum(a, N); delayed = BackRef(raw, delay).
+
+    `raw` is also a graph output.  When the runtime slices time, the
+    optimizer must keep a local WindowedTempOutput for the BackRef source
+    rather than reusing the graph output buffer: warmup rows in each chunk
+    are computed but masked from graph-output stores.
+    """
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        raw = WindowedSum(a, N)
+        Output(raw, "raw")
+        Output(BackRef(raw, delay), "delayed")
+    return Function(builder.ops, name="output_backref_kernel")
+
+
 def build_func_fastwindowedsum(N: int) -> Function:
     """ws = FastWindowedSum(a + b, N) — same windowed-sum semantics as
     WindowedSum, but uses the stateful Kahan-corrected algorithm from
@@ -605,6 +622,60 @@ def run_backref_with_mask(target: str, T: int, S: int, N: int,
                                   valid_start=valid_start, atol=1e-5)
 
 
+def run_output_backref_multichunk(target: str, T: int, S: int,
+                                    N: int) -> int:
+    """Regression test for TempWindowElim under time slicing.
+
+    The graph outputs `raw = WindowedSum(a, N)` and also consumes `raw`
+    through `BackRef(raw, 2)`.  With multi-chunk launches, replacing the
+    BackRef's local temp window with the graph output buffer is wrong:
+    chunk warmup rows are intentionally not stored to graph outputs, and
+    peer chunks are not globally synchronized inside one kernel launch.
+    """
+    delay = 2
+    print(f"=== output-backed BackRef regression: raw=WindowedSum(a, N={N}), "
+           f"delayed=raw[t-{delay}] ===")
+    f = build_func_output_backref(N, delay)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(17)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    raw_out = cp.zeros((T, S), dtype=cp.float32)
+    delayed_out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h)},
+                       outputs={"raw": raw_out, "delayed": delayed_out})
+    raw_h = cp.asnumpy(raw_out)
+    delayed_h = cp.asnumpy(delayed_out)
+
+    cumsum = np.cumsum(a_h, axis=0, dtype=np.float64)
+    raw_expected = np.empty((T, S), dtype=np.float32)
+    raw_expected[:N - 1] = np.nan
+    raw_expected[N - 1] = cumsum[N - 1]
+    if T > N:
+        raw_expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+
+    delayed_expected = np.empty((T, S), dtype=np.float32)
+    delayed_expected[:N - 1 + delay] = np.nan
+    delayed_expected[N - 1 + delay:] = raw_expected[N - 1:T - delay]
+
+    rc = 0
+    rc |= _compare_post_warmup(raw_h, raw_expected,
+                                  valid_start=N - 1,
+                                  atol=max(1e-3, 5e-7 * N))
+    rc |= _compare_post_warmup(delayed_h, delayed_expected,
+                                  valid_start=N - 1 + delay,
+                                  atol=max(1e-3, 5e-7 * N))
+    return rc
+
+
 def run_library(target: str, T: int, S: int) -> int:
     """Exercise the multi-Function `compileit` shape and `Library.getModule`,
     plus the auto-allocated-output path on `Executor.runGraph` (omitting
@@ -705,6 +776,9 @@ def main() -> int:
     rc |= run_backref_with_mask(args.target, args.time_length, args.num_stocks,
                                   args.window, mask=3)
     print()
+    rc |= run_output_backref_multichunk(args.target, args.time_length,
+                                          args.num_stocks, args.window)
+    print()
     rc |= test_windowed(windowed_exe, args.time_length, args.num_stocks,
                           args.window, mask=3)
     print()
diff --git a/tests/KunTestUtil/ref_alpha101.py b/tests/KunTestUtil/ref_alpha101.py
index 41f6970..965b337 100644
--- a/tests/KunTestUtil/ref_alpha101.py
+++ b/tests/KunTestUtil/ref_alpha101.py
@@ -193,7 +193,8 @@ def decay_linear(df, period=10):
     # The backtest engine should assure to be snooping bias free.
     for row in range(period - 1, df.shape[0]):
         x = na_series[row - period + 1: row + 1, :]
-        na_lwma[row, :] = (np.dot(x.T, y))
+        with np.errstate(invalid="ignore"):
+            na_lwma[row, :] = np.dot(x.T, y)
     return pd.DataFrame(na_lwma, index=df.index, columns=df.columns)  
 # endregion
 
@@ -834,5 +835,3 @@ def alpha099(self):
     # Alpha#101	 ((close - open) / ((high - low) + .001))
     def alpha101(self):
         return (self.close - self.open) /((self.high - self.low) + 0.001)
-     
-     
\ No newline at end of file
diff --git a/tests/test.py b/tests/test.py
index 2dc4480..7302551 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -213,6 +213,77 @@ def check_tempwindow_elim():
 v7 = Output@{name:r30}(v4)
 v8 = Output@{name:r20}(v6)''')
 
+    # case 6, when time slicing is allowed, keep a local temp window
+    # instead of reading history from the output buffer.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 10)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 10))
+        Output(sq, "xport")
+        Output(v1, "reduced")
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = WindowedTempOutput@{window:10}(v1)
+v3 = ForeachBackWindow@{window:10}(v2)
+v4 = ReduceAdd@(v3)
+v5 = Output@{name:xport}(v1)
+v6 = Output@{name:reduced}(v4)''')
+
+    # case 7, may_slice_time still allows Input and larger-temp-window
+    # replacement; only Output replacement is disabled.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        wto = WindowedTempOutput(inp, 10)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 10))
+        Output(v1)
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = ForeachBackWindow@{window:10}(v0)
+v2 = ReduceAdd@(v1)
+v3 = Output@{name:}(v2)''')
+
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto10 = WindowedTempOutput(sq, 10)
+        wto15 = WindowedTempOutput(sq, 15)
+        v1 = ReduceAdd(ForeachBackWindow(wto10, 10))
+        v2 = ReduceAdd(ForeachBackWindow(wto15, 10))
+        Output(sq, "xport")
+        Output(v1, "r10")
+        Output(v2, "r15")
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    for op in f.ops:
+        if isinstance(op, ForeachBackWindow):
+            if isinstance(op.inputs[0], Output):
+                raise RuntimeError("may_slice_time replaced temp with Output")
+            if not isinstance(op.inputs[0], WindowedTempOutput):
+                raise RuntimeError("larger temp window replacement failed")
+
+    # case 8, if no windowed op consumes the temp window, it can be
+    # replaced by its input even with may_slice_time enabled.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 10)
+        v1 = AddConst(wto, 1)
+        Output(v1, "out")
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = AddConst@{value:1}(v1)
+v3 = Output@{name:out}(v2)''')
+
 def check_window():
     # case 1, temp window on input
     builder = Builder()
@@ -550,4 +621,4 @@ def check_pow():
     check_mergeLoop()
     check_toposort()
     check_duplicate_rank_out()
-    check_duplicate_rank_in()
\ No newline at end of file
+    check_duplicate_rank_in()
diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index c778672..3219319 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -1,12 +1,14 @@
 from KunQuant.Driver import KunCompilerConfig
 from KunTestUtil import ref_alpha101, gen_data
+import argparse
+import dataclasses
 import numpy as np
 import pandas as pd
 import sys
 import time
 import os
 from KunQuant.jit import cfake
-from KunQuant.Op import Builder, Input, Output
+from KunQuant.Op import Builder, Input, Output, Scale
 from KunQuant.Stage import Function
 from KunQuant.predefined.Alpha101 import AllData, all_alpha
 from KunQuant.runner import KunRunner as kr
@@ -14,6 +16,52 @@
 
 isx86 = cpu_arch != "aarch64"
 
+_argp = argparse.ArgumentParser(add_help=False)
+_argp.add_argument("action", nargs="?")
+_argp.add_argument("--gpu-arch", default="")
+_args, _ = _argp.parse_known_args()
+action = _args.action or ("run_gpu" if _args.gpu_arch else "avx2")
+GPU_ARCH = _args.gpu_arch or ("sm_80" if action == "run_gpu" else "")
+GPU_MODE = bool(GPU_ARCH)
+
+if GPU_MODE:
+    import cupy as cp
+    from KunQuant.jit import KunMLIR as _kr_mlir
+    from KunQuant.jit import cuda as _cuda_jit
+
+    cp.cuda.Device(0).use()
+    cp.zeros((1,), dtype=cp.float32)
+
+
+_GPU_SKIP_DEP_TYPES = (Scale,)
+
+
+def _depends_on_type(op, dep_types, seen=None):
+    if seen is None:
+        seen = set()
+    if op in seen:
+        return False
+    seen.add(op)
+    if isinstance(op, dep_types):
+        return True
+    return any(_depends_on_type(inp, dep_types, seen) for inp in op.inputs)
+
+
+def _filter_outputs_for_gpu(f: Function) -> None:
+    kept = []
+    dropped = []
+    for op in f.ops:
+        if isinstance(op, Output) and _depends_on_type(op.inputs[0],
+                                                       _GPU_SKIP_DEP_TYPES):
+            dropped.append(op.attrs["name"])
+            continue
+        kept.append(op)
+    if dropped:
+        print(f"[gpu] dropping {len(dropped)} unsupported outputs: "
+              f"{dropped}")
+    f.set_ops(kept)
+
+
 def get_simd_len(avx: str, dtype: str = "float"):
     element_width = 32 if dtype == "float" else 64
     if avx == "avx512":
@@ -130,6 +178,42 @@ def TS_ST(data: np.ndarray) -> np.ndarray:
 def ST_TS(data: np.ndarray) -> np.ndarray:
     return np.ascontiguousarray(data.transpose())
 
+
+def get_output_layout(modu):
+    return "TS" if GPU_MODE else modu.output_layout
+
+
+def create_single_thread_executor():
+    return _kr_mlir.Executor() if GPU_MODE else kr.createSingleThreadExecutor()
+
+
+def create_multi_thread_executor(n):
+    return _kr_mlir.Executor() if GPU_MODE else kr.createMultiThreadExecutor(n)
+
+
+def run_graph(executor, modu, inputs, cur_time, length, outputs=None, **kwargs):
+    if not GPU_MODE:
+        return kr.runGraph(executor, modu, inputs, cur_time, length,
+                           outputs if outputs is not None else {}, **kwargs)
+    if cur_time != 0:
+        raise RuntimeError("GPU alpha101 test only supports cur_time=0")
+    gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
+    ret = executor.runGraph(modu, gpu_inputs, cur_time=cur_time,
+                            length=length)
+    executor.synchronize()
+
+    out_np = {}
+    for k, v in ret.items():
+        arr = v if isinstance(v, cp.ndarray) else cp.from_dlpack(v)
+        host = cp.asnumpy(arr)
+        if outputs is not None and k in outputs:
+            outputs[k][...] = host
+            out_np[k] = outputs[k]
+        else:
+            out_np[k] = host
+    return out_np
+
+
 def make_data_and_ref(num_stock, num_time, ischeck, input_ST8t, dtype="float32"):
     rng = np.random.get_state()
     start = time.time()
@@ -263,7 +347,9 @@ def check_result(out, ref, outnames, start_window, num_stock, start_time, num_ti
         cur_rtol = tolerance["rtol"].get(k, rtol)
         cur_atol = tolerance["atol"].get(k, atol)
         check_start = 0
-        if start_time or k in tolerance["skip_head"]:
+        # GPU kernels do not match the CPU/pandas partial-window warmup rows even
+        # when start_time == 0, so the nonzero start_time skip is not enough.
+        if GPU_MODE or start_time or k in tolerance["skip_head"]:
             check_start = start_window[k] + start_time
         v = out[k][:,check_start-start_time:]
         refv = ref[k][check_start:].to_numpy().transpose()
@@ -295,9 +381,12 @@ def check_result(out, ref, outnames, start_window, num_stock, start_time, num_ti
     return done
 
 def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ischeck, start_time):
+    if GPU_MODE and start_time != 0:
+        print(f"[skip on GPU] start_time={start_time}")
+        return True
     # prepare outputs
     outnames = modu.getOutputNames()
-    layout = modu.output_layout
+    layout = get_output_layout(modu)
     outbuffers = dict()
     print(layout)
     if layout == "TS":
@@ -310,15 +399,20 @@ def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ische
     # blocked = TS_STs(inp)
     
     if not ischeck:
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
+        out = run_graph(executor, modu, my_input, start_time,
+                        num_time-start_time, outbuffers)
         start = time.time()
         for _ in range(20):
-            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
+            out = run_graph(executor, modu, my_input, start_time,
+                            num_time-start_time, outbuffers,
+                            skip_check=True)
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, num_stocks = num_stock)
+        out = run_graph(executor, modu, my_input, start_time,
+                        num_time-start_time, outbuffers,
+                        num_stocks=num_stock)
         end = time.time()
         tdiff = end-start
     print(f"Exec takes: {tdiff:.6f} seconds")
@@ -372,9 +466,12 @@ def streammain(num_stock):
 
 
 def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, ischeck, start_time):
+    if GPU_MODE and start_time != 0:
+        print(f"[skip on GPU] start_time={start_time}")
+        return True
     # prepare outputs
     outnames = modu.getOutputNames()
-    layout = modu.output_layout
+    layout = get_output_layout(modu)
     outbuffers = dict()
     print(layout)
     if layout == "TS":
@@ -386,15 +483,19 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
     # print(ref.alpha001())
     # blocked = TS_STs(inp)
     if not ischeck:
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
+        out = run_graph(executor, modu, my_input, start_time,
+                        num_time-start_time, outbuffers)
         start = time.time()
         for _ in range(20):
-            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
+            out = run_graph(executor, modu, my_input, start_time,
+                            num_time-start_time, outbuffers,
+                            skip_check=True)
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
+        out = run_graph(executor, modu, my_input, start_time,
+                        num_time-start_time, outbuffers)
         end = time.time()
         tdiff = end-start
     print(f"Exec takes: {tdiff:.6f} seconds")
@@ -414,18 +515,19 @@ def main(is64: bool, is_check: bool):
     num_stock = 64
     done = True
     testfunc = test64 if is64 else test
-    blocking_num = modu.blocking_len
+    blocking_num = 1 if GPU_MODE else modu.blocking_len
     # fp64 version is compiled with TS format
-    blocking = 0 if is64 else blocking_num
+    blocking = 0 if is64 or GPU_MODE else blocking_num
     def compute():
         nonlocal done
         num_time = 260
         my_input, pd_ref = make_data_and_ref(num_stock, num_time, is_check, blocking, "float64" if is64 else "float32")
-        executor = kr.createSingleThreadExecutor()
+        executor = create_single_thread_executor()
         done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
         done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
-        executor = kr.createMultiThreadExecutor(4)
-        done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
+        if not GPU_MODE:
+            executor = create_multi_thread_executor(4)
+            done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
     num_stock = 64
     compute()
     # skip benchmarking on unaligned mode
@@ -439,13 +541,24 @@ def compute():
     if not done:
         exit(1)
 
-action = sys.argv[1]
 def do_compile(avx, keep, tempdir):
     funclist = [
         check_alpha101(avx),
         check_alpha101_stream(avx),
         check_alpha101_double(avx)
         ]
+    if GPU_MODE:
+        gpu_funclist = []
+        for name, f, kcfg in funclist:
+            if name == "alpha_101_stream":
+                continue
+            _filter_outputs_for_gpu(f)
+            kcfg = dataclasses.replace(kcfg, input_layout="TS",
+                                       output_layout="TS",
+                                       blocking_len=1)
+            gpu_funclist.append((name, f, kcfg))
+        ccfg = _cuda_jit.CudaCompilerConfig(gpu_arch=GPU_ARCH)
+        return _cuda_jit.compileit(gpu_funclist, "test", ccfg)
     if avx == "avx512":
         machine = cfake.X64CPUFlags(avx512=True, avx512dq=True, avx512vl=True)
     else:
@@ -464,10 +577,11 @@ def do_compile(avx, keep, tempdir):
 print("======================================")
 print("Check f32 batch")
 main(False, True)
-print("======================================")
-print("Check f32 stream")
-streammain(64)
-if action != "run_avx512" and isx86:
+if not GPU_MODE:
+    print("======================================")
+    print("Check f32 stream")
+    streammain(64)
+if not GPU_MODE and action != "run_avx512" and isx86:
     print("======================================")
     print("Check f32 stream unaligned")
     streammain(63)

From 3fee085219a2d46516a5f6565b75318e3d799d05 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 21 May 2026 03:08:36 -0700
Subject: [PATCH 41/59] scale op

---
 KunQuant/passes/CodegenMLIR.py       |  12 +--
 mlir/include/KunCuda/Runtime.h       |  32 ++++---
 mlir/lib/KunCuda/CMakeLists.txt      |   2 +-
 mlir/lib/KunCuda/EmbedFile.cmake     |   2 +-
 mlir/lib/KunCuda/Runtime.cpp         | 131 ++++++++++++++++++---------
 mlir/lib/KunCuda/kernels/cs_scale.cu | 110 ++++++++++++++++++++++
 mlir/lib/Python/MlirBinding.cpp      |  10 +-
 tests/test_alpha101.py               |  33 +------
 8 files changed, 235 insertions(+), 97 deletions(-)
 create mode 100644 mlir/lib/KunCuda/kernels/cs_scale.cu

diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index e710e08..6cef15f 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -9,7 +9,7 @@
 Scope (v0): only the ops kunir currently supports.
   - Elemwise binary: Add, Sub, Mul, Div, Max, Min
   - Elemwise unary:  Abs, Log, Sign
-  - Cross-sectional: Rank
+  - Cross-sectional: Rank, Scale
   - Windowed:        WindowedTempOutput, ForeachBackWindow + IterValue,
                       ReduceAdd / ReduceMul / ReduceMax / ReduceMin
   - Boundaries:      Input, Output
@@ -29,7 +29,7 @@
 from KunQuant.Op import (
     OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
     WindowLoopIndex, ReductionOp, SimpleCrossSectionalOp, ConstantOp,
-    WindowedTrait, Rank,
+    WindowedTrait, Rank, Scale,
 )
 from KunQuant.ops.ElewiseOp import (
     Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
@@ -69,7 +69,7 @@
 _UNARY = {
     Abs: "abs", Log: "log", Exp: "exp", Sqrt: "sqrt", Sign: "sign",
     Not: "not_",
-    # NOTE: `Rank` is intentionally absent.  Cross-sectional rank
+    # NOTE: cross-sectional ops are intentionally absent.
     # partitions are routed to a pre-compiled CUmodule by
     # `_maybe_external_partition` below; they never become kunir ops.
 }
@@ -250,10 +250,10 @@ def _maybe_external_partition(f: Function, dtype: str) -> Optional[dict]:
     Detection mirrors CodegenCpp's "simple cross-sectional fast path"
     (CodegenCpp.codegen_cpp's `len(f.ops) == 3` check): a partition
     whose only compute op is a supported `SimpleCrossSectionalOp`
-    (currently Rank).  The partitioner places every CrossSectionalOp into its own
+    (currently Rank or Scale).  The partitioner places every CrossSectionalOp into its own
     partition without other compute, so this shape is what we get.
 
-    The `kind` string is `cs_rank_f{32,64}`.  Do not fabricate kinds for
+    The `kind` string is `cs_<op>_f{32,64}`.  Do not fabricate kinds for
     cross-sectional ops unless the C++ runtime has a matching bundled
     external kernel.
     """
@@ -261,7 +261,7 @@ def _maybe_external_partition(f: Function, dtype: str) -> Optional[dict]:
                 if not isinstance(op, (Input, Output))]
     if len(compute) != 1 or not isinstance(compute[0], SimpleCrossSectionalOp):
         return None
-    if not isinstance(compute[0], Rank):
+    if not isinstance(compute[0], (Rank, Scale)):
         return None
     inputs  = [op for op in f.ops if isinstance(op, Input)]
     outputs = [op for op in f.ops if isinstance(op, Output)]
diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index eb8eee7..d5d8ea8 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -65,14 +65,16 @@ class Executor;
 /// Kernel dispatch kind.  `Jit` kernels live in the cubin produced by
 /// the MLIR pipeline and are launched with the project-wide stock-major
 /// grid (block_x = warps_per_cta * 32, grid_x = ceil(S / block_x)).
-/// `ExtCsRank*` kernels are pre-compiled PTX bundled inside
+/// `ExtCs*` kernels are pre-compiled PTX bundled inside
 /// libKunCudaRuntime; the executor lazy-loads them as a second
 /// CUmodule and launches them with a time-major grid + dynamic shared
 /// memory sized to the cross-section (one CTA per timestep).
 enum class KernelKind : int32_t {
-  Jit          = 0,
-  ExtCsRankF32 = 1,
-  ExtCsRankF64 = 2,
+  Jit           = 0,
+  ExtCsRankF32  = 1,
+  ExtCsRankF64  = 2,
+  ExtCsScaleF32 = 3,
+  ExtCsScaleF64 = 4,
 };
 
 /// Per-kernel element type.  Currently single-precision (f32) and
@@ -91,7 +93,7 @@ inline size_t bytesPerElem(Datatype dt) noexcept {
 /// produce by walking a single lowered llvm.func — no graph topology
 /// reasoning required.
 struct KernelMeta {
-  std::string kernelName;                    ///< symbol in the cubin (Jit) or in the bundled PTX (ExtCsRank*)
+  std::string kernelName;                    ///< symbol in the cubin (Jit) or in the bundled PTX (ExtCs*)
   KernelKind kind = KernelKind::Jit;         ///< picked by the MLIR pass; default is the regular path
   std::vector<std::string> inputNames;       ///< kungpu.input_names, in argv order
   std::vector<std::string> outputNames;      ///< kungpu.output_names, in argv order
@@ -99,7 +101,7 @@ struct KernelMeta {
   /// Drives the time-chunk grid: chunks ≥ 1 need this many extra time
   /// steps before they can start writing reliable outputs, and the
   /// chunk-size heuristic gates the minimum chunk size at K × warmup.
-  /// Always 0 for external (cs_rank) kernels — they don't multi-chunk.
+  /// Always 0 for external cross-sectional kernels — they don't multi-chunk.
   int64_t unreliableCount = 0;
 };
 
@@ -112,10 +114,10 @@ struct ExecutableData {
   std::vector<char> cubin;
   int64_t warpsPerCta = 1;          ///< from kungpu.target_spec (graph-wide).
                                      ///<   Drives JIT kernels' block_x.
-                                     ///<   External cs_rank kernels IGNORE
+                                     ///<   External cross-sectional kernels IGNORE
                                      ///<   this — they auto-tune block_x
                                      ///<   from numStocks (see
-                                     ///<   launchExtCsRankKernel).
+                                     ///<   launchExtCsKernel).
   int64_t vectorSize  = 1;          ///< from kungpu.target_spec (graph-wide)
   Datatype dtype      = Datatype::Float;  ///< element type of every kernel
                                            ///<   I/O.  Graph-wide; verified
@@ -210,7 +212,7 @@ class Executable {
   /// `devMaxSmemBytes` is the device's MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
   /// cached by the caller (`Executor`) so the runtime can validate
   /// `num_stocks * sizeof(T)` against the GPU's smem cap before
-  /// invoking cuLaunchKernel for external cs_rank kernels.  Pass 0 if
+  /// invoking cuLaunchKernel for external cross-sectional kernels.  Pass 0 if
   /// there are no external kernels in the executable (the check is a
   /// no-op in that case).
   ///
@@ -227,11 +229,12 @@ class Executable {
   ///     compute.
   ///   - `smFillFactor` (≥ 0) is the target chunks-on-GPU multiplier:
   ///     JIT uses `num_chunks * stock_tiles ≥ smFillFactor * numSMs`;
-  ///     cs_rank uses `num_time_chunks ≥ smFillFactor * numSMs`.  1.0
+  ///     external cross-sectional kernels use
+  ///     `num_time_chunks ≥ smFillFactor * numSMs`.  1.0
   ///     just fills the GPU; > 1 leaves slack for scheduler latency
   ///     hiding.
   /// `exec` owns the CUDA stream + the cached device attributes
-  /// (`devMaxSmemBytes()`, `numSMs()`).  External (cs_rank) kernels
+  /// (`devMaxSmemBytes()`, `numSMs()`).  External cross-sectional kernels
   /// ignore the multi-chunk params — they keep their own auto-tune
   /// path using the same Executor accessors.
   void launchOnStream(Executor *exec,
@@ -252,10 +255,11 @@ class Executable {
   std::unique_ptr<GraphPlan> plan_;          ///< pImpl — defined in Runtime.cpp
 
   CUmodule cuModule_ = nullptr;
-  /// Module holding the pre-compiled cs_rank PTX.  Loaded at
-  /// construction time iff any kernel has `kind != Jit`; null
+  /// Modules holding pre-compiled cross-sectional PTX.  Loaded at
+  /// construction time iff a matching external kernel is present; null
   /// otherwise.
   CUmodule csRankModule_ = nullptr;
+  CUmodule csScaleModule_ = nullptr;
   std::vector<CUfunction> cuFuncs_;          ///< parallel to data_.kernels
 
   // Lazily allocated intermediate buffers, one CUdeviceptr per slot
@@ -322,7 +326,7 @@ class Executor {
   CUstream stream() const noexcept { return stream_; }
   /// Cached MAX_SHARED_MEMORY_PER_BLOCK_OPTIN of the device this
   /// Executor's CUcontext is bound to, queried once at construction.
-  /// Used to validate cs_rank dynamic-smem requests at launch time
+  /// Used to validate external cross-sectional dynamic-smem requests at launch time
   /// without a per-launch driver call.
   int devMaxSmemBytes() const noexcept { return devMaxSmemBytes_; }
   /// Cached MULTIPROCESSOR_COUNT of the device this Executor's CUcontext
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index 434d1b6..2140fe3 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -72,7 +72,7 @@ function(kun_add_bundled_ptx_kernel cu_path)
               -DSYMBOL=${_symbol}
               -DPTX_VERSION=7.8
               -P "${_kun_embed_cmake}"
-      DEPENDS ${_obj_tgt} "${_kun_embed_cmake}"
+      DEPENDS ${_obj_tgt} $<TARGET_OBJECTS:${_obj_tgt}> "${_kun_embed_cmake}"
       COMMENT "Embedding ${_stem}.ptx as ${_symbol}[] (downgrading to ISA 7.8)"
       VERBATIM
       COMMAND_EXPAND_LISTS)
diff --git a/mlir/lib/KunCuda/EmbedFile.cmake b/mlir/lib/KunCuda/EmbedFile.cmake
index 07eed3e..4b82922 100644
--- a/mlir/lib/KunCuda/EmbedFile.cmake
+++ b/mlir/lib/KunCuda/EmbedFile.cmake
@@ -42,7 +42,7 @@ string(REGEX REPLACE "(0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x
 file(WRITE "${OUTPUT}"
 "// Generated from \"${INPUT}\".  Do not edit by hand.
 static const unsigned char ${SYMBOL}[] = {
-  ${byte_list}
+  ${byte_list},0x00
 };
 static const unsigned int ${SYMBOL}_len = ${n_bytes};
 ")
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index c63f6ae..081739d 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -28,9 +28,9 @@
 #include <stdexcept>
 #include <unordered_map>
 
-// Pre-compiled cs_rank PTX, embedded by EmbedFile.cmake.  Exposes
-// `kun_cs_rank_ptx[]` (bytes) and `kun_cs_rank_ptx_len`.
+// Pre-compiled cross-sectional PTX, embedded by EmbedFile.cmake.
 #include "cs_rank_ptx.inc"
+#include "cs_scale_ptx.inc"
 
 namespace kun_cuda {
 
@@ -512,9 +512,9 @@ static ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
   return {chunkSize, static_cast<unsigned>(numChunks)};
 }
 
-/// External cs_rank launch.
+/// External cross-sectional launch.
 ///
-/// Block / grid both auto-tuned — cs_rank is cross-sectional, so the
+/// Block / grid both auto-tuned — these kernels are cross-sectional, so the
 /// graph-wide `warps_per_cta` hint doesn't apply.
 ///
 ///   blockX = clamp(round_up(numStocks, 32), 32, 1024)
@@ -524,7 +524,7 @@ static ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
 ///
 ///   gridX  = min(timeLength, ceil(smFillFactor * numSMs))
 ///       The kernel does a contiguous time-axis slice per CTA via a
-///       grid-stride loop (see kernels/cs_rank.cu).  For small T the
+///       chunked time loop (see kernels/cs_*.cu).  For small T the
 ///       min clamps to 1 CTA per timestep (matches the pre-tuning
 ///       launch shape); for large T fewer CTAs each do more time
 ///       steps, reducing launch / scheduling overhead.
@@ -535,26 +535,44 @@ static ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
 /// Falls back to (gridX = timeLength, blockX = 32) when the executor
 /// couldn't query `numSMs` from the device — degenerate "one CTA per
 /// timestep, one warp per CTA" still works correctly.
-static void launchExtCsRankKernel(CUfunction fn, KernelKind kind,
-                                    const std::string &kernelName,
-                                    int64_t timeLength, int64_t numStocks,
-                                    int devMaxSmemBytes,
-                                    double smFillFactor, int numSMs,
-                                    void **args, CUstream stream) {
-  size_t elemSize = (kind == KernelKind::ExtCsRankF64) ? 8u : 4u;
-  uint64_t smemBytes64 =
-      static_cast<uint64_t>(numStocks) * static_cast<uint64_t>(elemSize);
+static bool isF64ExternalKind(KernelKind kind) {  // true for the f64 external kinds; launchExtCsKernel uses it to pick 8- vs 4-byte elements
+  return kind == KernelKind::ExtCsRankF64 ||
+         kind == KernelKind::ExtCsScaleF64;
+}
+
+static bool isCsRankKind(KernelKind kind) {  // true for either precision of the cs_rank external kernel
+  return kind == KernelKind::ExtCsRankF32 ||
+         kind == KernelKind::ExtCsRankF64;
+}
+
+static bool isCsScaleKind(KernelKind kind) {  // true for either precision of the cs_scale external kernel
+  return kind == KernelKind::ExtCsScaleF32 ||
+         kind == KernelKind::ExtCsScaleF64;
+}
+
+static void launchExtCsKernel(CUfunction fn, KernelKind kind,
+                                const std::string &kernelName,
+                                int64_t timeLength, int64_t numStocks,
+                                int devMaxSmemBytes,
+                                double smFillFactor, int numSMs,
+                                void **args, CUstream stream) {
+  size_t elemSize = isF64ExternalKind(kind) ? 8u : 4u;
+  uint64_t smemElems = static_cast<uint64_t>(numStocks);
+  if (isCsScaleKind(kind))
+    smemElems += 1;
+  uint64_t smemBytes64 = smemElems * static_cast<uint64_t>(elemSize);
 
   if (devMaxSmemBytes <= 0)
     throw std::runtime_error(
-        "kun_cuda::launchOnStream: external cs_rank kernel '" + kernelName +
+        "kun_cuda::launchOnStream: external cross-sectional kernel '" +
+        kernelName +
         "' requires Executor's devMaxSmemBytes to be set; got 0.  "
         "Construct the Executable through Executor::runGraph, or pass "
         "devMaxSmemBytes when calling launchOnStream directly.");
   if (smemBytes64 > static_cast<uint64_t>(devMaxSmemBytes))
     throw std::runtime_error(
-        "kun_cuda::launchOnStream: cs_rank dynamic smem "
-        "(num_stocks=" + std::to_string(numStocks) +
+        "kun_cuda::launchOnStream: cross-sectional dynamic smem "
+        "(elements=" + std::to_string(smemElems) +
         " * sizeof(T)=" + std::to_string(elemSize) + " = " +
         std::to_string(smemBytes64) +
         " bytes) exceeds this GPU's MAX_SHARED_MEMORY_PER_BLOCK_OPTIN (" +
@@ -588,7 +606,7 @@ static void launchExtCsRankKernel(CUfunction fn, KernelKind kind,
   unsigned smemBytes = static_cast<unsigned>(smemBytes64);
   checkCu(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1,
                            smemBytes, stream, args, nullptr),
-           "cuLaunchKernel(cs_rank)");
+           "cuLaunchKernel(external_cs)");
 }
 
 //===----------------------------------------------------------------------===//
@@ -611,25 +629,35 @@ static void loadJitCubin(const ExecutableData &data, CUmodule &outModule) {
           "' declared but no cubin supplied — this is a compile-side bug");
 }
 
-/// Lazy-load the bundled cs_rank PTX as a second CUmodule iff any
-/// kernel uses it.  The driver JITs PTX → SASS on first load (cached
+/// Lazy-load bundled external cross-sectional PTX modules iff any
+/// kernel uses them.  The driver JITs PTX → SASS on first load (cached
 /// system-wide in ~/.nv/ComputeCache), so this is sub-ms after the
 /// first run on a given GPU.
-static void loadCsRankPtxIfNeeded(const std::vector<KernelMeta> &kernels,
-                                    CUmodule &outModule) {
+static void loadExternalCsPtxIfNeeded(const std::vector<KernelMeta> &kernels,
+                                      CUmodule &csRankModule,
+                                      CUmodule &csScaleModule) {
+  bool needRank = false;
+  bool needScale = false;
   for (const auto &k : kernels) {
-    if (k.kind != KernelKind::Jit) {
-      checkCu(cuModuleLoadData(&outModule, kun_cs_rank_ptx),
-               "cuModuleLoadData(cs_rank.ptx)");
-      return;
-    }
+    needRank |= isCsRankKind(k.kind);
+    needScale |= isCsScaleKind(k.kind);
+    if (needRank && needScale)
+      break;
   }
+
+  if (needRank)
+    checkCu(cuModuleLoadData(&csRankModule, kun_cs_rank_ptx),
+             "cuModuleLoadData(cs_rank.ptx)");
+  if (needScale)
+    checkCu(cuModuleLoadData(&csScaleModule, kun_cs_scale_ptx),
+             "cuModuleLoadData(cs_scale.ptx)");
 }
 
 /// Pick the right CUmodule + symbol name for a kernel and resolve it.
 static CUfunction resolveOneKernelSymbol(const KernelMeta &k,
                                           CUmodule jitModule,
-                                          CUmodule csRankModule) {
+                                          CUmodule csRankModule,
+                                          CUmodule csScaleModule) {
   CUmodule mod = nullptr;
   const char *symbol = nullptr;
   switch (k.kind) {
@@ -645,6 +673,14 @@ static CUfunction resolveOneKernelSymbol(const KernelMeta &k,
       mod = csRankModule;
       symbol = "kun_cs_rank_f64";
       break;
+    case KernelKind::ExtCsScaleF32:
+      mod = csScaleModule;
+      symbol = "kun_cs_scale_f32";
+      break;
+    case KernelKind::ExtCsScaleF64:
+      mod = csScaleModule;
+      symbol = "kun_cs_scale_f64";
+      break;
   }
   CUfunction fn = nullptr;
   checkCu(cuModuleGetFunction(&fn, mod, symbol),
@@ -674,17 +710,24 @@ static void optInExternalSmemMax(const std::vector<KernelMeta> &kernels,
            "cuDeviceGetAttribute(MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)");
   for (size_t i = 0; i < funcs.size(); ++i) {
     if (kernels[i].kind == KernelKind::Jit) continue;
+    int staticSmem = 0;
+    checkCu(cuFuncGetAttribute(&staticSmem,
+                                CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                                funcs[i]),
+             "cuFuncGetAttribute(SHARED_SIZE_BYTES)");
+    int dynamicMax = maxOptIn - staticSmem;
+    if (dynamicMax < 0) dynamicMax = 0;
     checkCu(cuFuncSetAttribute(
                 funcs[i],
                 CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-                maxOptIn),
+                dynamicMax),
              "cuFuncSetAttribute(MAX_DYNAMIC_SHARED_SIZE_BYTES)");
   }
 }
 
-/// Per-kernel-kind I/O arity check.  External cs_rank kernels have a
+/// Per-kernel-kind I/O arity check.  External cross-sectional kernels have a
 /// fixed signature `(T_in, T_out)` — the kernel signature is set in
-/// stone by `kernels/cs_rank.cu`, so we know the wiring is wrong (not
+/// stone by `kernels/cs_*.cu`, so we know the wiring is wrong (not
 /// just unusual) the moment we see any other shape.  Static property
 /// of the graph, so done at construction.
 static void validateKernelIO(const std::vector<KernelMeta> &kernels,
@@ -701,9 +744,12 @@ static void validateKernelIO(const std::vector<KernelMeta> &kernels,
         break;
       case KernelKind::ExtCsRankF32:
       case KernelKind::ExtCsRankF64:
+      case KernelKind::ExtCsScaleF32:
+      case KernelKind::ExtCsScaleF64:
         if (nIn != 1 || nOut != 1)
           throw std::runtime_error(
-              "kun_cuda::Executable: cs_rank kernel '" + k.kernelName +
+              "kun_cuda::Executable: external cross-sectional kernel '" +
+              k.kernelName +
               "' must have exactly 1 input and 1 output (have " +
               std::to_string(nIn) + " / " + std::to_string(nOut) + ")");
         break;
@@ -767,12 +813,13 @@ Executable::Executable(ExecutableData &&data) : data_(std::move(data)) {
 
   // ── Load cubin(s) + resolve every kernel symbol ──────────────────
   loadJitCubin(data_, cuModule_);
-  loadCsRankPtxIfNeeded(data_.kernels, csRankModule_);
+  loadExternalCsPtxIfNeeded(data_.kernels, csRankModule_, csScaleModule_);
 
   cuFuncs_.resize(data_.kernels.size(), nullptr);
   for (size_t i = 0; i < data_.kernels.size(); ++i) {
     cuFuncs_[i] = resolveOneKernelSymbol(data_.kernels[i],
-                                          cuModule_, csRankModule_);
+                                          cuModule_, csRankModule_,
+                                          csScaleModule_);
   }
 
   // ── Opt external kernels into the device's full dynamic smem cap ──
@@ -787,6 +834,8 @@ Executable::~Executable() {
     cuModuleUnload(cuModule_);
   if (csRankModule_)
     cuModuleUnload(csRankModule_);
+  if (csScaleModule_)
+    cuModuleUnload(csScaleModule_);
 }
 
 void Executable::freeSlotPool() {
@@ -915,18 +964,18 @@ void Executable::launchOnStream(
                        data_.warpsPerCta, data_.vectorSize,
                        plan.numChunks, argPtrs.data(), stream);
     } else {
-      // External cs_rank argv unchanged: (i32 T, i32 S, ptrs...).  These
-      // kernels are cross-sectional, time-major, and don't multi-chunk
-      // along time — the mask / chunk_size / warmup scalars don't apply.
+      // External cross-sectional argv: (i32 T, i32 S, ptrs...).  These
+      // kernels are time-major and don't multi-chunk along time — the
+      // mask / chunk_size / warmup scalars don't apply.
       std::vector<void *> argPtrs;
       argPtrs.reserve(2 + ptrs.size());
       argPtrs.push_back(&timeLenI32);
       argPtrs.push_back(&numStocksI32);
       for (auto &p : ptrs) argPtrs.push_back(&p);
-      launchExtCsRankKernel(cuFuncs_[kIdx], meta.kind, meta.kernelName,
-                              timeLength, numStocks,
-                              devMaxSmemBytes, smFillFactor, numSMs,
-                              argPtrs.data(), stream);
+      launchExtCsKernel(cuFuncs_[kIdx], meta.kind, meta.kernelName,
+                          timeLength, numStocks,
+                          devMaxSmemBytes, smFillFactor, numSMs,
+                          argPtrs.data(), stream);
     }
   }
 }
diff --git a/mlir/lib/KunCuda/kernels/cs_scale.cu b/mlir/lib/KunCuda/kernels/cs_scale.cu
new file mode 100644
index 0000000..647bc7b
--- /dev/null
+++ b/mlir/lib/KunCuda/kernels/cs_scale.cu
@@ -0,0 +1,110 @@
+// cs_scale.cu — cross-sectional scale kernel, pre-compiled to PTX and
+// embedded into libKunCudaRuntime as a separate CUmodule.
+//
+// Signature matches cs_rank and the executor's external-kernel launch
+// convention:
+//   (i32 time_length, i32 num_stocks, in_ptr, out_ptr)
+//
+// For each timestep:
+//   sum = Σ abs(x_i), ignoring NaNs
+//   out_i = x_i / sum
+// except all-zero valid rows follow the CPU ScaleStocks behavior and
+// produce NaN for zero inputs.
+
+#include <cuda_runtime.h>
+#include <math_constants.h>
+
+extern __shared__ unsigned char kun_cs_scale_smem[];
+
+namespace {
+
+template <typename T>  // NaN value of type T; declared only, the float/double specializations below supply it
+__device__ static inline T kun_nan();
+
+template <>
+__device__ inline float kun_nan<float>() { return CUDART_NAN_F; }
+
+template <>
+__device__ inline double kun_nan<double>() { return CUDART_NAN; }
+
+template <typename T>  // |v| for type T; declared only, the specializations below dispatch to fabsf / fabs
+__device__ static inline T kun_abs(T v);
+
+template <>
+__device__ inline float kun_abs<float>(float v) { return fabsf(v); }
+
+template <>
+__device__ inline double kun_abs<double>(double v) { return fabs(v); }
+
+template <typename T>  // Adds v across the 32 lanes of the calling warp; the complete total ends up in lane 0.
+__device__ static inline T warp_sum(T v) {
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {  // halving offsets: 16, 8, 4, 2, 1
+        v += __shfl_down_sync(0xffffffffu, v, offset);  // full mask: all 32 lanes must reach this call together
+    }
+    return v;  // the caller in this file only consumes lane 0's value
+}
+
+template <typename T>  // Per-timestep scale: out[t][i] = in[t][i] / sum_j |in[t][j]|, with NaN entries skipped in the sum.
+__device__ static void cs_scale_body(const T* __restrict__ in,
+                                     T* __restrict__ out,
+                                     int time_length,
+                                     int num_stocks) {
+    int time_per_cta = (time_length + gridDim.x - 1) / gridDim.x;  // ceil(time_length / gridDim.x)
+    int t0 = blockIdx.x * time_per_cta;  // this block owns the contiguous timestep range [t0, t1)
+    int t1 = t0 + time_per_cta;
+    if (t1 > time_length) t1 = time_length;
+    if (t0 >= t1) return;  // blocks left without timesteps exit as a whole, before any barrier
+
+    T* smem = reinterpret_cast<T*>(kun_cs_scale_smem);  // elements [0, num_stocks): staged copy of the current row
+    T* row_sum = smem + num_stocks;  // one extra T slot after the row, used to hand the row total to all threads
+
+    for (int t = t0; t < t1; ++t) {
+        const T* row_in  = in  + static_cast<size_t>(t) * num_stocks;  // row t; consecutive rows are num_stocks elements apart
+        T*       row_out = out + static_cast<size_t>(t) * num_stocks;
+
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {  // the whole block stages the row
+            smem[i] = row_in[i];
+        }
+        __syncthreads();  // row fully staged before the reduction reads it
+
+        if (threadIdx.x < 32) {  // only threads 0..31 reduce; each walks the row with stride 32
+            int lane = threadIdx.x;
+            T lane_sum = static_cast<T>(0);
+            for (int i = lane; i < num_stocks; i += 32) {
+                T v = smem[i];
+                if (!isnan(v))  // NaN entries contribute nothing to the total
+                    lane_sum += kun_abs(v);
+            }
+            T sum = warp_sum(lane_sum);
+            if (lane == 0)  // lane 0 holds the complete total after warp_sum
+                *row_sum = sum;
+        }
+        __syncthreads();  // make *row_sum visible to every thread of the block
+
+        T sum = *row_sum;
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+            T v = smem[i];
+            row_out[i] = (v == static_cast<T>(0) && sum == static_cast<T>(0))  // zero input in a row whose |x| total is zero
+                             ? kun_nan<T>()
+                             : v / sum;  // a NaN input fails the v == 0 test and stays NaN through the division
+        }
+        __syncthreads();  // finish reading smem / row_sum before the next timestep overwrites them
+    }
+}
+
+} // anonymous namespace
+
+extern "C" __global__  // f32 entry point; the host resolves it by the symbol name "kun_cs_scale_f32"
+void kun_cs_scale_f32(int time_length, int num_stocks,
+                      const float* __restrict__ in,
+                      float* __restrict__ out) {
+    cs_scale_body<float>(in, out, time_length, num_stocks);  // body indexes (num_stocks + 1) floats of dynamic shared memory
+}
+
+extern "C" __global__  // f64 entry point; the host resolves it by the symbol name "kun_cs_scale_f64"
+void kun_cs_scale_f64(int time_length, int num_stocks,
+                      const double* __restrict__ in,
+                      double* __restrict__ out) {
+    cs_scale_body<double>(in, out, time_length, num_stocks);  // body indexes (num_stocks + 1) doubles of dynamic shared memory
+}
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index f4099f0..ceaa02e 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -362,7 +362,8 @@ static nb::dict collectOutputs(
 /// Parse one Python `external_kernels=[...]` entry into a KernelMeta.
 /// Expected dict shape:
 ///   {"name": str, "kind": str, "inputs": [str...], "outputs": [str...]}
-/// where `kind` is one of "cs_rank_f32" / "cs_rank_f64".
+/// where `kind` is one of "cs_rank_f32", "cs_rank_f64",
+/// "cs_scale_f32", or "cs_scale_f64".
 static kun_cuda::KernelMeta parseExternalKernel(nb::handle obj) {
   nb::dict d = nb::cast<nb::dict>(obj);
   kun_cuda::KernelMeta km;
@@ -372,10 +373,15 @@ static kun_cuda::KernelMeta parseExternalKernel(nb::handle obj) {
     km.kind = kun_cuda::KernelKind::ExtCsRankF32;
   else if (kind == "cs_rank_f64")
     km.kind = kun_cuda::KernelKind::ExtCsRankF64;
+  else if (kind == "cs_scale_f32")
+    km.kind = kun_cuda::KernelKind::ExtCsScaleF32;
+  else if (kind == "cs_scale_f64")
+    km.kind = kun_cuda::KernelKind::ExtCsScaleF64;
   else
     throw std::runtime_error(
         "KunMLIR.compile: unknown external kernel kind '" + kind +
-        "' (supported: cs_rank_f32, cs_rank_f64)");
+        "' (supported: cs_rank_f32, cs_rank_f64, "
+        "cs_scale_f32, cs_scale_f64)");
   nb::iterable inputs  = nb::cast<nb::iterable>(d["inputs"]);
   nb::iterable outputs = nb::cast<nb::iterable>(d["outputs"]);
   for (nb::handle n : inputs)
diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index 3219319..bc8c41d 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -8,7 +8,7 @@
 import time
 import os
 from KunQuant.jit import cfake
-from KunQuant.Op import Builder, Input, Output, Scale
+from KunQuant.Op import Builder, Input, Output
 from KunQuant.Stage import Function
 from KunQuant.predefined.Alpha101 import AllData, all_alpha
 from KunQuant.runner import KunRunner as kr
@@ -32,36 +32,6 @@
     cp.cuda.Device(0).use()
     cp.zeros((1,), dtype=cp.float32)
 
-
-_GPU_SKIP_DEP_TYPES = (Scale,)
-
-
-def _depends_on_type(op, dep_types, seen=None):
-    if seen is None:
-        seen = set()
-    if op in seen:
-        return False
-    seen.add(op)
-    if isinstance(op, dep_types):
-        return True
-    return any(_depends_on_type(inp, dep_types, seen) for inp in op.inputs)
-
-
-def _filter_outputs_for_gpu(f: Function) -> None:
-    kept = []
-    dropped = []
-    for op in f.ops:
-        if isinstance(op, Output) and _depends_on_type(op.inputs[0],
-                                                       _GPU_SKIP_DEP_TYPES):
-            dropped.append(op.attrs["name"])
-            continue
-        kept.append(op)
-    if dropped:
-        print(f"[gpu] dropping {len(dropped)} unsupported outputs: "
-              f"{dropped}")
-    f.set_ops(kept)
-
-
 def get_simd_len(avx: str, dtype: str = "float"):
     element_width = 32 if dtype == "float" else 64
     if avx == "avx512":
@@ -552,7 +522,6 @@ def do_compile(avx, keep, tempdir):
         for name, f, kcfg in funclist:
             if name == "alpha_101_stream":
                 continue
-            _filter_outputs_for_gpu(f)
             kcfg = dataclasses.replace(kcfg, input_layout="TS",
                                        output_layout="TS",
                                        blocking_len=1)

From 3092e15a05ae65e801e81d10601e66d90b035763 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Thu, 21 May 2026 20:35:09 -0700
Subject: [PATCH 42/59] alpha101 benchmark

---
 tests/test_alpha101.py | 62 +++++++++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index bc8c41d..796dbfc 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -19,11 +19,19 @@
 _argp = argparse.ArgumentParser(add_help=False)
 _argp.add_argument("action", nargs="?")
 _argp.add_argument("--gpu-arch", default="")
+_argp.add_argument("--benchmode", action="store_true")
+_argp.add_argument("--time", type=int, default=260)
+_argp.add_argument("--num-stocks", type=int, default=64)
+_argp.add_argument("--num-threads", type=int, default=4)
+
 _args, _ = _argp.parse_known_args()
 action = _args.action or ("run_gpu" if _args.gpu_arch else "avx2")
 GPU_ARCH = _args.gpu_arch or ("sm_80" if action == "run_gpu" else "")
 GPU_MODE = bool(GPU_ARCH)
-
+BENCHMODE = _args.benchmode
+TIME = _args.time
+NUM_STOCKS = _args.num_stocks
+NUM_THREADS = _args.num_threads
 if GPU_MODE:
     import cupy as cp
     from KunQuant.jit import KunMLIR as _kr_mlir
@@ -160,16 +168,20 @@ def create_single_thread_executor():
 def create_multi_thread_executor(n):
     return _kr_mlir.Executor() if GPU_MODE else kr.createMultiThreadExecutor(n)
 
-
-def run_graph(executor, modu, inputs, cur_time, length, outputs=None, **kwargs):
+gpu_inputs = None
+def run_graph(executor, benchmode, modu, inputs, cur_time, length, outputs=None, **kwargs):
     if not GPU_MODE:
         return kr.runGraph(executor, modu, inputs, cur_time, length,
                            outputs if outputs is not None else {}, **kwargs)
     if cur_time != 0:
         raise RuntimeError("GPU alpha101 test only supports cur_time=0")
-    gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
+    global gpu_inputs
+    if not benchmode:
+        gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
     ret = executor.runGraph(modu, gpu_inputs, cur_time=cur_time,
                             length=length)
+    if benchmode:
+        return ret
     executor.synchronize()
 
     out_np = {}
@@ -369,18 +381,20 @@ def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ische
     # blocked = TS_STs(inp)
     
     if not ischeck:
-        out = run_graph(executor, modu, my_input, start_time,
+        out = run_graph(executor, False, modu, my_input, start_time,
                         num_time-start_time, outbuffers)
         start = time.time()
         for _ in range(20):
-            out = run_graph(executor, modu, my_input, start_time,
-                            num_time-start_time, outbuffers,
-                            skip_check=True)
+            out = run_graph(executor, True, modu, my_input, start_time,
+                            num_time-start_time, outbuffers)
+        if GPU_MODE:
+            # benchmode launches return unsynchronized; wait before timing.
+            executor.synchronize()
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
-        out = run_graph(executor, modu, my_input, start_time,
+        out = run_graph(executor, False, modu, my_input, start_time,
                         num_time-start_time, outbuffers,
                         num_stocks=num_stock)
         end = time.time()
@@ -453,18 +464,19 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
     # print(ref.alpha001())
     # blocked = TS_STs(inp)
     if not ischeck:
-        out = run_graph(executor, modu, my_input, start_time,
+        out = run_graph(executor, False, modu, my_input, start_time,
                         num_time-start_time, outbuffers)
         start = time.time()
         for _ in range(20):
-            out = run_graph(executor, modu, my_input, start_time,
-                            num_time-start_time, outbuffers,
-                            skip_check=True)
+            out = run_graph(executor, True, modu, my_input, start_time,
+                            num_time-start_time, outbuffers)
+        if GPU_MODE:
+            executor.synchronize()
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
-        out = run_graph(executor, modu, my_input, start_time,
+        out = run_graph(executor, False, modu, my_input, start_time,
                         num_time-start_time, outbuffers)
         end = time.time()
         tdiff = end-start
@@ -482,7 +494,7 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
 def main(is64: bool, is_check: bool):
     modu = lib.getModule("alpha_101" if not is64 else "alpha_101_double")
     start_window = modu.getOutputUnreliableCount()
-    num_stock = 64
+    num_stock = NUM_STOCKS
     done = True
     testfunc = test64 if is64 else test
     blocking_num = 1 if GPU_MODE else modu.blocking_len
@@ -490,15 +502,15 @@ def main(is64: bool, is_check: bool):
     blocking = 0 if is64 or GPU_MODE else blocking_num
     def compute():
         nonlocal done
-        num_time = 260
+        num_time = TIME
         my_input, pd_ref = make_data_and_ref(num_stock, num_time, is_check, blocking, "float64" if is64 else "float32")
-        executor = create_single_thread_executor()
-        done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
-        done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
         if not GPU_MODE:
-            executor = create_multi_thread_executor(4)
+            executor = create_single_thread_executor()
             done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
-    num_stock = 64
+            if not BENCHMODE:
+                done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
+        executor = create_multi_thread_executor(NUM_THREADS)
+        done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
     compute()
     # skip benchmarking on unaligned mode
     if not is_check:
@@ -542,15 +554,15 @@ def do_compile(avx, keep, tempdir):
     lib = do_compile(action, False, None)
 
 print("Check f64 batch")
-main(True, True)
+main(True, not BENCHMODE)
 print("======================================")
 print("Check f32 batch")
-main(False, True)
-if not GPU_MODE:
+main(False, not BENCHMODE)
+if not GPU_MODE and not BENCHMODE:
     print("======================================")
     print("Check f32 stream")
     streammain(64)
-if not GPU_MODE and action != "run_avx512" and isx86:
+if not GPU_MODE and action != "run_avx512" and isx86 and not BENCHMODE:
     print("======================================")
     print("Check f32 stream unaligned")
     streammain(63)

From 8e7d36197d887d39536090d8d5035e2612976bc0 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Fri, 22 May 2026 02:38:18 -0700
Subject: [PATCH 43/59] cuda graph

---
 mlir/include/KunCuda/Runtime.h        |  33 +-
 mlir/lib/KunCuda/CMakeLists.txt       |   5 +-
 mlir/lib/KunCuda/Runtime.cpp          | 518 +++++++++++++++++---------
 mlir/lib/KunCuda/RuntimeCudaGraph.cpp | 375 +++++++++++++++++++
 mlir/lib/KunCuda/RuntimeUtil.h        | 153 ++++++++
 mlir/lib/Python/MlirBinding.cpp       |  11 +-
 mlir/test/python/test_multi_kernel.py |   9 +-
 tests/test_alpha101.py                |   7 +-
 8 files changed, 930 insertions(+), 181 deletions(-)
 create mode 100644 mlir/lib/KunCuda/RuntimeCudaGraph.cpp
 create mode 100644 mlir/lib/KunCuda/RuntimeUtil.h

diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index d5d8ea8..1278f6e 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -54,6 +54,10 @@ namespace kun_cuda {
 /// producer maps, etc.  Fully defined in Runtime.cpp.
 struct GraphPlan;
 
+/// Internal: CUDA Graph mode state.  Kept behind a pointer because normal
+/// launch mode does not need any graph objects.
+struct CudaGraphLaunchState;
+
 /// Forward-declared so `Executable::launchOnStream` can take an
 /// `Executor *` argument; the full definition lives below.
 class Executor;
@@ -85,6 +89,14 @@ enum class Datatype : int32_t {
   Double = 1,   ///< f64 — 8 bytes/elem
 };
 
+/// Runtime launch backend.  Normal queues kernels one by one on the stream.
+/// CudaGraph builds a CUDA Graph node DAG and uses graph memory allocation
+/// nodes for intermediate buffers.
+enum class LaunchMode : int32_t {
+  Normal    = 0,
+  CudaGraph = 1,
+};
+
 inline size_t bytesPerElem(Datatype dt) noexcept {
   return dt == Datatype::Double ? 8u : 4u;
 }
@@ -201,8 +213,10 @@ class Executable {
   ///
   /// `args` keys must equal `graphInputs ++ graphOutputs` (order
   /// doesn't matter; the runtime hashes them into the buffer table).
-  /// Intermediate buffers are owned by the executable and reused across
-  /// launches with matching `(timeLength, numStocks)`.
+  /// In normal mode, intermediate buffers are owned by the executable and
+  /// reused across launches with matching `(timeLength, numStocks)`.  In CUDA
+  /// Graph mode, intermediates come from graph allocation nodes and are
+  /// released by free nodes placed after their last consumers.
   ///
   /// Grid configuration (per kernel — identical because warps_per_cta
   /// and vector_size are graph-wide):
@@ -242,7 +256,8 @@ class Executable {
                        const std::vector<std::pair<std::string, uintptr_t>> &args,
                        int64_t mask = 0,
                        int minChunkWarmupFactor = 4,
-                       double smFillFactor = 1.5);
+                       double smFillFactor = 1.5,
+                       LaunchMode mode = LaunchMode::Normal);
 
 private:
   /// Allocate (or re-allocate, if shape changed) the intermediate slot
@@ -250,9 +265,18 @@ class Executable {
   void ensureSlotPool(int64_t timeLength, int64_t numStocks);
   /// Free all slot allocations.  Called from dtor and on shape change.
   void freeSlotPool();
+  void launchCudaGraphOnStream(
+      Executor *exec,
+      int64_t timeLength, int64_t numStocks,
+      const std::vector<std::pair<std::string, uintptr_t>> &args,
+      int64_t mask,
+      int minChunkWarmupFactor,
+      double smFillFactor);
+  void resetCudaGraphState() noexcept;
 
   ExecutableData data_;
   std::unique_ptr<GraphPlan> plan_;          ///< pImpl — defined in Runtime.cpp
+  std::unique_ptr<CudaGraphLaunchState> cudaGraphState_;
 
   CUmodule cuModule_ = nullptr;
   /// Modules holding pre-compiled cross-sectional PTX.  Loaded at
@@ -317,7 +341,8 @@ class Executor {
                 const std::vector<std::pair<std::string, uintptr_t>> &args,
                 int64_t mask = 0,
                 int minChunkWarmupFactor = 4,
-                double smFillFactor = 1.5);
+                double smFillFactor = 1.5,
+                LaunchMode mode = LaunchMode::Normal);
 
   /// Block until all queued work on this stream completes.
   void synchronize();
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index 2140fe3..9a3d8a7 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -93,7 +93,10 @@ foreach(_cu IN LISTS _kun_cu_sources)
   kun_add_bundled_ptx_kernel("${_cu}")
 endforeach()
 
-add_library(KunCudaRuntime SHARED Runtime.cpp ${_kun_ptx_embed_includes})
+add_library(KunCudaRuntime SHARED
+    Runtime.cpp
+    RuntimeCudaGraph.cpp
+    ${_kun_ptx_embed_includes})
 
 # Project-wide compile flags set -fvisibility=hidden + inlines-hidden to
 # minimise the size of MLIR static libs.  This shared runtime needs to
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index 081739d..dfeb071 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -11,13 +11,14 @@
 //   topoSort             — Kahn's algorithm; rejects cycles
 //   planSlots            — refcount + LIFO free pool over the topo order
 //
-// All helpers live in this file's anonymous namespace.  Future
-// CUDA-graph support reuses the same plan: `kernelInputBufs` +
-// `producerKernel` are exactly the dep edges cuGraph needs.
+// The shared launch helpers live behind RuntimeUtil.h so the traditional
+// launcher and RuntimeCudaGraph.cpp use the same validation, buffer-pointer
+// resolution, chunk planning, and kernel argument construction.
 //
 //===----------------------------------------------------------------------===//
 
 #include "KunCuda/Runtime.h"
+#include "RuntimeUtil.h"
 
 #include <cuda.h>
 
@@ -27,6 +28,7 @@
 #include <sstream>
 #include <stdexcept>
 #include <unordered_map>
+#include <utility>
 
 // Pre-compiled cross-sectional PTX, embedded by EmbedFile.cmake.
 #include "cs_rank_ptx.inc"
@@ -34,45 +36,6 @@
 
 namespace kun_cuda {
 
-//===----------------------------------------------------------------------===//
-// GraphPlan — pImpl payload, hidden from the public header
-//===----------------------------------------------------------------------===//
-
-/// Runtime-resolved schedule + memory plan.  All buffer references here
-/// are integer indices into the flat buffer table.  Storing
-/// `producerKernel` makes it cheap to re-derive kernel-to-kernel
-/// dependency edges (needed for future cuGraph support: kernel K's deps
-/// = {producerKernel[b] for b in kernelInputBufs[K], filtered to ≥ 0}).
-struct GraphPlan {
-  int numBuffers       = 0;
-  int numGraphInputs   = 0;
-  int numGraphOutputs  = 0;
-
-  // Name → index for the user-facing args dict.  Other lookups happen
-  // by integer indexing.
-  std::unordered_map<std::string, int> graphInputIdx;
-  std::unordered_map<std::string, int> graphOutputIdx;
-
-  // Per-kernel I/O resolved to buffer indices.  Parallel to ExecutableData::kernels.
-  std::vector<std::vector<int>> kernelInputBufs;
-  std::vector<std::vector<int>> kernelOutputBufs;
-
-  // producerKernel[bufIdx] = kernel that writes the buffer, or -1 if
-  // the buffer is a graph input.
-  std::vector<int> producerKernel;
-
-  // Topo order — a single valid linearization for the v0 single-stream
-  // launcher.
-  std::vector<int> launchOrder;
-
-  // Slot assignment: one entry per buffer index.  -1 if the buffer is a
-  // graph input/output; otherwise a slot index in [0, peakIntermediateSlots).
-  std::vector<int> intermediateBufToSlot;
-  int peakIntermediateSlots = 0;
-};
-
-namespace {
-
 //===----------------------------------------------------------------------===//
 // CUDA driver helpers
 //===----------------------------------------------------------------------===//
@@ -95,6 +58,8 @@ std::string joinNames(const std::vector<std::string> &v) {
   return r;
 }
 
+namespace {
+
 //===----------------------------------------------------------------------===//
 // Plan-building helpers — small POD intermediates so each helper is
 // independent and trivially testable.
@@ -353,20 +318,125 @@ SlotPlan planSlots(const std::vector<int> &launchOrder,
   return plan;
 }
 
+} // namespace
+
 //===----------------------------------------------------------------------===//
 // Launch helpers — pure functions used by launchOnStream below.
 //===----------------------------------------------------------------------===//
 
-/// Translate the user-supplied {name → device_ptr} args dict into a
-/// flat buffer-index → pointer array, plug in the executable-owned
-/// intermediate-slot pointers, and verify every graph_input /
-/// graph_output the plan expects was provided.  Throws on unknown or
-/// missing names.
-static std::vector<uintptr_t> resolveBufferPointers(
+KernelLaunchDesc::KernelLaunchDesc(  // full-state ctor; shape starts at 1x1x1
+    int kernelIndex, KernelKind kind, CUfunction fn, bool isKernelNode,
+    int32_t timeLenI32, int32_t numStocksI32, int32_t maskI32,
+    int32_t chunkSizeI32, int32_t warmupI32,
+    std::vector<CUdeviceptr> ptrs)
+    : kernelIndex(kernelIndex),
+      kind(kind),
+      isKernelNode(isKernelNode),
+      timeLenI32(timeLenI32),
+      numStocksI32(numStocksI32),
+      maskI32(maskI32),
+      chunkSizeI32(chunkSizeI32),
+      warmupI32(warmupI32),
+      ptrs_(std::move(ptrs)) {
+  params = {};  // clear every field before filling in the ones set below
+  params.func = fn;
+  params.gridDimX = 1;
+  params.gridDimY = 1;
+  params.gridDimZ = 1;
+  params.blockDimX = 1;
+  params.blockDimY = 1;
+  params.blockDimZ = 1;
+  params.sharedMemBytes = 0;
+  rebuildKernelParamPointers();  // sets kernelParams / extra from the members
+}
+
+KernelLaunchDesc::KernelLaunchDesc(KernelLaunchDesc &&other) noexcept
+    : kernelIndex(other.kernelIndex),
+      kind(other.kind),
+      isKernelNode(other.isKernelNode),
+      timeLenI32(other.timeLenI32),
+      numStocksI32(other.numStocksI32),
+      maskI32(other.maskI32),
+      chunkSizeI32(other.chunkSizeI32),
+      warmupI32(other.warmupI32),
+      params(other.params),  // copied kernelParams still points into `other`
+      ptrs_(std::move(other.ptrs_)) {
+  rebuildKernelParamPointers();  // re-point argPtrs_ at this object's fields
+}
+
+KernelLaunchDesc &
+KernelLaunchDesc::operator=(KernelLaunchDesc &&other) noexcept {
+  if (this == &other)  // self-move: nothing to do
+    return *this;
+  kernelIndex = other.kernelIndex;
+  kind = other.kind;
+  isKernelNode = other.isKernelNode;
+  timeLenI32 = other.timeLenI32;
+  numStocksI32 = other.numStocksI32;
+  maskI32 = other.maskI32;
+  chunkSizeI32 = other.chunkSizeI32;
+  warmupI32 = other.warmupI32;
+  params = other.params;  // copied kernelParams still points into `other`
+  ptrs_ = std::move(other.ptrs_);
+  rebuildKernelParamPointers();  // re-point argPtrs_ at this object's fields
+  return *this;
+}
+
+void KernelLaunchDesc::rebuildKernelParamPointers() {  // refresh argPtrs_
+  argPtrs_.clear();
+  params.kernelParams = nullptr;
+  params.extra = nullptr;
+  if (!isKernelNode)  // non-kernel descs keep kernelParams == nullptr
+    return;
+  // argPtrs_ holds addresses of this object's own fields, scalars first.
+  if (kind == KernelKind::Jit) {  // Jit: five i32 scalars, then buffers
+    argPtrs_.reserve(5 + ptrs_.size());
+    argPtrs_.push_back(&timeLenI32);
+    argPtrs_.push_back(&numStocksI32);
+    argPtrs_.push_back(&maskI32);
+    argPtrs_.push_back(&chunkSizeI32);
+    argPtrs_.push_back(&warmupI32);
+  } else {  // every other kind: (timeLen, numStocks), then buffers
+    argPtrs_.reserve(2 + ptrs_.size());
+    argPtrs_.push_back(&timeLenI32);
+    argPtrs_.push_back(&numStocksI32);
+  }
+  for (auto &p : ptrs_)  // one entry per buffer pointer, in ptrs_ order
+    argPtrs_.push_back(&p);
+  params.kernelParams = argPtrs_.data();
+}
+
+int firstIntermediateBuffer(const GraphPlan &plan) noexcept {
+  return plan.numGraphInputs + plan.numGraphOutputs;  // first index past graph inputs + outputs
+}
+
+void validateLaunchInputs(const ExecutableData &data,
+                          int64_t timeLength, int64_t numStocks,
+                          int64_t mask) {  // throws std::runtime_error on any violation
+  if (timeLength > std::numeric_limits<int32_t>::max() ||  // both must fit in i32
+      numStocks  > std::numeric_limits<int32_t>::max() ||
+      timeLength < 0 || numStocks < 0)
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: time_length / num_stocks out of i32 "
+        "range (kernel signature uses i32, i32)");
+  if (mask < 0 || (timeLength > 0 && mask >= timeLength))  // upper bound only applies when timeLength > 0
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: mask must be in [0, time_length), got "
+        + std::to_string(mask) + " for time_length="
+        + std::to_string(timeLength));
+  if (data.warpsPerCta <= 0)  // must be strictly positive
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: warps_per_cta is " +
+        std::to_string(data.warpsPerCta));
+}
+
+/// Translate the user-supplied {name → device_ptr} args dict into a flat
+/// buffer-index → pointer array for graph inputs/outputs.  Intermediate slots
+/// are left as 0 and filled by the caller.
+std::vector<uintptr_t> resolveExternalBufferPointers(
     const GraphPlan &plan,
     const ExecutableData &data,
-    const std::vector<std::pair<std::string, uintptr_t>> &args,
-    const std::vector<uintptr_t> &slotBufs) {
+    const std::vector<std::pair<std::string, uintptr_t>> &args) {
   std::vector<uintptr_t> bufPtrs(plan.numBuffers, 0);
   std::vector<bool>      filled(plan.numBuffers, false);
 
@@ -398,35 +468,53 @@ static std::vector<uintptr_t> resolveBufferPointers(
         "kun_cuda::launchOnStream: missing argument '" + missing + "'");
   }
 
+  return bufPtrs;
+}
+
+/// Translate the user-supplied {name → device_ptr} args dict into a
+/// flat buffer-index → pointer array, plug in the executable-owned
+/// intermediate-slot pointers, and verify every graph_input /
+/// graph_output the plan expects was provided.  Throws on unknown or
+/// missing names.
+std::vector<uintptr_t> resolveBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    const std::vector<uintptr_t> &slotBufs) {
+  std::vector<uintptr_t> bufPtrs =
+      resolveExternalBufferPointers(plan, data, args);
+
   // Intermediates: index into the pre-allocated slot pool.
-  for (int i = plan.numGraphInputs + plan.numGraphOutputs;
-        i < plan.numBuffers; ++i) {
+  for (int i = firstIntermediateBuffer(plan); i < plan.numBuffers; ++i) {
     int slot = plan.intermediateBufToSlot[i];
     bufPtrs[i] = slotBufs[slot];
   }
   return bufPtrs;
 }
 
-/// Stock-major × time-chunk launch: block_x = warps_per_cta*32,
-/// grid_x = ceil(numStocks / (block_x * vector_size)),
-/// grid_y = numChunks, no dynamic smem.
-static void launchJitKernel(CUfunction fn,
-                              int64_t numStocks,
-                              int64_t warpsPerCta, int64_t vectorSize,
-                              unsigned numChunks,
-                              void **args, CUstream stream) {
-  unsigned blockX = static_cast<unsigned>(warpsPerCta * 32);
+static void resetLaunchShape(KernelLaunchDesc &desc) {  // back to 1x1x1 grid/block, no shared memory
+  desc.params.gridDimX = 1;
+  desc.params.gridDimY = 1;
+  desc.params.gridDimZ = 1;
+  desc.params.blockDimX = 1;
+  desc.params.blockDimY = 1;
+  desc.params.blockDimZ = 1;
+  desc.params.sharedMemBytes = 0;
+}
+
+static void computeJitLaunchShape(KernelLaunchDesc &desc,
+                                  int64_t numStocks,
+                                  int64_t warpsPerCta,
+                                  int64_t vectorSize,
+                                  unsigned numChunks) {
+  desc.params.blockDimX = static_cast<unsigned>(warpsPerCta * 32);
   uint64_t stocksPerBlock =
-      static_cast<uint64_t>(blockX) * static_cast<uint64_t>(vectorSize);
-  unsigned gridX = static_cast<unsigned>(
+      static_cast<uint64_t>(desc.params.blockDimX) *
+      static_cast<uint64_t>(vectorSize);
+  desc.params.gridDimX = static_cast<unsigned>(
       (static_cast<uint64_t>(numStocks) + stocksPerBlock - 1) /
       stocksPerBlock);
-  // sharedMemBytes = 0 — JIT'd kernels declare static smem via
-  // llvm.mlir.global addr_space=3; the dynamic-smem launch parameter
-  // does not apply.
-  checkCu(cuLaunchKernel(fn, gridX, numChunks, 1, blockX, 1, 1,
-                           /*sharedMemBytes=*/0, stream, args, nullptr),
-           "cuLaunchKernel");
+  desc.params.gridDimY = numChunks;
 }
 
 /// Chunk plan for a single JIT kernel.  `chunkSize` is the time-axis
@@ -454,15 +542,11 @@ static void launchJitKernel(CUfunction fn,
 /// When both unreliable == 0 and mask == 0, the only cap is T itself.
 /// When numSMs == 0 (Executor couldn't query the device) or
 /// smFillFactor ≤ 0, fall back to single-chunk.
-struct ChunkPlan {
-  int64_t chunkSize;
-  unsigned numChunks;
-};
-static ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
-                                     int64_t warpsPerCta, int64_t vectorSize,
-                                     int64_t unreliableCount, int64_t mask,
-                                     int minChunkWarmupFactor,
-                                     double smFillFactor, int numSMs) {
+ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
+                           int64_t warpsPerCta, int64_t vectorSize,
+                           int64_t unreliableCount, int64_t mask,
+                           int minChunkWarmupFactor,
+                           double smFillFactor, int numSMs) {
   if (timeLength <= 0)
     return {timeLength, 1u};
   if (numSMs <= 0 || smFillFactor <= 0.0)
@@ -550,12 +634,14 @@ static bool isCsScaleKind(KernelKind kind) {
          kind == KernelKind::ExtCsScaleF64;
 }
 
-static void launchExtCsKernel(CUfunction fn, KernelKind kind,
-                                const std::string &kernelName,
-                                int64_t timeLength, int64_t numStocks,
-                                int devMaxSmemBytes,
-                                double smFillFactor, int numSMs,
-                                void **args, CUstream stream) {
+static void computeExtCsLaunchShape(KernelLaunchDesc &desc,
+                                    KernelKind kind,
+                                    const std::string &kernelName,
+                                    int64_t timeLength,
+                                    int64_t numStocks,
+                                    int devMaxSmemBytes,
+                                    double smFillFactor,
+                                    int numSMs) {
   size_t elemSize = isF64ExternalKind(kind) ? 8u : 4u;
   uint64_t smemElems = static_cast<uint64_t>(numStocks);
   if (isCsScaleKind(kind))
@@ -587,26 +673,171 @@ static void launchExtCsKernel(CUfunction fn, KernelKind kind,
   int64_t blockX64 =
       ((std::max<int64_t>(numStocks, 1) + kWarp - 1) / kWarp) * kWarp;
   if (blockX64 > kMaxBlock) blockX64 = kMaxBlock;
-  unsigned blockX = static_cast<unsigned>(blockX64);
+  desc.params.blockDimX = static_cast<unsigned>(blockX64);
 
   // Target gridX = sm_fill_factor * numSMs (capped at timeLength so we
   // never launch idle CTAs).  numSMs == 0 (device query failed) →
   // gridX = timeLength, one CTA per timestep.
-  unsigned gridX;
   if (numSMs > 0 && smFillFactor > 0.0) {
     int64_t target = static_cast<int64_t>(
         std::ceil(smFillFactor * static_cast<double>(numSMs)));
     if (target < 1) target = 1;
     if (target > timeLength) target = timeLength;
-    gridX = static_cast<unsigned>(target);
+    desc.params.gridDimX = static_cast<unsigned>(target);
   } else {
-    gridX = static_cast<unsigned>(timeLength);
+    desc.params.gridDimX = static_cast<unsigned>(timeLength);
+  }
+  desc.params.sharedMemBytes = static_cast<unsigned>(smemBytes64);
+}
+
+static std::pair<bool, bool>  // returns {changed, sizeChanged}
+updateKernelArgPtrs(std::vector<CUdeviceptr> &ptrs,
+                    const std::vector<int> &ins,
+                    const std::vector<int> &outs,
+                    const std::vector<uintptr_t> &bufPtrs) {
+  const size_t numPtrs = ins.size() + outs.size();
+  const bool sizeChanged = ptrs.size() != numPtrs;
+  bool changed = sizeChanged;
+  if (!changed) {  // same size: compare element-wise, inputs then outputs
+    size_t argIdx = 0;
+    for (int b : ins) {
+      if (ptrs[argIdx++] != static_cast<CUdeviceptr>(bufPtrs[b])) {
+        changed = true;
+        break;
+      }
+    }
+    if (!changed) {
+      for (int b : outs) {
+        if (ptrs[argIdx++] != static_cast<CUdeviceptr>(bufPtrs[b])) {
+          changed = true;
+          break;
+        }
+      }
+    }
+  }
+  if (!changed)
+    return {};  // {false, false}: ptrs is left untouched
+
+  ptrs.resize(numPtrs);
+  size_t argIdx = 0;  // layout: every input pointer first, then every output
+  for (int b : ins)
+    ptrs[argIdx++] = static_cast<CUdeviceptr>(bufPtrs[b]);
+  for (int b : outs)
+    ptrs[argIdx++] = static_cast<CUdeviceptr>(bufPtrs[b]);
+  return {true, sizeChanged};
+}
+
+bool KernelLaunchDesc::updateBuffer(  // returns true if any pointer value changed
+    const GraphPlan &plan,
+    int kIdx,
+    const std::vector<uintptr_t> &bufPtrs) {
+  const auto &ins = plan.kernelInputBufs[kIdx];
+  const auto &outs = plan.kernelOutputBufs[kIdx];
+  const size_t numPtrs = ins.size() + outs.size();
+  if (ptrs_.size() != numPtrs)  // a count change is an error here, not a resize
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream(cuda_graph): kernel buffer argument count "
+        "changed without graph rebuild");
+  auto [changed, reallocated] =
+      updateKernelArgPtrs(ptrs_, ins, outs, bufPtrs);
+  (void)reallocated;  // always false: the sizes were checked equal above
+  return changed;
+}
+
+/// Re-target this descriptor at kernel `kIdx`: refresh scalars, buffer
+/// pointers, chunk plan and launch shape, then rebuild kernelParams.  Non-JIT
+/// kernels become no-ops (isKernelNode == false) when timeLength is not > 0.
+void KernelLaunchDesc::update(
+    const GraphPlan &plan, const ExecutableData &data,
+    const std::vector<CUfunction> &cuFuncs, int kIdx,
+    const CudaGraphLaunchParams &launch) {
+  const auto &ins  = plan.kernelInputBufs[kIdx];
+  const auto &outs = plan.kernelOutputBufs[kIdx];
+  const auto &meta = data.kernels[kIdx];
+
+  kernelIndex = kIdx;
+  kind = meta.kind;
+  isKernelNode = meta.kind == KernelKind::Jit || launch.timeLength > 0;
+  timeLenI32 = static_cast<int32_t>(launch.timeLength);
+  numStocksI32 = static_cast<int32_t>(launch.numStocks);
+  maskI32 = static_cast<int32_t>(launch.mask);
+  params.func = cuFuncs[kIdx];
+  updateKernelArgPtrs(ptrs_, ins, outs, launch.bufPtrs);
+
+  unsigned numChunks = 1;
+  chunkSizeI32 = 0;
+  warmupI32 = 0;
+  if (meta.kind == KernelKind::Jit) {
+    ChunkPlan cp = computeChunkPlan(
+        launch.timeLength, launch.numStocks, data.warpsPerCta,
+        data.vectorSize, meta.unreliableCount, launch.mask,
+        launch.minChunkWarmupFactor, launch.smFillFactor, launch.numSMs);
+    numChunks = cp.numChunks;
+    chunkSizeI32 = static_cast<int32_t>(cp.chunkSize);
+    warmupI32 = static_cast<int32_t>(
+        std::max<int64_t>(meta.unreliableCount, 0));
+  }
+
+  resetLaunchShape(*this);
+  if (meta.kind == KernelKind::Jit) {
+    computeJitLaunchShape(*this, launch.numStocks, data.warpsPerCta,
+                          data.vectorSize, numChunks);
+  } else if (isKernelNode) {
+    computeExtCsLaunchShape(*this, meta.kind, meta.kernelName,
+                            launch.timeLength, launch.numStocks,
+                            launch.devMaxSmemBytes,
+                            launch.smFillFactor, launch.numSMs);
+  }
+
+  // Rebuild unconditionally: the table layout depends on `kind` and
+  // `isKernelNode`, which can change even when the pointer count does not.
+  rebuildKernelParamPointers();
+}
+
+static std::vector<KernelLaunchDesc> buildKernelLaunchDescs(  // one desc per kernel, in plan.launchOrder
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<CUfunction> &cuFuncs,
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<uintptr_t> &bufPtrs,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor,
+    int devMaxSmemBytes,
+    int numSMs) {
+  CudaGraphLaunchParams launch;  // bundle the per-launch values for update()
+  launch.timeLength = timeLength;
+  launch.numStocks = numStocks;
+  launch.mask = mask;
+  launch.minChunkWarmupFactor = minChunkWarmupFactor;
+  launch.smFillFactor = smFillFactor;
+  launch.devMaxSmemBytes = devMaxSmemBytes;
+  launch.numSMs = numSMs;
+  launch.bufPtrs = bufPtrs;
+
+  std::vector<KernelLaunchDesc> descs;
+  descs.reserve(plan.launchOrder.size());
+  for (int kIdx : plan.launchOrder) {
+    KernelLaunchDesc desc;
+    desc.update(plan, data, cuFuncs, kIdx, launch);
+    descs.emplace_back(std::move(desc));  // move ctor rebuilds the param pointers
   }
+  return descs;
+}
 
-  unsigned smemBytes = static_cast<unsigned>(smemBytes64);
-  checkCu(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1,
-                           smemBytes, stream, args, nullptr),
-           "cuLaunchKernel(external_cs)");
+void launchKernelDesc(const KernelLaunchDesc &desc, CUstream stream) {
+  if (!desc.isKernelNode)  // nothing to launch for this descriptor
+    return;
+  const char *what = desc.kind == KernelKind::Jit  // label handed to checkCu
+                         ? "cuLaunchKernel"
+                         : "cuLaunchKernel(external_cs)";
+  const CUDA_KERNEL_NODE_PARAMS &p = desc.params;
+  checkCu(cuLaunchKernel(p.func,  // launch with the shape and args stored in desc.params
+                         p.gridDimX, p.gridDimY, p.gridDimZ,
+                         p.blockDimX, p.blockDimY, p.blockDimZ,
+                         p.sharedMemBytes, stream,
+                         p.kernelParams, p.extra),
+          what);
 }
 
 //===----------------------------------------------------------------------===//
@@ -614,6 +845,8 @@ static void launchExtCsKernel(CUfunction fn, KernelKind kind,
 // the CUmodule and CUfunction handles the ctor is populating.
 //===----------------------------------------------------------------------===//
 
+namespace {
+
 /// Load the JIT'd cubin if non-empty; otherwise sanity-check that no
 /// kernel actually needs it (every `kind == Jit` requires a cubin).
 static void loadJitCubin(const ExecutableData &data, CUmodule &outModule) {
@@ -829,6 +1062,7 @@ Executable::Executable(ExecutableData &&data) : data_(std::move(data)) {
 Executable::~Executable() {
   // Best-effort cleanup; we deliberately don't propagate driver errors
   // out of a destructor.
+  resetCudaGraphState();
   freeSlotPool();
   if (cuModule_)
     cuModuleUnload(cuModule_);
@@ -887,29 +1121,23 @@ void Executable::launchOnStream(
     const std::vector<std::pair<std::string, uintptr_t>> &args,
     int64_t mask,
     int minChunkWarmupFactor,
-    double smFillFactor) {
+    double smFillFactor,
+    LaunchMode mode) {
   if (!exec)
     throw std::runtime_error(
         "kun_cuda::launchOnStream: Executor pointer is null");
+
+  validateLaunchInputs(data_, timeLength, numStocks, mask);
+
+  if (mode == LaunchMode::CudaGraph) {
+    launchCudaGraphOnStream(exec, timeLength, numStocks, args,
+                            mask, minChunkWarmupFactor, smFillFactor);
+    return;
+  }
+
   CUstream stream      = exec->stream();
   int devMaxSmemBytes  = exec->devMaxSmemBytes();
   int numSMs           = exec->numSMs();
-  // ── Shape sanity (kernel signature is i32 across the board) ─────
-  if (timeLength > std::numeric_limits<int32_t>::max() ||
-      numStocks  > std::numeric_limits<int32_t>::max() ||
-      timeLength < 0 || numStocks < 0)
-    throw std::runtime_error(
-        "kun_cuda::launchOnStream: time_length / num_stocks out of i32 "
-        "range (kernel signature uses i32, i32)");
-  if (mask < 0 || (timeLength > 0 && mask >= timeLength))
-    throw std::runtime_error(
-        "kun_cuda::launchOnStream: mask must be in [0, time_length), got "
-        + std::to_string(mask) + " for time_length="
-        + std::to_string(timeLength));
-  if (data_.warpsPerCta <= 0)
-    throw std::runtime_error(
-        "kun_cuda::launchOnStream: warps_per_cta is " +
-        std::to_string(data_.warpsPerCta));
 
   // ── Grow / reuse the intermediate slot pool for this shape ───────
   ensureSlotPool(timeLength, numStocks);
@@ -918,66 +1146,13 @@ void Executable::launchOnStream(
   const std::vector<uintptr_t> bufPtrs =
       resolveBufferPointers(*plan_, data_, args, slotBufs_);
 
-  // ── Per-launch i32 scalars.  time_length / num_stocks / mask are
-  //    shared across every kernel; chunk_size / warmup vary per kernel
-  //    (chunk_size is derived from per-kernel unreliableCount). ──────
-  int32_t timeLenI32   = static_cast<int32_t>(timeLength);
-  int32_t numStocksI32 = static_cast<int32_t>(numStocks);
-  int32_t maskI32      = static_cast<int32_t>(mask);
-
-  for (int kIdx : plan_->launchOrder) {
-    const auto &ins  = plan_->kernelInputBufs[kIdx];
-    const auto &outs = plan_->kernelOutputBufs[kIdx];
-    const auto &meta = data_.kernels[kIdx];
-
-    std::vector<CUdeviceptr> ptrs;
-    ptrs.reserve(ins.size() + outs.size());
-    for (int b : ins)  ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
-    for (int b : outs) ptrs.push_back(static_cast<CUdeviceptr>(bufPtrs[b]));
-
-    if (meta.kind == KernelKind::Jit) {
-      // JIT argv: (i32 T, i32 S, i32 mask, i32 chunk_size, i32 warmup,
-      //            ptrs...).  Chunk plan is per-kernel because each
-      //            kernel has its own unreliableCount.
-      ChunkPlan plan = computeChunkPlan(
-          timeLength, numStocks, data_.warpsPerCta, data_.vectorSize,
-          meta.unreliableCount, mask, minChunkWarmupFactor,
-          smFillFactor, numSMs);
-      int32_t chunkSizeI32 = static_cast<int32_t>(plan.chunkSize);
-      // -1 sentinel (whole-time) means single chunk; the kernel's
-      // chunk-0 branch never reads the warmup arg in that case, but we
-      // still clamp to 0 so a stray load never observes a negative
-      // value in any future path.
-      int32_t warmupI32    = static_cast<int32_t>(
-          std::max<int64_t>(meta.unreliableCount, 0));
-
-      std::vector<void *> argPtrs;
-      argPtrs.reserve(5 + ptrs.size());
-      argPtrs.push_back(&timeLenI32);
-      argPtrs.push_back(&numStocksI32);
-      argPtrs.push_back(&maskI32);
-      argPtrs.push_back(&chunkSizeI32);
-      argPtrs.push_back(&warmupI32);
-      for (auto &p : ptrs) argPtrs.push_back(&p);
-
-      launchJitKernel(cuFuncs_[kIdx], numStocks,
-                       data_.warpsPerCta, data_.vectorSize,
-                       plan.numChunks, argPtrs.data(), stream);
-    } else {
-      // External cross-sectional argv: (i32 T, i32 S, ptrs...).  These
-      // kernels are time-major and don't multi-chunk along time — the
-      // mask / chunk_size / warmup scalars don't apply.
-      std::vector<void *> argPtrs;
-      argPtrs.reserve(2 + ptrs.size());
-      argPtrs.push_back(&timeLenI32);
-      argPtrs.push_back(&numStocksI32);
-      for (auto &p : ptrs) argPtrs.push_back(&p);
-      launchExtCsKernel(cuFuncs_[kIdx], meta.kind, meta.kernelName,
-                          timeLength, numStocks,
-                          devMaxSmemBytes, smFillFactor, numSMs,
-                          argPtrs.data(), stream);
-    }
-  }
+  std::vector<KernelLaunchDesc> descs =
+      buildKernelLaunchDescs(*plan_, data_, cuFuncs_,
+                             timeLength, numStocks, bufPtrs,
+                             mask, minChunkWarmupFactor, smFillFactor,
+                             devMaxSmemBytes, numSMs);
+  for (const auto &desc : descs)
+    launchKernelDesc(desc, stream);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1013,9 +1188,10 @@ Executor::~Executor() = default;
 void Executor::runGraph(
     Executable &exe, int64_t timeLength, int64_t numStocks,
     const std::vector<std::pair<std::string, uintptr_t>> &args,
-    int64_t mask, int minChunkWarmupFactor, double smFillFactor) {
+    int64_t mask, int minChunkWarmupFactor, double smFillFactor,
+    LaunchMode mode) {
   exe.launchOnStream(this, timeLength, numStocks, args,
-                      mask, minChunkWarmupFactor, smFillFactor);
+                      mask, minChunkWarmupFactor, smFillFactor, mode);
 }
 
 void Executor::synchronize() {
diff --git a/mlir/lib/KunCuda/RuntimeCudaGraph.cpp b/mlir/lib/KunCuda/RuntimeCudaGraph.cpp
new file mode 100644
index 0000000..8fcb285
--- /dev/null
+++ b/mlir/lib/KunCuda/RuntimeCudaGraph.cpp
@@ -0,0 +1,375 @@
+//===- RuntimeCudaGraph.cpp - CUDA Graph launcher for kun_cuda ------------===//
+//
+// CUDA Graph mode builds the real producer/consumer node DAG instead of
+// enqueueing kernels in a topo-linear loop.  Intermediate buffers are
+// graph-owned allocations:
+//
+//   alloc(intermediate) -> producer kernel -> all consumer kernels -> free
+//
+// User-visible graph inputs/outputs remain caller-owned pointers supplied at
+// launch time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunCuda/Runtime.h"
+#include "RuntimeUtil.h"
+
+#include <cuda.h>
+
+#include <algorithm>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace kun_cuda {
+
+namespace {
+
+void addUniqueNode(std::vector<CUgraphNode> &nodes, CUgraphNode n) {
+  // Append `n` only when it is non-null and not already in `nodes`.
+  const bool seen = std::find(nodes.begin(), nodes.end(), n) != nodes.end();
+  if (n && !seen)
+    nodes.push_back(n);
+}
+
+bool isIntermediate(const GraphPlan &plan, int bufIdx) {
+  return !(bufIdx < firstIntermediateBuffer(plan));  // at/after the first one
+}
+
+void ensureNoInFlight(CudaGraphLaunchState &state, const char *action) {
+  // No recorded launch (or no event to query) means nothing can be pending.
+  if (!(state.hasLaunch && state.completionEvent))
+    return;
+  const CUresult status = cuEventQuery(state.completionEvent);
+  if (status == CUDA_ERROR_NOT_READY) {
+    std::string msg = "kun_cuda::launchOnStream(cuda_graph): previous CUDA "
+                      "graph launch is still in flight; call synchronize() "
+                      "before ";
+    throw std::runtime_error(msg + action + " the executable's CUDA graph");
+  }
+  if (status == CUDA_SUCCESS)
+    state.hasLaunch = false;  // the recorded launch has finished
+  else
+    checkCu(status, "cuEventQuery(cuda graph completion)");
+}
+
+size_t intermediateBytes(const ExecutableData &data,
+                         int64_t timeLength, int64_t numStocks) {
+  // Never report zero: an empty shape is rounded up to a single byte.
+  const size_t elemCount = static_cast<size_t>(timeLength) *
+                           static_cast<size_t>(numStocks);
+  return std::max<size_t>(elemCount * bytesPerElem(data.dtype), 1);
+}
+
+bool sameLaunchParamsExceptBuffers(const CudaGraphLaunchParams &a,
+                                   const CudaGraphLaunchParams &b) {
+  // Every field except bufPtrs has to match.
+  if (a.timeLength != b.timeLength || a.numStocks != b.numStocks)
+    return false;
+  if (a.mask != b.mask || a.smFillFactor != b.smFillFactor ||
+      a.minChunkWarmupFactor != b.minChunkWarmupFactor)
+    return false;
+  return a.devMaxSmemBytes == b.devMaxSmemBytes && a.numSMs == b.numSMs;
+}
+
+bool sameLaunchParams(const CudaGraphLaunchParams &a,
+                      const CudaGraphLaunchParams &b) {
+  return a.bufPtrs == b.bufPtrs && sameLaunchParamsExceptBuffers(a, b);
+}
+
+CudaGraphLaunchParams makeLaunchParams(
+    Executor *exec,
+    int64_t timeLength, int64_t numStocks,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor,
+    std::vector<uintptr_t> bufPtrs) {
+  CudaGraphLaunchParams out;  // caller values + the executor's device limits
+  out.bufPtrs = std::move(bufPtrs);
+  out.numSMs = exec->numSMs();
+  out.devMaxSmemBytes = exec->devMaxSmemBytes();
+  out.smFillFactor = smFillFactor;
+  out.minChunkWarmupFactor = minChunkWarmupFactor;
+  out.mask = mask;
+  out.numStocks = numStocks;
+  out.timeLength = timeLength;
+  return out;
+}
+
+std::vector<uintptr_t> resolveCudaGraphBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const CudaGraphLaunchState &state,
+    const std::vector<std::pair<std::string, uintptr_t>> &args) {
+  auto resolved = resolveExternalBufferPointers(plan, data, args);
+  const int firstOwned = firstIntermediateBuffer(plan);
+  for (int idx = firstOwned; idx < plan.numBuffers; ++idx)
+    resolved[idx] = state.graphAllocBufPtrs[idx];  // graph-allocated address
+  return resolved;
+}
+
+CudaGraphLaunchParams makeLaunchParams(
+    const GraphPlan &plan, const ExecutableData &data,
+    const CudaGraphLaunchState &state,
+    Executor *exec,
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    int64_t mask, int minChunkWarmupFactor,
+    double smFillFactor) {
+  // Resolve the full pointer table first, then defer to the overload above.
+  auto bufPtrs = resolveCudaGraphBufferPointers(plan, data, state, args);
+  return makeLaunchParams(exec, timeLength, numStocks, mask,
+                          minChunkWarmupFactor, smFillFactor,
+                          std::move(bufPtrs));
+}
+
+CUgraphNode addAllocNode(CUgraph graph,
+                         CUdevice device,
+                         size_t bytes,
+                         const std::vector<CUgraphNode> &deps,
+                         CUdeviceptr &outPtr) {
+  CUDA_MEM_ALLOC_NODE_PARAMS params{};  // value-initialised; fields set below
+  params.poolProps.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
+  params.poolProps.handleTypes = CU_MEM_HANDLE_TYPE_NONE;
+  params.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  params.poolProps.location.id = device;
+  params.bytesize = bytes;
+  // Add the node after `deps`; an empty list is passed as a null pointer.
+  CUgraphNode node = nullptr;
+  checkCu(cuGraphAddMemAllocNode(&node, graph,
+                                 deps.empty() ? nullptr : deps.data(),
+                                 deps.size(), &params),
+          "cuGraphAddMemAllocNode");
+  outPtr = params.dptr;  // address read back from the params after the call
+  return node;
+}
+
+std::vector<CUgraphNode> kernelInputDeps(const GraphPlan &plan,
+                                         const CudaGraphLaunchState &state,
+                                         int kIdx) {
+  std::vector<CUgraphNode> producers;  // nodes of kernels feeding kIdx
+  for (int buf : plan.kernelInputBufs[kIdx]) {
+    const int src = plan.producerKernel[buf];
+    if (src < 0) continue;  // negative index: no producing kernel
+    addUniqueNode(producers, state.kernelNodes[src]);
+  }
+  return producers;
+}
+
+void addOutputAllocNodes(const GraphPlan &plan,
+                         CudaGraphLaunchState &state,
+                         CUdevice device,
+                         size_t bytes,
+                         int kIdx,
+                         const std::vector<CUgraphNode> &inputDeps,
+                         std::vector<uintptr_t> &bufPtrs) {
+  for (int outBuf : plan.kernelOutputBufs[kIdx]) {
+    if (isIntermediate(plan, outBuf)) {  // non-intermediate outputs skipped
+      CUdeviceptr devAddr = 0;
+      state.allocNodes[outBuf] =
+          addAllocNode(state.graph, device, bytes, inputDeps, devAddr);
+      const uintptr_t addr = static_cast<uintptr_t>(devAddr);
+      state.graphAllocBufPtrs[outBuf] = addr;  // kept in the graph state
+      bufPtrs[outBuf] = addr;                  // and in the caller's table
+    }
+  }
+}
+
+void addOneKernelNode(const GraphPlan &plan,
+                      const ExecutableData &data,
+                      const std::vector<CUfunction> &cuFuncs,
+                      CudaGraphLaunchState &state,
+                      const CudaGraphLaunchParams &launch,
+                      int kIdx,
+                      std::vector<CUgraphNode> deps) {
+  for (int b : plan.kernelOutputBufs[kIdx])  // also wait on own output allocs
+    if (isIntermediate(plan, b))
+      addUniqueNode(deps, state.allocNodes[b]);
+  // update() runs first: stored.params is what the add-node call consumes.
+  KernelLaunchDesc &stored = state.descs[kIdx];
+  stored.update(plan, data, cuFuncs, kIdx, launch);
+  // Non-kernel descriptors still get an empty node, recorded like any other.
+  CUgraphNode node = nullptr;
+  if (stored.isKernelNode) {
+    checkCu(cuGraphAddKernelNode(&node, state.graph,
+                                 deps.empty() ? nullptr : deps.data(),
+                                 deps.size(), &stored.params),
+            "cuGraphAddKernelNode");
+  } else {
+    checkCu(cuGraphAddEmptyNode(&node, state.graph,
+                                deps.empty() ? nullptr : deps.data(),
+                                deps.size()),
+            "cuGraphAddEmptyNode");
+  }
+  state.kernelNodes[kIdx] = node;  // later kernels look their producers up here
+  state.kernelNodeIsKernel[kIdx] = stored.isKernelNode;
+}
+
+void addFreeNodes(const GraphPlan &plan,
+                  CudaGraphLaunchState &state) {
+  for (int b = firstIntermediateBuffer(plan); b < plan.numBuffers; ++b) {
+    std::vector<CUgraphNode> deps;  // nodes of the kernels that read buffer b
+    for (int kIdx = 0; kIdx < static_cast<int>(plan.kernelInputBufs.size());
+         ++kIdx) {
+      const auto &ins = plan.kernelInputBufs[kIdx];
+      if (std::find(ins.begin(), ins.end(), b) != ins.end())
+        addUniqueNode(deps, state.kernelNodes[kIdx]);
+    }
+    // No kernel reads this buffer: depend on its producer instead.
+    if (deps.empty()) {
+      int producer = plan.producerKernel[b];
+      if (producer >= 0)
+        addUniqueNode(deps, state.kernelNodes[producer]);
+    }
+    // If deps is still empty the free node is added with no dependencies.
+    CUgraphNode freeNode = nullptr;
+    checkCu(cuGraphAddMemFreeNode(
+                &freeNode, state.graph,
+                deps.empty() ? nullptr : deps.data(),
+                deps.size(),
+                static_cast<CUdeviceptr>(state.graphAllocBufPtrs[b])),
+            "cuGraphAddMemFreeNode");
+    state.freeNodes[b] = freeNode;
+  }
+}
+
+void buildCudaGraphState(const GraphPlan &plan,
+                         const ExecutableData &data,
+                         const std::vector<CUfunction> &cuFuncs,
+                         CudaGraphLaunchState &state,
+                         Executor *exec,
+                         int64_t timeLength, int64_t numStocks,
+                         const std::vector<std::pair<std::string, uintptr_t>> &args,
+                         int64_t mask,
+                         int minChunkWarmupFactor,
+                         double smFillFactor) {
+  checkCu(cuGraphCreate(&state.graph, 0), "cuGraphCreate");
+  // Size the per-buffer / per-kernel bookkeeping before adding any node.
+  const int nBufs = plan.numBuffers;
+  const int nKernels = static_cast<int>(data.kernels.size());
+  state.graphAllocBufPtrs.assign(nBufs, 0);
+  state.allocNodes.assign(nBufs, nullptr);
+  state.kernelNodes.assign(nKernels, nullptr);
+  state.kernelNodeIsKernel.assign(nKernels, false);
+  state.freeNodes.assign(nBufs, nullptr);
+  state.descs.resize(nKernels);
+  // Every intermediate allocation uses the same byte size on this device.
+  CUdevice device = 0;
+  checkCu(cuCtxGetDevice(&device), "cuCtxGetDevice");
+  const size_t bytes = intermediateBytes(data, timeLength, numStocks);
+  // Start from external pointers; alloc nodes fill in intermediates below.
+  CudaGraphLaunchParams launch = makeLaunchParams(
+      exec, timeLength, numStocks, mask, minChunkWarmupFactor, smFillFactor,
+      resolveExternalBufferPointers(plan, data, args));
+  // Per kernel in launch order: input deps, output allocs, then the node.
+  for (int kIdx : plan.launchOrder) {
+    std::vector<CUgraphNode> deps = kernelInputDeps(plan, state, kIdx);
+    addOutputAllocNodes(plan, state, device, bytes, kIdx, deps,
+                        launch.bufPtrs);
+    addOneKernelNode(plan, data, cuFuncs, state, launch, kIdx,
+                     std::move(deps));
+  }
+  // Free nodes go last, once every kernel node exists.
+  addFreeNodes(plan, state);
+  checkCu(cuGraphInstantiate(&state.graphExec, state.graph, 0),
+          "cuGraphInstantiate");
+  state.cachedLaunchParams = std::move(launch);  // cached only after success
+}
+
+void updateCudaGraphKernelParams(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<CUfunction> &cuFuncs,
+    CudaGraphLaunchState &state,
+    const CudaGraphLaunchParams &cached,
+    const CudaGraphLaunchParams &launch) {
+  const bool bufferOnly = sameLaunchParamsExceptBuffers(cached, launch);
+  for (int kIdx : plan.launchOrder) {
+    KernelLaunchDesc &desc = state.descs[kIdx];
+    bool changed = false;  // whether this node needs a SetParams call
+    if (bufferOnly) {  // only pointers differ: let the descriptor patch them
+      changed = desc.updateBuffer(plan, kIdx, launch.bufPtrs);
+    } else {  // non-pointer params differ: recompute the whole descriptor
+      desc.update(plan, data, cuFuncs, kIdx, launch);
+      changed = true;
+    }
+    if (!changed)
+      continue;
+    if (state.kernelNodeIsKernel[desc.kernelIndex] != desc.isKernelNode)
+      throw std::runtime_error(
+          "kun_cuda::launchOnStream(cuda_graph): kernel/empty node shape "
+          "changed without graph rebuild");
+    if (!desc.isKernelNode)  // empty nodes carry no params to push
+      continue;
+    checkCu(cuGraphExecKernelNodeSetParams(
+                state.graphExec, state.kernelNodes[desc.kernelIndex],
+                &desc.params),
+            "cuGraphExecKernelNodeSetParams");
+  }
+}
+
+} // namespace
+
+CudaGraphLaunchState::~CudaGraphLaunchState() noexcept {
+  if (hasLaunch && completionEvent)  // wait for a recorded launch first
+    (void)cuEventSynchronize(completionEvent);
+  if (graphExec)  // driver results are discarded throughout (noexcept)
+    (void)cuGraphExecDestroy(graphExec);
+  if (graph)
+    (void)cuGraphDestroy(graph);
+  if (completionEvent)  // destroyed last, after the synchronize above
+    (void)cuEventDestroy(completionEvent);
+}
+
+void Executable::launchCudaGraphOnStream(
+    Executor *exec,
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor) {
+  if (!cudaGraphState_)
+    cudaGraphState_ = std::make_unique<CudaGraphLaunchState>();
+  // Rebuild when nothing is instantiated yet or the (T, S) shape changed.
+  const bool needRebuild =
+      !cudaGraphState_->graphExec ||
+      !cudaGraphState_->cachedLaunchParams ||
+      cudaGraphState_->cachedLaunchParams->timeLength != timeLength ||
+      cudaGraphState_->cachedLaunchParams->numStocks != numStocks;
+  // Either path throws if the previous launch has not completed yet.
+  if (needRebuild) {
+    ensureNoInFlight(*cudaGraphState_, "rebuilding");
+    resetCudaGraphState();
+    cudaGraphState_ = std::make_unique<CudaGraphLaunchState>();
+    buildCudaGraphState(*plan_, data_, cuFuncs_, *cudaGraphState_,
+                        exec, timeLength, numStocks, args,
+                        mask, minChunkWarmupFactor, smFillFactor);
+  } else {
+    ensureNoInFlight(*cudaGraphState_, "updating");
+    CudaGraphLaunchParams launch = makeLaunchParams(
+        *plan_, data_, *cudaGraphState_, exec, timeLength, numStocks, args,
+        mask, minChunkWarmupFactor, smFillFactor);
+    if (!sameLaunchParams(*cudaGraphState_->cachedLaunchParams, launch)) {
+      updateCudaGraphKernelParams(*plan_, data_, cuFuncs_, *cudaGraphState_,
+                                  *cudaGraphState_->cachedLaunchParams,
+                                  launch);
+      cudaGraphState_->cachedLaunchParams = std::move(launch);
+    }
+  }
+  // Event first: a failed create must not leave an untracked launch behind.
+  if (!cudaGraphState_->completionEvent)
+    checkCu(cuEventCreate(&cudaGraphState_->completionEvent,
+                          CU_EVENT_DISABLE_TIMING),
+            "cuEventCreate(cuda graph completion)");
+  checkCu(cuGraphLaunch(cudaGraphState_->graphExec, exec->stream()),
+          "cuGraphLaunch");
+  checkCu(cuEventRecord(cudaGraphState_->completionEvent, exec->stream()),
+          "cuEventRecord(cuda graph completion)");
+  cudaGraphState_->hasLaunch = true;  // read by ensureNoInFlight() next call
+}
+
+void Executable::resetCudaGraphState() noexcept {
+  cudaGraphState_.reset();  // destroys the held CudaGraphLaunchState, if any
+}
+
+} // namespace kun_cuda
diff --git a/mlir/lib/KunCuda/RuntimeUtil.h b/mlir/lib/KunCuda/RuntimeUtil.h
new file mode 100644
index 0000000..a8606c9
--- /dev/null
+++ b/mlir/lib/KunCuda/RuntimeUtil.h
@@ -0,0 +1,153 @@
+//===- RuntimeUtil.h - private kun_cuda runtime helpers ------------------===//
+//
+// This header is private to libKunCudaRuntime.  It holds the pieces shared by
+// the traditional sequential launcher and the CUDA Graph launcher.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "KunCuda/Runtime.h"
+
+#include <cuda.h>
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace kun_cuda {
+
+// Runtime-resolved schedule + memory plan.  The public header forward-declares
+// this type so Executable can keep it behind a pImpl.
+struct GraphPlan {
+  int numBuffers       = 0;  // total buffer count; buffer indices are below it
+  int numGraphInputs   = 0;
+  int numGraphOutputs  = 0;
+
+  std::unordered_map<std::string, int> graphInputIdx;   // NOTE(review): presumably name -> index; confirm
+  std::unordered_map<std::string, int> graphOutputIdx;  // NOTE(review): presumably name -> index; confirm
+
+  std::vector<std::vector<int>> kernelInputBufs;   // per kernel: buffer indices it reads
+  std::vector<std::vector<int>> kernelOutputBufs;  // per kernel: buffer indices it writes
+
+  // producerKernel[bufIdx] = kernel that writes the buffer, or -1 for a graph
+  // input.
+  std::vector<int> producerKernel;
+
+  // Topo order used by the sequential launcher and as a construction order for
+  // CUDA Graph kernel nodes.
+  std::vector<int> launchOrder;
+
+  // Sequential-mode intermediate slot assignment.  Graph mode uses one
+  // allocation node per logical intermediate instead.
+  std::vector<int> intermediateBufToSlot;
+  int peakIntermediateSlots = 0;
+};
+
+struct ChunkPlan {
+  int64_t chunkSize = 0;   // NOTE(review): presumably time steps per chunk; confirm
+  unsigned numChunks = 1;  // defaults to a single chunk
+};
+
+struct CudaGraphLaunchParams {
+  int64_t timeLength = 0;        // launch shape, together with numStocks
+  int64_t numStocks = 0;
+  int64_t mask = 0;
+  int minChunkWarmupFactor = 0;  // forwarded launch option
+  double smFillFactor = 0.0;     // forwarded launch option
+  int devMaxSmemBytes = 0;       // taken from the Executor
+  int numSMs = 0;                // taken from the Executor
+  std::vector<uintptr_t> bufPtrs;  // device addresses indexed by buffer index
+};
+
+struct KernelLaunchDesc {
+  KernelLaunchDesc() = default;
+  KernelLaunchDesc(int kernelIndex, KernelKind kind, CUfunction fn,
+                   bool isKernelNode,
+                   int32_t timeLenI32, int32_t numStocksI32,
+                   int32_t maskI32, int32_t chunkSizeI32,
+                   int32_t warmupI32,
+                   std::vector<CUdeviceptr> ptrs);
+  KernelLaunchDesc(KernelLaunchDesc &&other) noexcept;
+  KernelLaunchDesc &operator=(KernelLaunchDesc &&other) noexcept;
+  KernelLaunchDesc(const KernelLaunchDesc &) = delete;  // move-only type
+  KernelLaunchDesc &operator=(const KernelLaunchDesc &) = delete;
+  // Callers treat updateBuffer()'s bool result as "something changed".
+  void update(const GraphPlan &plan,
+              const ExecutableData &data,
+              const std::vector<CUfunction> &cuFuncs,
+              int kernelIndex,
+              const CudaGraphLaunchParams &launch);
+  bool updateBuffer(const GraphPlan &plan,
+                    int kernelIndex,
+                    const std::vector<uintptr_t> &bufPtrs);
+  // Which kernel this describes and whether it needs a real kernel node.
+  int kernelIndex = -1;
+  KernelKind kind = KernelKind::Jit;
+  bool isKernelNode = true;
+  // Scalar kernel arguments (NOTE(review): presumably what params points at).
+  int32_t timeLenI32 = 0;
+  int32_t numStocksI32 = 0;
+  int32_t maskI32 = 0;
+  int32_t chunkSizeI32 = 0;
+  int32_t warmupI32 = 0;
+  // Consumed by cuLaunchKernel and by the graph kernel-node driver calls.
+  CUDA_KERNEL_NODE_PARAMS params{};
+
+private:
+  std::vector<CUdeviceptr> ptrs_;
+  std::vector<void *> argPtrs_;
+
+  void rebuildKernelParamPointers();
+};
+
+struct CudaGraphLaunchState {
+  ~CudaGraphLaunchState() noexcept;
+  // Driver handles; null until created, released by the destructor.
+  CUgraph graph = nullptr;
+  CUgraphExec graphExec = nullptr;
+  CUevent completionEvent = nullptr;
+  bool hasLaunch = false;  // set after a launch's completion event is recorded
+  // Parameters the instantiated graph currently encodes (empty before build).
+  std::optional<CudaGraphLaunchParams> cachedLaunchParams;
+  // Tables indexed by buffer index (alloc/free) or kernel index (the rest).
+  std::vector<uintptr_t> graphAllocBufPtrs;
+  std::vector<CUgraphNode> allocNodes;
+  std::vector<CUgraphNode> kernelNodes;
+  std::vector<bool> kernelNodeIsKernel;
+  std::vector<CUgraphNode> freeNodes;
+  std::vector<KernelLaunchDesc> descs;
+};
+
+void checkCu(CUresult r, const char *what);
+
+std::string joinNames(const std::vector<std::string> &v);
+
+int firstIntermediateBuffer(const GraphPlan &plan) noexcept;
+
+void validateLaunchInputs(const ExecutableData &data,
+                          int64_t timeLength, int64_t numStocks,
+                          int64_t mask);
+
+std::vector<uintptr_t> resolveExternalBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args);
+
+std::vector<uintptr_t> resolveBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    const std::vector<uintptr_t> &slotBufs);
+
+ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
+                           int64_t warpsPerCta, int64_t vectorSize,
+                           int64_t unreliableCount, int64_t mask,
+                           int minChunkWarmupFactor,
+                           double smFillFactor, int numSMs);
+
+void launchKernelDesc(const KernelLaunchDesc &desc, CUstream stream);
+
+} // namespace kun_cuda
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index ceaa02e..ffa6fc6 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -570,7 +570,8 @@ NB_MODULE(KunMLIR, m) {
           [](kun_cuda::Executor &e, kun_cuda::Executable &exe,
               nb::dict pyInputs, int64_t cur_time, int64_t length,
               nb::object pyOutputs, int64_t mask,
-              int minChunkWarmupFactor, double smFillFactor) -> nb::dict {
+              int minChunkWarmupFactor, double smFillFactor,
+              bool useCudaGraph) -> nb::dict {
             if (cur_time != 0)
               throw std::runtime_error(
                   "runGraph: cur_time != 0 not supported on GPU");
@@ -610,7 +611,9 @@ NB_MODULE(KunMLIR, m) {
                                             in.numStocks, streamArg, args);
 
             e.runGraph(exe, timeLength, in.numStocks, args,
-                        mask, minChunkWarmupFactor, smFillFactor);
+                        mask, minChunkWarmupFactor, smFillFactor,
+                        useCudaGraph ? kun_cuda::LaunchMode::CudaGraph
+                                     : kun_cuda::LaunchMode::Normal);
             return ret;
           },
           nb::arg("exe"), nb::arg("inputs"),
@@ -619,6 +622,7 @@ NB_MODULE(KunMLIR, m) {
           nb::arg("mask") = 0,
           nb::arg("min_chunk_warmup_factor") = 4,
           nb::arg("sm_fill_factor") = 1.5,
+          nb::arg("use_cuda_graph") = false,
           "Queue every kernel in `exe` onto this executor's stream.\n"
           "**Asynchronous** — call `.synchronize()` (or otherwise wait\n"
           "on the stream) before reading results back to host.\n"
@@ -651,6 +655,9 @@ NB_MODULE(KunMLIR, m) {
           "`sm_fill_factor` is the target `num_chunks * stock_tiles / "
           "numSMs`.  1.0 just fills the GPU; > 1 leaves scheduler "
           "slack.  Default 1.5.\n"
+          "`use_cuda_graph=True` launches through a CUDA Graph node DAG "
+          "with graph allocation/free nodes for intermediate buffers. "
+          "Default false keeps the existing sequential launch path.\n"
           "\n"
           "Named to match the CPU executor API "
           "(`KunRunner.runGraph(executor, mod, inputs, cur_time, length)`).")
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
index 9b9ec60..b87c1ad 100644
--- a/mlir/test/python/test_multi_kernel.py
+++ b/mlir/test/python/test_multi_kernel.py
@@ -50,6 +50,7 @@ def main() -> int:
     ap.add_argument("--target", default="sm_120")
     ap.add_argument("-T", "--time-length", type=int, default=64)
     ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    ap.add_argument("--use-cuda-graph", action="store_true")
     args = ap.parse_args()
 
     from KunQuant.jit import KunMLIR
@@ -108,7 +109,9 @@ def main() -> int:
     print()
     print(f"=== launch ({T} × {S}) — default stream ===")
     print(f"  executor.stream = {executor.stream}  (0 ↔ CUDA default)")
-    executor.runGraph(exe, {"a": a, "b": b, "c": c, "out": out})
+    executor.runGraph(exe, {"a": a, "b": b, "c": c},
+                      outputs={"out": out},
+                      use_cuda_graph=args.use_cuda_graph)
     out_h = cp.asnumpy(out)
 
     expected = (a_h + b_h) * c_h
@@ -137,7 +140,9 @@ def main() -> int:
     print(f"  executor.stream = {hex(executor2.stream)}")
     assert executor2.stream == cp_stream.ptr, \
         (executor2.stream, cp_stream.ptr)
-    executor2.runGraph(exe, {"a": a2, "b": b2, "c": c2, "out": out2})
+    executor2.runGraph(exe, {"a": a2, "b": b2, "c": c2},
+                       outputs={"out": out2},
+                       use_cuda_graph=args.use_cuda_graph)
     # Sync is REQUIRED here: cp_stream is non-blocking, so cp.asnumpy's
     # D2H memcpy on cupy's default stream wouldn't otherwise wait for
     # our kernels.
diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index 796dbfc..fd69256 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -20,6 +20,7 @@
 _argp.add_argument("action", nargs="?")
 _argp.add_argument("--gpu-arch", default="")
 _argp.add_argument("--benchmode", action="store_true")
+_argp.add_argument("--use-cuda-graph", action="store_true")
 _argp.add_argument("--time", type=int, default=260)
 _argp.add_argument("--num-stocks", type=int, default=64)
 _argp.add_argument("--num-threads", type=int, default=4)
@@ -29,6 +30,7 @@
 GPU_ARCH = _args.gpu_arch or ("sm_80" if action == "run_gpu" else "")
 GPU_MODE = bool(GPU_ARCH)
 BENCHMODE = _args.benchmode
+USE_CUDA_GRAPH = _args.use_cuda_graph
 TIME = _args.time
 NUM_STOCKS = _args.num_stocks
 NUM_THREADS = _args.num_threads
@@ -179,8 +181,11 @@ def run_graph(executor, benchmode, modu, inputs, cur_time, length, outputs=None,
     if not benchmode:
         gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
     ret = executor.runGraph(modu, gpu_inputs, cur_time=cur_time,
-                            length=length)
+                            length=length,
+                            use_cuda_graph=USE_CUDA_GRAPH)
     if benchmode:
+        if USE_CUDA_GRAPH:
+            executor.synchronize()
         return ret
     executor.synchronize()
 

From d5f3ff069f0d7e607c6334526fc9d275f46e1790 Mon Sep 17 00:00:00 2001
From: Menooker <myjisgreat@live.cn>
Date: Sat, 23 May 2026 14:25:32 +0800
Subject: [PATCH 44/59] llvm workflow

---
 .github/workflows/build-llvm.yml | 139 +++++++++++++++++++++++++++++++
 mlir/llvm_commit.txt             |   1 +
 2 files changed, 140 insertions(+)
 create mode 100644 .github/workflows/build-llvm.yml
 create mode 100644 mlir/llvm_commit.txt

diff --git a/.github/workflows/build-llvm.yml b/.github/workflows/build-llvm.yml
new file mode 100644
index 0000000..144bcb4
--- /dev/null
+++ b/.github/workflows/build-llvm.yml
@@ -0,0 +1,139 @@
+name: Build LLVM+MLIR (static & dynamic)
+
+on:
+  workflow_dispatch: {}
+
+jobs:
+  build:
+    name: Build (static & dynamic matrix)
+    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        link: [static, dynamic]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Read LLVM commit from file
+        id: read_commit
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const fs = require('fs');
+            const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
+            const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
+            if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
+            return commit;
+
+      - name: Checkout llvm-project at commit
+        uses: actions/checkout@v4
+        with:
+          repository: llvm/llvm-project
+          ref: ${{ steps.read_commit.outputs.result }}
+          path: llvm-project
+          fetch-depth: 1  # shallow clone, only the specified ref
+
+      - name: Install build dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends build-essential cmake ninja-build git python3 ca-certificates pkg-config curl unzip
+
+      - name: Configure, build and install
+        run: echo "Using workflow steps for configure/build/install"
+
+      - name: Configure (static)
+        if: matrix.link == 'static'
+        run: |
+          cmake -S llvm-project/llvm -B build-static -G Ninja \
+            -DLLVM_ENABLE_PROJECTS="mlir" \
+            -DLLVM_TARGETS_TO_BUILD="NVPTX" \
+            -DCMAKE_BUILD_TYPE=MinSizeRel \
+            -DCMAKE_INSTALL_PREFIX=install
+
+      - name: Configure (dynamic)
+        if: matrix.link == 'dynamic'
+        run: |
+          cmake -S llvm-project/llvm -B build-dynamic -G Ninja \
+            -DLLVM_ENABLE_PROJECTS="mlir" \
+            -DLLVM_TARGETS_TO_BUILD="NVPTX" \
+            -DCMAKE_BUILD_TYPE=MinSizeRel \
+            -DMLIR_BUILD_LLVM_DYLIB=ON \
+            -DMLIR_LINK_LLVM_DYLIB=ON \
+            -DLLVM_BUILD_LLVM_DYLIB=ON \
+            -DLLVM_LINK_LLVM_DYLIB=ON \
+            -DCMAKE_INSTALL_PREFIX=install
+
+      - name: Build and install
+        run: |
+          cmake --build build-${{ matrix.link }} -j"$(nproc)" --target install
+
+      - name: Create archive of install
+        run: |
+          mkdir -p artifacts
+          COMMIT=${{ steps.read_commit.outputs.result }}
+          tar -C build-${{ matrix.link }} -czf artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz install
+          echo "Created artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz"
+
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: llvm-mlir-install-${{ matrix.link }}
+          path: artifacts/llvm-mlir-install-${{ matrix.link }}-*.tar.gz
+
+  publish:
+    name: Publish release with artifacts
+    runs-on: ubuntu-22.04
+    needs: build
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download static artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: llvm-mlir-install-static
+          path: artifacts
+
+      - name: Download dynamic artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: llvm-mlir-install-dynamic
+          path: artifacts
+
+      - name: Read LLVM commit from file
+        id: read_commit_publish
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const fs = require('fs');
+            const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
+            const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
+            if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
+            return commit;
+
+      - name: Create GitHub Release
+        id: create_release
+        uses: actions/create-release@v1
+        with:
+          tag_name: llvm-mlir-${{ steps.read_commit_publish.outputs.result }}
+          release_name: LLVM-MLIR ${{ steps.read_commit_publish.outputs.result }}
+          body: "Automated build artifacts for LLVM+MLIR"
+          draft: false
+          prerelease: false
+
+      - name: Upload static asset to release
+        uses: actions/upload-release-asset@v1
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: artifacts/llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
+          asset_name: llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
+          asset_content_type: application/gzip
+
+      - name: Upload dynamic asset to release
+        uses: actions/upload-release-asset@v1
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: artifacts/llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
+          asset_name: llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
+          asset_content_type: application/gzip
diff --git a/mlir/llvm_commit.txt b/mlir/llvm_commit.txt
new file mode 100644
index 0000000..2f3d886
--- /dev/null
+++ b/mlir/llvm_commit.txt
@@ -0,0 +1 @@
+a8e55950fdc19c23ad78c8aeeea5a907690c4b6d
\ No newline at end of file

From 6a3b399afe9b26f0a3a46ee5fdf2433472dbba6f Mon Sep 17 00:00:00 2001
From: Menooker <myjisgreat@live.cn>
Date: Sat, 23 May 2026 14:39:28 +0800
Subject: [PATCH 45/59] checkout tag

---
 .github/workflows/build-llvm.yml | 50 ++++++++++++--------------------
 mlir/llvm_commit.txt             |  2 +-
 2 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/build-llvm.yml b/.github/workflows/build-llvm.yml
index 144bcb4..3708920 100644
--- a/.github/workflows/build-llvm.yml
+++ b/.github/workflows/build-llvm.yml
@@ -24,15 +24,18 @@ jobs:
             const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
             const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
             if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
-            return commit;
+            core.setOutput('commit', commit.trim());
+            return commit.trim();
 
       - name: Checkout llvm-project at commit
         uses: actions/checkout@v4
         with:
           repository: llvm/llvm-project
-          ref: ${{ steps.read_commit.outputs.result }}
+          # treat the value as a tag name and fetch tags so shallow fetch can find it
+          ref: refs/tags/${{ steps.read_commit.outputs.commit }}
           path: llvm-project
-          fetch-depth: 1  # shallow clone, only the specified ref
+          fetch-depth: 1
+          fetch-tags: true
 
       - name: Install build dependencies
         run: |
@@ -49,7 +52,8 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir" \
             -DLLVM_TARGETS_TO_BUILD="NVPTX" \
             -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DCMAKE_INSTALL_PREFIX=install
+            -DLLVM_BUILD_TOOLS=OFF \
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/build-static/install
 
       - name: Configure (dynamic)
         if: matrix.link == 'dynamic'
@@ -58,11 +62,12 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir" \
             -DLLVM_TARGETS_TO_BUILD="NVPTX" \
             -DCMAKE_BUILD_TYPE=MinSizeRel \
+            -DLLVM_BUILD_TOOLS=OFF \
             -DMLIR_BUILD_LLVM_DYLIB=ON \
             -DMLIR_LINK_LLVM_DYLIB=ON \
             -DLLVM_BUILD_LLVM_DYLIB=ON \
             -DLLVM_LINK_LLVM_DYLIB=ON \
-            -DCMAKE_INSTALL_PREFIX=install
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/build-dynamic/install
 
       - name: Build and install
         run: |
@@ -71,7 +76,7 @@ jobs:
       - name: Create archive of install
         run: |
           mkdir -p artifacts
-          COMMIT=${{ steps.read_commit.outputs.result }}
+          COMMIT=${{ steps.read_commit.outputs.commit }}
           tar -C build-${{ matrix.link }} -czf artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz install
           echo "Created artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz"
 
@@ -110,30 +115,13 @@ jobs:
             const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
             const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
             if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
+            core.setOutput('commit', commit.trim()); // trim like the build job so tag and asset names match
             return commit;
-
-      - name: Create GitHub Release
-        id: create_release
-        uses: actions/create-release@v1
-        with:
-          tag_name: llvm-mlir-${{ steps.read_commit_publish.outputs.result }}
-          release_name: LLVM-MLIR ${{ steps.read_commit_publish.outputs.result }}
-          body: "Automated build artifacts for LLVM+MLIR"
-          draft: false
-          prerelease: false
-
-      - name: Upload static asset to release
-        uses: actions/upload-release-asset@v1
-        with:
-          upload_url: ${{ steps.create_release.outputs.upload_url }}
-          asset_path: artifacts/llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_name: llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_content_type: application/gzip
-
-      - name: Upload dynamic asset to release
-        uses: actions/upload-release-asset@v1
+      - name: Release
+        uses: softprops/action-gh-release@v3
         with:
-          upload_url: ${{ steps.create_release.outputs.upload_url }}
-          asset_path: artifacts/llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_name: llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_content_type: application/gzip
+          tag_name: llvm-mlir-${{ steps.read_commit_publish.outputs.commit }}
+          name: LLVM-MLIR prebuilt ${{ steps.read_commit_publish.outputs.commit }}
+          files: |
+            artifacts/llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.commit }}.tar.gz
+            artifacts/llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.commit }}.tar.gz
\ No newline at end of file
diff --git a/mlir/llvm_commit.txt b/mlir/llvm_commit.txt
index 2f3d886..391f543 100644
--- a/mlir/llvm_commit.txt
+++ b/mlir/llvm_commit.txt
@@ -1 +1 @@
-a8e55950fdc19c23ad78c8aeeea5a907690c4b6d
\ No newline at end of file
+llvmorg-22.1.6
\ No newline at end of file

From bf20f5a9ae45bcd3525e25fcc070a2430d6b8020 Mon Sep 17 00:00:00 2001
From: Menooker <myjisgreat@live.cn>
Date: Sat, 23 May 2026 14:39:28 +0800
Subject: [PATCH 46/59] checkout tag

---
 .github/workflows/build-llvm.yml | 58 ++++++++++++++------------------
 mlir/llvm_commit.txt             |  2 +-
 2 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/build-llvm.yml b/.github/workflows/build-llvm.yml
index 144bcb4..8340c83 100644
--- a/.github/workflows/build-llvm.yml
+++ b/.github/workflows/build-llvm.yml
@@ -24,15 +24,18 @@ jobs:
             const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
             const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
             if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
-            return commit;
+            core.setOutput('commit', commit.trim());
+            return commit.trim();
 
       - name: Checkout llvm-project at commit
         uses: actions/checkout@v4
         with:
           repository: llvm/llvm-project
-          ref: ${{ steps.read_commit.outputs.result }}
+          # treat the value as a tag name and fetch tags so shallow fetch can find it
+          ref: refs/tags/${{ steps.read_commit.outputs.commit }}
           path: llvm-project
-          fetch-depth: 1  # shallow clone, only the specified ref
+          fetch-depth: 1
+          fetch-tags: true
 
       - name: Install build dependencies
         run: |
@@ -49,7 +52,10 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir" \
             -DLLVM_TARGETS_TO_BUILD="NVPTX" \
             -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DCMAKE_INSTALL_PREFIX=install
+            -DLLVM_BUILD_TOOLS=OFF \
+            -DLLVM_BUILD_TESTS=ON \
+            -DLLVM_INSTALL_UTILS=ON \
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/build-static/install
 
       - name: Configure (dynamic)
         if: matrix.link == 'dynamic'
@@ -58,11 +64,14 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir" \
             -DLLVM_TARGETS_TO_BUILD="NVPTX" \
             -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DMLIR_BUILD_LLVM_DYLIB=ON \
-            -DMLIR_LINK_LLVM_DYLIB=ON \
+            -DLLVM_BUILD_TOOLS=OFF \
+            -DMLIR_BUILD_MLIR_DYLIB=ON \
+            -DMLIR_LINK_MLIR_DYLIB=ON \
             -DLLVM_BUILD_LLVM_DYLIB=ON \
             -DLLVM_LINK_LLVM_DYLIB=ON \
-            -DCMAKE_INSTALL_PREFIX=install
+            -DLLVM_BUILD_TESTS=ON \
+            -DLLVM_INSTALL_UTILS=ON \
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/build-dynamic/install
 
       - name: Build and install
         run: |
@@ -71,7 +80,7 @@ jobs:
       - name: Create archive of install
         run: |
           mkdir -p artifacts
-          COMMIT=${{ steps.read_commit.outputs.result }}
+          COMMIT=${{ steps.read_commit.outputs.commit }}
           tar -C build-${{ matrix.link }} -czf artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz install
           echo "Created artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz"
 
@@ -110,30 +119,13 @@ jobs:
             const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
             const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
             if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
+            core.setOutput('commit', commit.trim()); // trim like the build job so tag and asset names match
             return commit;
-
-      - name: Create GitHub Release
-        id: create_release
-        uses: actions/create-release@v1
-        with:
-          tag_name: llvm-mlir-${{ steps.read_commit_publish.outputs.result }}
-          release_name: LLVM-MLIR ${{ steps.read_commit_publish.outputs.result }}
-          body: "Automated build artifacts for LLVM+MLIR"
-          draft: false
-          prerelease: false
-
-      - name: Upload static asset to release
-        uses: actions/upload-release-asset@v1
-        with:
-          upload_url: ${{ steps.create_release.outputs.upload_url }}
-          asset_path: artifacts/llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_name: llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_content_type: application/gzip
-
-      - name: Upload dynamic asset to release
-        uses: actions/upload-release-asset@v1
+      - name: Release
+        uses: softprops/action-gh-release@v3
         with:
-          upload_url: ${{ steps.create_release.outputs.upload_url }}
-          asset_path: artifacts/llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_name: llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_content_type: application/gzip
+          tag_name: llvm-mlir-${{ steps.read_commit_publish.outputs.commit }}
+          name: LLVM-MLIR prebuilt ${{ steps.read_commit_publish.outputs.commit }}
+          files: |
+            artifacts/llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.commit }}.tar.gz
+            artifacts/llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.commit }}.tar.gz
\ No newline at end of file
diff --git a/mlir/llvm_commit.txt b/mlir/llvm_commit.txt
index 2f3d886..391f543 100644
--- a/mlir/llvm_commit.txt
+++ b/mlir/llvm_commit.txt
@@ -1 +1 @@
-a8e55950fdc19c23ad78c8aeeea5a907690c4b6d
\ No newline at end of file
+llvmorg-22.1.6
\ No newline at end of file

From f0a78a20d7fbe78a6f6fc57441220abb9a5e57b1 Mon Sep 17 00:00:00 2001
From: Menooker <myjisgreat@live.cn>
Date: Sun, 24 May 2026 22:03:16 +0800
Subject: [PATCH 47/59] make it run on sm61 and llvm 22

---
 mlir/include/KunIr/KunIrOps.h    | 4 ++--
 mlir/lib/KunCuda/CMakeLists.txt  | 1 +
 mlir/lib/KunCuda/EmbedFile.cmake | 2 ++
 mlir/lib/KunGpu/KunGpuToLLVM.cpp | 4 +++-
 mlir/lib/KunIr/KunIrOps.cpp      | 3 ++-
 5 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/mlir/include/KunIr/KunIrOps.h b/mlir/include/KunIr/KunIrOps.h
index 4398973..43f0a07 100644
--- a/mlir/include/KunIr/KunIrOps.h
+++ b/mlir/include/KunIr/KunIrOps.h
@@ -31,7 +31,7 @@ class KunIrElemwiseTsResultType
   static mlir::LogicalResult inferReturnTypes(
       mlir::MLIRContext *ctx, std::optional<mlir::Location>,
       mlir::ValueRange operands, mlir::DictionaryAttr,
-      mlir::PropertyRef, mlir::RegionRange,
+      mlir::OpaqueProperties , mlir::RegionRange,
       llvm::SmallVectorImpl<mlir::Type> &inferred) {
     auto inputTy = llvm::cast<::kunir::TsType>(operands[0].getType());
     inferred.push_back(
@@ -50,7 +50,7 @@ class KunIrCmpTsResultType
   static mlir::LogicalResult inferReturnTypes(
       mlir::MLIRContext *ctx, std::optional<mlir::Location>,
       mlir::ValueRange, mlir::DictionaryAttr,
-      mlir::PropertyRef, mlir::RegionRange,
+      mlir::OpaqueProperties , mlir::RegionRange,
       llvm::SmallVectorImpl<mlir::Type> &inferred) {
     inferred.push_back(
         ::kunir::TsType::get(ctx, mlir::IntegerType::get(ctx, 1), 1));
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index 9a3d8a7..9b2374b 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -71,6 +71,7 @@ function(kun_add_bundled_ptx_kernel cu_path)
               -DOUTPUT=${_inc}
               -DSYMBOL=${_symbol}
               -DPTX_VERSION=7.8
+              -DSM_VERSION=61
               -P "${_kun_embed_cmake}"
       DEPENDS ${_obj_tgt} $<TARGET_OBJECTS:${_obj_tgt}> "${_kun_embed_cmake}"
       COMMENT "Embedding ${_stem}.ptx as ${_symbol}[] (downgrading to ISA 7.8)"
diff --git a/mlir/lib/KunCuda/EmbedFile.cmake b/mlir/lib/KunCuda/EmbedFile.cmake
index 4b82922..6c71895 100644
--- a/mlir/lib/KunCuda/EmbedFile.cmake
+++ b/mlir/lib/KunCuda/EmbedFile.cmake
@@ -21,6 +21,8 @@ if(PTX_VERSION)
   file(READ "${INPUT}" text_content)
   string(REGEX REPLACE "\\.version[ \\t]+[0-9.]+" ".version ${PTX_VERSION}"
                        text_content "${text_content}")
+  string(REGEX REPLACE "\\.target[ \\t]+sm_[0-9]+" ".target sm_${SM_VERSION}"
+                       text_content "${text_content}")
   set(_patched "${OUTPUT}.raw.ptx")
   file(WRITE "${_patched}" "${text_content}")
   file(READ "${_patched}" hex_content HEX)
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
index ef2f833..ff0b4d3 100644
--- a/mlir/lib/KunGpu/KunGpuToLLVM.cpp
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -169,7 +169,9 @@ static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
       b, loc, fn.getSymName(), FunctionType::get(ctx, newArgTypes, {}));
   // Mark as a kernel (sets the op-level `kernel` attribute) so that
   // convert-gpu-to-nvvm tags the resulting llvm.func with `nvvm.kernel`.
-  newFunc.setKernelAttr(UnitAttr::get(ctx));
+  // Presumably setKernelAttr() is unavailable in LLVM 22; set the kernel attr by name.
+  newFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+                            UnitAttr::get(ctx));
   setFuncTargetSpec (newFunc, fn.getTargetSpecAttr());
   setFuncInputNames (newFunc, fn.getInputNames());
   setFuncOutputNames(newFunc, fn.getOutputNames());
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
index 61b7ce7..32c1069 100644
--- a/mlir/lib/KunIr/KunIrOps.cpp
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -118,10 +118,11 @@ LogicalResult SelectOp::verify() {
   return success();
 }
 
+// LLVM 22 compatibility: this signature takes OpaqueProperties (PropertyRef in newer LLVM).
 // Result type: ts<true_value.elem, 1>.
 LogicalResult SelectOp::inferReturnTypes(
     MLIRContext *ctx, std::optional<Location>, ValueRange operands,
-    DictionaryAttr, PropertyRef, RegionRange,
+    DictionaryAttr, OpaqueProperties , RegionRange,
     SmallVectorImpl<Type> &inferred) {
   auto trueTy = llvm::cast<TsType>(operands[1].getType());
   inferred.push_back(TsType::get(ctx, trueTy.getElementType(), 1));

From a78602bda1b469b00fd0188a2b83067fa7a658b8 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 24 May 2026 20:37:46 -0700
Subject: [PATCH 48/59] fix tests

---
 mlir/test/kungpu/lower_to_llvm.mlir      |  9 +--
 mlir/test/kunir/lower_to_kungpu.mlir     | 10 +--
 mlir/test/python/test_cs_rank_cuda.py    | 25 ++++----
 mlir/test/python/test_kun_mlir.py        |  4 +-
 mlir/test/python/test_validation_cuda.py | 81 ++++++++++++++++--------
 mlir/test/python/test_windowed_temp.py   |  4 +-
 6 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/mlir/test/kungpu/lower_to_llvm.mlir b/mlir/test/kungpu/lower_to_llvm.mlir
index 0736709..69b3d60 100644
--- a/mlir/test/kungpu/lower_to_llvm.mlir
+++ b/mlir/test/kungpu/lower_to_llvm.mlir
@@ -107,9 +107,7 @@ kunir.func @test_time_bounds(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
 // ── Per-function chunk write_start cache, lazily inserted at entry ────
 // All chunk arithmetic stays in i32 (64-bit ops are slow on GPU); only
 // the final write_start gets an index_cast for comparing against the
-// index-typed scf.for IV.  Mask cast (separate, used for t-mask subi
-// inside the loop) hoists to entry too.
-// CHECK:       %[[MASK:.*]] = arith.index_cast %[[MASK_I32]] : i32 to index
+// index-typed scf.for IV.
 // CHECK:       %[[CY_IDX:.*]] = gpu.block_id y
 // CHECK:       %[[CY:.*]] = arith.index_cast %[[CY_IDX]] : index to i32
 // CHECK:       %[[CYC0:.*]] = arith.constant 0 : i32
@@ -159,12 +157,11 @@ kunir.func @test_time_bounds(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
 // CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[IN]][%[[LIN]]] {{.*}} -> !llvm.ptr, f32
 // CHECK:         %[[V:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> f32
 //
-// ── ts.put on global %out: gated by t ≥ write_start, output index t-mask ──
+// ── ts.put on global %out: gated by t ≥ write_start, output index t ──
 // CHECK:         %[[DOW:.*]] = arith.cmpi sge, %[[T]], %[[WSTART]] : index
 // CHECK:         scf.if %[[DOW]] {
-// CHECK:           %[[TOUT:.*]] = arith.subi %[[T]], %[[MASK]] : index
 // CHECK:           %[[NS64B:.*]] = arith.extsi %[[NS]] : i32 to i64
-// CHECK:           %[[T64:.*]] = arith.index_cast %[[TOUT]] : index to i64
+// CHECK:           %[[T64:.*]] = arith.index_cast %[[T]] : index to i64
 // CHECK:           %[[ROW2:.*]] = arith.muli %[[T64]], %[[NS64B]] : i64
 // CHECK:           %[[LIN2:.*]] = arith.addi %[[ROW2]],
 // CHECK:           %[[GEP2:.*]] = llvm.getelementptr %[[OUT]][%[[LIN2]]]
diff --git a/mlir/test/kunir/lower_to_kungpu.mlir b/mlir/test/kunir/lower_to_kungpu.mlir
index bd8a6ad..833de39 100644
--- a/mlir/test/kunir/lower_to_kungpu.mlir
+++ b/mlir/test/kunir/lower_to_kungpu.mlir
@@ -5,7 +5,9 @@
 // mask / chunk_size / warmup) are prepended later by convert-kungpu-to-llvm.
 // CHECK-SAME: !kunir.ts<f32, inf>
 // CHECK-SAME: !kunir.ts<f32, inf>
-// CHECK-SAME: !kunir.ts<f32, 1>
+// Graph output buffers are full TS arrays; the per-op result window has
+// already been materialized into the loop body.
+// CHECK-SAME: !kunir.ts<f32, inf>
 // CHECK-NOT: -> !kunir.ts
 kunir.func @test_binary_lower(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
     inputs {%a = "a", %b = "b"}
@@ -104,7 +106,7 @@ kunir.func @test_computed_reduce(%x: !kunir.ts<f32, inf>, %y: !kunir.ts<f32, inf
 }
 
 // CHECK-LABEL: kunir.func @test_multi_reduce
-// CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f64, inf>, %[[OUT0:.*]]: !kunir.ts<f64, 1>, %[[OUT1:.*]]: !kunir.ts<f64, 1>)
+// CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f64, inf>, %[[OUT0:.*]]: !kunir.ts<f64, inf>, %[[OUT1:.*]]: !kunir.ts<f64, inf>)
 kunir.func @test_multi_reduce(%input: !kunir.ts<f64, inf>)
     inputs {%input = "input"}
     outputs {"sum", "maxval"}
@@ -121,8 +123,8 @@ kunir.func @test_multi_reduce(%input: !kunir.ts<f64, inf>)
   // CHECK:          arith.addf
   // CHECK:          arith.maximumf
   // CHECK:          scf.yield {{.*}}, {{.*}} : f64, f64
-  // CHECK:        kungpu.ts.put %[[OUT0]], %[[R]]#0 : <f64, 1>, f64
-  // CHECK:        kungpu.ts.put %[[OUT1]], %[[R]]#1 : <f64, 1>, f64
+  // CHECK:        kungpu.ts.put %[[OUT0]], %[[R]]#0 : <f64, inf>, f64
+  // CHECK:        kungpu.ts.put %[[OUT1]], %[[R]]#1 : <f64, inf>, f64
   %w = kunir.windowed_output %input [length = 10] : !kunir.ts<f64, inf> -> !kunir.ts<f64, 10>
   %sum, %max = kunir.for_each_back_window
       (%w : !kunir.ts<f64, 10>) [window = 10]
diff --git a/mlir/test/python/test_cs_rank_cuda.py b/mlir/test/python/test_cs_rank_cuda.py
index 0bd26eb..57c49c0 100644
--- a/mlir/test/python/test_cs_rank_cuda.py
+++ b/mlir/test/python/test_cs_rank_cuda.py
@@ -32,9 +32,10 @@
 
 from KunQuant.Op import Builder, Input, Output, Rank
 from KunQuant.ops import Add
+from KunQuant.Driver import KunCompilerConfig
 from KunQuant.Stage import Function
 from KunQuant.jit import KunMLIR
-from KunQuant.jit.cuda import compileit, CudaCompilerConfig, to_mlir
+from KunQuant.jit.cuda import compile_func, CudaCompilerConfig, to_mlir
 
 
 # ── CPU reference (matches cpp/Kun/Rank.hpp's equal_range formula) ──
@@ -116,13 +117,14 @@ def _run_cs_rank_only(target: str, dtype_token: str, T: int, S: int,
            f"nan={with_nan} ties={with_ties} ===")
 
     f = _build_cs_rank_only()
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4,
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS",
                               dtype=dtype_token)
-    mod = to_mlir(_build_cs_rank_only(), cfg)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    mod = to_mlir(_build_cs_rank_only(), kcfg, ccfg)
     print("--- mlir ---")
     print(mod.to_string())
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, kcfg, ccfg)
     print(f"  kernel_names={exe.kernel_names}  "
           f"num_buffers={exe.num_buffers}  "
           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
@@ -143,7 +145,7 @@ def _run_cs_rank_only(target: str, dtype_token: str, T: int, S: int,
     a_d = cp.asarray(a_h)
     out_d = cp.zeros((T, S), dtype=np_dt)
     ex = KunMLIR.Executor()
-    ex.runGraph(exe, {'a': a_d, 'r': out_d})
+    ex.runGraph(exe, inputs={'a': a_d}, outputs={'r': out_d})
     ex.synchronize()
     out_h = cp.asnumpy(out_d)
 
@@ -175,13 +177,14 @@ def _run_cs_rank_mixed(target: str, T: int, S: int, *, seed: int) -> int:
 
     print(f"=== cs_rank-mixed (float32) T={T} S={S} ===")
     f = _build_cs_rank_mixed()
-    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4,
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS",
                               dtype='float', partition_factor=1)
-    mod = to_mlir(_build_cs_rank_mixed(), cfg)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    mod = to_mlir(_build_cs_rank_mixed(), kcfg, ccfg)
     print("--- mlir ---")
     print(mod.to_string())
 
-    exe = compileit(f, cfg)
+    exe = compile_func(f, kcfg, ccfg)
     print(f"  kernel_names={exe.kernel_names}  "
           f"num_kernels={exe.num_kernels}  "
           f"num_buffers={exe.num_buffers}  "
@@ -198,9 +201,9 @@ def _run_cs_rank_mixed(target: str, T: int, S: int, *, seed: int) -> int:
     out_d = cp.zeros((T, S), dtype=cp.float32)
 
     ex = KunMLIR.Executor()
-    ex.runGraph(exe, {'a': cp.asarray(a_h),
-                       'b': cp.asarray(b_h),
-                       'out': out_d})
+    ex.runGraph(exe,
+                inputs={'a': cp.asarray(a_h), 'b': cp.asarray(b_h)},
+                outputs={'out': out_d})
     ex.synchronize()
     out_h = cp.asnumpy(out_d)
 
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index ebc64e5..2b31f96 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -116,7 +116,9 @@ def main() -> int:
         b   = cp.asarray(b_h)
         out = cp.zeros((T, S), dtype=cp.float32)
         executor = KunMLIR.Executor()
-        executor.runGraph(exe, {"a": a, "b": b, "sum": out})
+        executor.runGraph(exe,
+                          inputs={"a": a, "b": b},
+                          outputs={"sum": out})
         # No explicit synchronize: default-stream Executor + cupy's
         # default stream → cp.asnumpy's D2H memcpy goes on the same
         # stream and waits for our kernels.  See test_multi_kernel.py
diff --git a/mlir/test/python/test_validation_cuda.py b/mlir/test/python/test_validation_cuda.py
index ccdeafd..d1daf1c 100644
--- a/mlir/test/python/test_validation_cuda.py
+++ b/mlir/test/python/test_validation_cuda.py
@@ -27,6 +27,11 @@
 
 import numpy as np
 
+from KunQuant.Driver import KunCompilerConfig
+
+
+_KCFG_TS = KunCompilerConfig(input_layout="TS", output_layout="TS")
+
 
 # ── Fixture helpers ──────────────────────────────────────────────────
 
@@ -35,25 +40,25 @@ def _build_elemwise_exe(cfg):
     from KunQuant.Op import Builder, Input, Output
     from KunQuant.ops import Add
     from KunQuant.Stage import Function
-    from KunQuant.jit.cuda import compileit
+    from KunQuant.jit.cuda import compile_func
     b = Builder()
     with b:
         a = Input("a"); bb = Input("b")
         Output(Add(a, bb), "out")
     f = Function(b.ops, name="addk")
-    return compileit(f, cfg)
+    return compile_func(f, _KCFG_TS, cfg)
 
 
 def _build_cs_rank_exe(cfg):
     """cs_rank(a) → r.  Used for the smem-cap test."""
     from KunQuant.Op import Builder, Input, Output, Rank
     from KunQuant.Stage import Function
-    from KunQuant.jit.cuda import compileit
+    from KunQuant.jit.cuda import compile_func
     b = Builder()
     with b:
         Output(Rank(Input("a")), "r")
     f = Function(b.ops, name="csr")
-    return compileit(f, cfg)
+    return compile_func(f, _KCFG_TS, cfg)
 
 
 def _expect_fail(label, fn, needle):
@@ -92,7 +97,9 @@ def run_validation_tests(target):
     # 1. Object implementing neither CAI nor DLPack (a plain int)
     rc |= _expect_fail(
         "object without __dlpack__ rejected",
-        lambda: ex.runGraph(exe, {"a": 0xdeadbeef, "b": b, "out": out}),
+        lambda: ex.runGraph(exe,
+                            inputs={"a": 0xdeadbeef, "b": b},
+                            outputs={"out": out}),
         "does not implement __dlpack__")
 
     # 2. Host numpy array — numpy is a CPU-only producer; it refuses
@@ -102,29 +109,37 @@ def run_validation_tests(target):
     #    GPU launch.
     rc |= _expect_fail(
         "host numpy array rejected (CPU producer)",
-        lambda: ex.runGraph(exe, {"a": np.zeros((T, S), dtype=np.float32),
-                                    "b": b, "out": out}),
+        lambda: ex.runGraph(exe,
+                            inputs={"a": np.zeros((T, S), dtype=np.float32),
+                                    "b": b},
+                            outputs={"out": out}),
         "stream")
 
     # 3. Wrong dtype: float64
     rc |= _expect_fail(
         "f64 dtype rejected",
-        lambda: ex.runGraph(exe, {"a": cp.zeros((T, S), dtype=cp.float64),
-                                    "b": b, "out": out}),
-        "need float32")
+        lambda: ex.runGraph(exe,
+                            inputs={"a": cp.zeros((T, S), dtype=cp.float64),
+                                    "b": b},
+                            outputs={"out": out}),
+        "kernel expects float32")
 
     # 4. Wrong ndim: 1-D
     rc |= _expect_fail(
         "1-D array rejected",
-        lambda: ex.runGraph(exe, {"a": cp.zeros((T*S,), dtype=cp.float32),
-                                    "b": b, "out": out}),
+        lambda: ex.runGraph(exe,
+                            inputs={"a": cp.zeros((T*S,), dtype=cp.float32),
+                                    "b": b},
+                            outputs={"out": out}),
         "must be 2-D")
 
     # 5. Wrong ndim: 3-D
     rc |= _expect_fail(
         "3-D array rejected",
-        lambda: ex.runGraph(exe, {"a": cp.zeros((T, S, 1), dtype=cp.float32),
-                                    "b": b, "out": out}),
+        lambda: ex.runGraph(exe,
+                            inputs={"a": cp.zeros((T, S, 1), dtype=cp.float32),
+                                    "b": b},
+                            outputs={"out": out}),
         "must be 2-D")
 
     # 6. Non-contiguous strided view (transpose).  (T, S) and (S, T) are
@@ -134,29 +149,37 @@ def run_validation_tests(target):
     out_t = cp.zeros((S, T), dtype=cp.float32)
     rc |= _expect_fail(
         "non-contiguous transposed view rejected",
-        lambda: ex.runGraph(exe, {"a": a_t, "b": b_t, "out": out_t}),
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a_t, "b": b_t},
+                            outputs={"out": out_t}),
         "not C-contiguous")
 
-    # 7. Missing graph_output
+    # 7. Missing graph input.  Outputs may be omitted by design: the
+    #    binding auto-allocates them and returns the buffer dict.
     rc |= _expect_fail(
-        "missing graph_output rejected",
-        lambda: ex.runGraph(exe, {"a": a, "b": b}),    # no 'out'
-        "missing argument 'out'")
+        "missing graph_input rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a},
+                            outputs={"out": out}),
+        "missing input 'b'")
 
     # 8. Shape mismatch between args
     rc |= _expect_fail(
         "shape mismatch rejected",
-        lambda: ex.runGraph(exe, {"a": a,
-                                    "b": cp.zeros((T, S+1), dtype=cp.float32),
-                                    "out": out}),
-        "shape mismatch")
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a,
+                                    "b": cp.zeros((T, S+1), dtype=cp.float32)},
+                            outputs={"out": out}),
+        "expected")
 
     # 9. Unknown kwarg (the hot-path skip kicks in for size == ordered,
     #    so add a real extra to trip the strict check).
     rc |= _expect_fail(
         "unknown argument rejected",
-        lambda: ex.runGraph(exe, {"a": a, "b": b, "out": out, "bogus": a}),
-        "unexpected argument 'bogus'")
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a, "b": b, "bogus": a},
+                            outputs={"out": out}),
+        "unexpected input 'bogus'")
 
     # 10. DLPack-only producer — wrap a cupy ndarray and hide every
     #     attribute except __dlpack__ / __dlpack_device__.  Verifies the
@@ -172,7 +195,9 @@ def __dlpack_device__(self):
 
     print("  dlpack-only producer happy path ...", end=" ", flush=True)
     try:
-        ex.runGraph(exe, {"a": DLOnly(a), "b": DLOnly(b), "out": DLOnly(out)})
+        ex.runGraph(exe,
+                    inputs={"a": DLOnly(a), "b": DLOnly(b)},
+                    outputs={"out": DLOnly(out)})
         ex.synchronize()
         print("ok")
     except Exception as e:
@@ -210,7 +235,7 @@ def run_smem_cap_tests(target):
     out = cp.zeros((T, too_many), dtype=cp.float32)
     rc |= _expect_fail(
         "smem cap exceeded → clear error",
-        lambda: ex.runGraph(exe, {"a": a, "r": out}),
+        lambda: ex.runGraph(exe, inputs={"a": a}, outputs={"r": out}),
         "MAX_SHARED_MEMORY_PER_BLOCK_OPTIN")
 
     # At-cap case must still launch (off-by-one regression guard).
@@ -219,7 +244,7 @@ def run_smem_cap_tests(target):
     out2 = cp.zeros((T, at_limit), dtype=cp.float32)
     print(f"  at-cap launch (num_stocks={at_limit}) ...", end=" ", flush=True)
     try:
-        ex.runGraph(exe, {"a": a2, "r": out2})
+        ex.runGraph(exe, inputs={"a": a2}, outputs={"r": out2})
         ex.synchronize()
         print("ok")
     except Exception as e:
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index 9ccaef0..8db6034 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -116,7 +116,9 @@ def run_one(N: int, expected_placement: str, target: str,
     out = cp.zeros((T, S), dtype=cp.float32)
 
     executor = KunMLIR.Executor()
-    executor.runGraph(exe, {"a": a, "b": b, "out": out})
+    executor.runGraph(exe,
+                      inputs={"a": a, "b": b},
+                      outputs={"out": out})
     out_h = cp.asnumpy(out)            # implicitly waits via stream 0
 
     expected = reference_sum_window(a_h, b_h, N)

From 9069046cc6ca7c87b8bd929a6c269c10cfce0566 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 24 May 2026 20:47:29 -0700
Subject: [PATCH 49/59] enable dynlink

---
 mlir/Tools/kun-opt/CMakeLists.txt |  2 ++
 mlir/lib/KunGpu/CMakeLists.txt    |  5 ++++-
 mlir/lib/KunIr/CMakeLists.txt     |  6 +++++-
 mlir/lib/Python/CMakeLists.txt    | 11 +++++++----
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
index 9adc9de..cdc4590 100644
--- a/mlir/Tools/kun-opt/CMakeLists.txt
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -14,7 +14,9 @@ target_link_libraries(kun-opt PRIVATE
   MLIRKunIrDialect
   MLIRKunGpuDialect
   MLIRKunIrToKunGpu
+)
 
+mlir_target_link_libraries(kun-opt PRIVATE
   # MLIR opt infrastructure
   MLIROptLib
 
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
index b51d2f7..2950b8b 100644
--- a/mlir/lib/KunGpu/CMakeLists.txt
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -15,6 +15,10 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   MLIRKunIrOpsIncGen
 
   LINK_LIBS PUBLIC
+  MLIRKunIrDialect
+)
+
+mlir_target_link_libraries(MLIRKunGpuDialect PUBLIC
   MLIRIR
   MLIRPass
   MLIRFuncDialect
@@ -25,7 +29,6 @@ add_mlir_dialect_library(MLIRKunGpuDialect
   MLIRTransforms
   MLIRTransformUtils
   MLIRSideEffectInterfaces
-  MLIRKunIrDialect
   MLIRSCFToControlFlow
   MLIRControlFlowToLLVM
   MLIRArithToLLVM
diff --git a/mlir/lib/KunIr/CMakeLists.txt b/mlir/lib/KunIr/CMakeLists.txt
index 7bc4c0b..5e2877a 100644
--- a/mlir/lib/KunIr/CMakeLists.txt
+++ b/mlir/lib/KunIr/CMakeLists.txt
@@ -14,8 +14,9 @@ add_mlir_dialect_library(MLIRKunIrDialect
   MLIRKunIrOpsIncGen
   MLIRKunIrInterfacesIncGen
   MLIRKunIrAttrsIncGen
+)
 
-  LINK_LIBS PUBLIC
+mlir_target_link_libraries(MLIRKunIrDialect PUBLIC
   MLIRIR
   MLIRFuncDialect
   MLIRSideEffectInterfaces
@@ -37,6 +38,9 @@ add_mlir_library(MLIRKunIrToKunGpu
   LINK_LIBS PUBLIC
   MLIRKunIrDialect
   MLIRKunGpuDialect
+)
+
+mlir_target_link_libraries(MLIRKunIrToKunGpu PUBLIC
   MLIRFuncDialect
   MLIRArithDialect
   MLIRMathDialect
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index e301124..23a2401 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -44,6 +44,13 @@ target_link_libraries(KunMLIR PRIVATE
   MLIRKunGpuDialect
   MLIRKunIrToKunGpu
 
+  # Runtime side — owns cuda.h + libcuda; we just hand it ExecutableData
+  # and call launch().
+  KunCudaRuntime
+)
+
+mlir_target_link_libraries(KunMLIR PRIVATE
+  # Compiler side
   MLIRIR
   MLIRParser
   MLIRPass
@@ -74,8 +81,4 @@ target_link_libraries(KunMLIR PRIVATE
   # PtxBackend.cpp) + the NVVM target serializer it dispatches to.
   MLIRGPUTransforms
   MLIRNVVMTarget
-
-  # Runtime side — owns cuda.h + libcuda; we just hand it ExecutableData
-  # and call launch().
-  KunCudaRuntime
 )

From d0e63c9b0a3a17305936f3043c2ecec9d209b08e Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 24 May 2026 21:25:54 -0700
Subject: [PATCH 50/59] shared executable data

---
 mlir/include/KunCuda/Runtime.h        |  53 ++++++----
 mlir/lib/KunCuda/Runtime.cpp          | 142 +++++++++++++++++---------
 mlir/lib/KunCuda/RuntimeCudaGraph.cpp |   8 +-
 mlir/lib/KunCuda/RuntimeUtil.h        |  22 ++++
 mlir/lib/Python/MlirBinding.cpp       |  24 +++--
 mlir/test/python/test_kun_mlir.py     |  14 ++-
 6 files changed, 174 insertions(+), 89 deletions(-)

diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index 1278f6e..a070f7f 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -54,6 +54,11 @@ namespace kun_cuda {
 /// producer maps, etc.  Fully defined in Runtime.cpp.
 struct GraphPlan;
 
+/// Internal: context-local loaded CUDA modules, CUfunctions, and resolved
+/// graph plan.  Shared by cloned Executables; fully defined in the private
+/// runtime implementation.
+struct LoadedExecutable;
+
 /// Internal: CUDA Graph mode state.  Kept behind a pointer because normal
 /// launch mode does not need any graph objects.
 struct CudaGraphLaunchState;
@@ -166,13 +171,15 @@ struct ExecutableData {
 ///   6. cuModuleLoadData + cuModuleGetFunction × N on the calling
 ///      thread's primary CUDA context (which must already exist).
 ///
-/// Destruction calls `cuModuleUnload` and frees the slot pool.
+/// Destruction frees only this Executable's own launch state (slot pool and
+/// CUDA graph state).  CUDA modules live in a shared LoadedExecutable and are
+/// unloaded when the last Executable sharing it is destroyed.
 class Executable {
 public:
   /// Throws std::runtime_error on driver errors, missing CUDA context,
-  /// or graph-validation failures.  Takes an rvalue — caller `std::move`s
-  /// the data in.
-  explicit Executable(ExecutableData &&data);
+  /// graph-validation failures, or a null `data`.  `ExecutableData` is
+  /// immutable after compile and may be shared by multiple Executables.
+  explicit Executable(std::shared_ptr<const ExecutableData> data);
   ~Executable();
 
   // Non-copyable, non-movable — wrap in unique_ptr / shared_ptr if you
@@ -183,17 +190,24 @@ class Executable {
   Executable &operator=(Executable &&)      = delete;
 
   // ── Accessors (compile-time data) ─────────────────────────────────
-  const ExecutableData &data() const noexcept { return data_; }
-  const std::vector<std::string> &graphInputs()  const noexcept { return data_.graphInputs; }
-  const std::vector<std::string> &graphOutputs() const noexcept { return data_.graphOutputs; }
-  int64_t warpsPerCta() const noexcept { return data_.warpsPerCta; }
-  int64_t vectorSize()  const noexcept { return data_.vectorSize; }
-  Datatype dtype()      const noexcept { return data_.dtype; }
-  size_t  numKernels()  const noexcept { return data_.kernels.size(); }
+  const ExecutableData &data() const noexcept { return *data_; }
+  std::shared_ptr<const ExecutableData> dataPtr() const noexcept {
+    return data_;
+  }
+  const std::vector<std::string> &graphInputs()  const noexcept { return data_->graphInputs; }
+  const std::vector<std::string> &graphOutputs() const noexcept { return data_->graphOutputs; }
+  int64_t warpsPerCta() const noexcept { return data_->warpsPerCta; }
+  int64_t vectorSize()  const noexcept { return data_->vectorSize; }
+  Datatype dtype()      const noexcept { return data_->dtype; }
+  size_t  numKernels()  const noexcept { return data_->kernels.size(); }
   const std::map<std::string, int64_t> &outputUnreliable() const noexcept {
-    return data_.outputUnreliable;
+    return data_->outputUnreliable;
   }
 
+  /// Create a new Executable with independent mutable launch state while
+  /// sharing immutable ExecutableData and loaded CUDA modules / functions.
+  std::unique_ptr<Executable> clone() const;
+
   // ── Accessors (runtime-resolved plan) ─────────────────────────────
   // Defined out-of-line so the header doesn't need GraphPlan's layout.
 
@@ -274,17 +288,12 @@ class Executable {
       double smFillFactor);
   void resetCudaGraphState() noexcept;
 
-  ExecutableData data_;
-  std::unique_ptr<GraphPlan> plan_;          ///< pImpl — defined in Runtime.cpp
-  std::unique_ptr<CudaGraphLaunchState> cudaGraphState_;
+  Executable(std::shared_ptr<const ExecutableData> data,
+             std::shared_ptr<LoadedExecutable> loaded);
 
-  CUmodule cuModule_ = nullptr;
-  /// Modules holding pre-compiled cross-sectional PTX.  Loaded at
-  /// construction time iff a matching external kernel is present; null
-  /// otherwise.
-  CUmodule csRankModule_ = nullptr;
-  CUmodule csScaleModule_ = nullptr;
-  std::vector<CUfunction> cuFuncs_;          ///< parallel to data_.kernels
+  std::shared_ptr<const ExecutableData> data_;
+  std::shared_ptr<LoadedExecutable> loaded_;
+  std::unique_ptr<CudaGraphLaunchState> cudaGraphState_;
 
   // Lazily allocated intermediate buffers, one CUdeviceptr per slot
   // (stored as uintptr_t to keep the header CUDA-free).
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
index dfeb071..218f6e6 100644
--- a/mlir/lib/KunCuda/Runtime.cpp
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -996,7 +996,13 @@ static void validateKernelIO(const std::vector<KernelMeta> &kernels,
 // Executable
 //===----------------------------------------------------------------------===//
 
-Executable::Executable(ExecutableData &&data) : data_(std::move(data)) {
+LoadedExecutable::LoadedExecutable(std::shared_ptr<const ExecutableData> dataIn)
+    : data(std::move(dataIn)) {
+  if (!data)
+    throw std::runtime_error(
+        "kun_cuda::LoadedExecutable: ExecutableData pointer is null");
+  const ExecutableData &d = *data;
+
   // Require a primary context to already exist on the calling thread —
   // the caller's job to set one up (e.g. by allocating any device memory
   // through cupy / cudaMalloc).
@@ -1004,59 +1010,99 @@ Executable::Executable(ExecutableData &&data) : data_(std::move(data)) {
   checkCu(cuCtxGetCurrent(&cur), "cuCtxGetCurrent");
   if (!cur)
     throw std::runtime_error(
-        "kun_cuda::Executable: no current CUDA context.  Initialise the "
+        "kun_cuda::LoadedExecutable: no current CUDA context.  Initialise the "
         "driver first (e.g. allocate any device memory via cupy or "
         "cudaMalloc) before constructing an Executable.");
-  if (data_.kernels.empty())
+  if (d.kernels.empty())
     throw std::runtime_error(
         "kun_cuda::Executable: ExecutableData has no kernels");
-  if (data_.graphInputs.empty())
+  if (d.graphInputs.empty())
     throw std::runtime_error(
         "kun_cuda::Executable: graph_inputs must be non-empty");
-  if (data_.graphOutputs.empty())
+  if (d.graphOutputs.empty())
     throw std::runtime_error(
         "kun_cuda::Executable: graph_outputs must be non-empty");
 
   // ── Build the runtime plan ───────────────────────────────────────
-  BufTable tbl  = buildBufferIndices(data_.graphInputs, data_.graphOutputs,
-                                       data_.kernels);
-  KernelIO kio  = resolveKernelIO(data_.kernels, tbl);
-  validateGraph(data_.kernels, data_.graphOutputs, tbl, kio);
-  std::vector<int> order = topoSort(kio, static_cast<int>(data_.kernels.size()));
+  BufTable tbl  = buildBufferIndices(d.graphInputs, d.graphOutputs,
+                                       d.kernels);
+  KernelIO kio  = resolveKernelIO(d.kernels, tbl);
+  validateGraph(d.kernels, d.graphOutputs, tbl, kio);
+  std::vector<int> order = topoSort(kio, static_cast<int>(d.kernels.size()));
   SlotPlan slots = planSlots(order, tbl, kio);
 
-  plan_ = std::make_unique<GraphPlan>();
-  plan_->numBuffers          = tbl.numBuffers;
-  plan_->numGraphInputs      = tbl.numGraphInputs;
-  plan_->numGraphOutputs     = tbl.numGraphOutputs;
-  plan_->graphInputIdx       = std::move(tbl.graphInputIdx);
-  plan_->graphOutputIdx      = std::move(tbl.graphOutputIdx);
-  plan_->kernelInputBufs     = std::move(kio.kernelInputBufs);
-  plan_->kernelOutputBufs    = std::move(kio.kernelOutputBufs);
-  plan_->producerKernel      = std::move(kio.producerKernel);
-  plan_->launchOrder         = std::move(order);
-  plan_->intermediateBufToSlot = std::move(slots.intermediateBufToSlot);
-  plan_->peakIntermediateSlots = slots.peakIntermediateSlots;
+  plan.numBuffers          = tbl.numBuffers;
+  plan.numGraphInputs      = tbl.numGraphInputs;
+  plan.numGraphOutputs     = tbl.numGraphOutputs;
+  plan.graphInputIdx       = std::move(tbl.graphInputIdx);
+  plan.graphOutputIdx      = std::move(tbl.graphOutputIdx);
+  plan.kernelInputBufs     = std::move(kio.kernelInputBufs);
+  plan.kernelOutputBufs    = std::move(kio.kernelOutputBufs);
+  plan.producerKernel      = std::move(kio.producerKernel);
+  plan.launchOrder         = std::move(order);
+  plan.intermediateBufToSlot = std::move(slots.intermediateBufToSlot);
+  plan.peakIntermediateSlots = slots.peakIntermediateSlots;
 
   // ── Per-kernel I/O arity validation ──────────────────────────────
   // Catches mis-wired external kernels (which have a fixed signature)
   // at construction time, well before the launch path.
-  validateKernelIO(data_.kernels,
-                    plan_->kernelInputBufs, plan_->kernelOutputBufs);
+  validateKernelIO(d.kernels, plan.kernelInputBufs, plan.kernelOutputBufs);
 
   // ── Load cubin(s) + resolve every kernel symbol ──────────────────
-  loadJitCubin(data_, cuModule_);
-  loadExternalCsPtxIfNeeded(data_.kernels, csRankModule_, csScaleModule_);
-
-  cuFuncs_.resize(data_.kernels.size(), nullptr);
-  for (size_t i = 0; i < data_.kernels.size(); ++i) {
-    cuFuncs_[i] = resolveOneKernelSymbol(data_.kernels[i],
-                                          cuModule_, csRankModule_,
-                                          csScaleModule_);
+  try {
+    loadJitCubin(d, cuModule);
+    loadExternalCsPtxIfNeeded(d.kernels, csRankModule, csScaleModule);
+
+    cuFuncs.resize(d.kernels.size(), nullptr);
+    for (size_t i = 0; i < d.kernels.size(); ++i) {
+      cuFuncs[i] = resolveOneKernelSymbol(d.kernels[i],
+                                          cuModule, csRankModule,
+                                          csScaleModule);
+    }
+
+    // ── Opt external kernels into the device's full dynamic smem cap ──
+    optInExternalSmemMax(d.kernels, cuFuncs);
+  } catch (...) {
+    if (cuModule)
+      cuModuleUnload(cuModule);
+    if (csRankModule)
+      cuModuleUnload(csRankModule);
+    if (csScaleModule)
+      cuModuleUnload(csScaleModule);
+    cuModule = nullptr;
+    csRankModule = nullptr;
+    csScaleModule = nullptr;
+    throw;
   }
+}
+
+LoadedExecutable::~LoadedExecutable() noexcept {
+  if (cuModule)
+    cuModuleUnload(cuModule);
+  if (csRankModule)
+    cuModuleUnload(csRankModule);
+  if (csScaleModule)
+    cuModuleUnload(csScaleModule);
+}
+
+Executable::Executable(std::shared_ptr<const ExecutableData> data)
+    : data_(std::move(data)),
+      loaded_(std::make_shared<LoadedExecutable>(data_)) {}
+
+Executable::Executable(std::shared_ptr<const ExecutableData> data,
+                       std::shared_ptr<LoadedExecutable> loaded)
+    : data_(std::move(data)), loaded_(std::move(loaded)) {
+  if (!data_)
+    throw std::runtime_error(
+        "kun_cuda::Executable: ExecutableData pointer is null");
+  if (!loaded_)
+    throw std::runtime_error(
+        "kun_cuda::Executable: LoadedExecutable pointer is null");
+}
 
-  // ── Opt external kernels into the device's full dynamic smem cap ──
-  optInExternalSmemMax(data_.kernels, cuFuncs_);
+std::unique_ptr<Executable> Executable::clone() const {
+  return std::unique_ptr<Executable>(
+      new Executable(data_, loaded_));
 }
 
 Executable::~Executable() {
@@ -1064,12 +1110,6 @@ Executable::~Executable() {
   // out of a destructor.
   resetCudaGraphState();
   freeSlotPool();
-  if (cuModule_)
-    cuModuleUnload(cuModule_);
-  if (csRankModule_)
-    cuModuleUnload(csRankModule_);
-  if (csScaleModule_)
-    cuModuleUnload(csScaleModule_);
 }
 
 void Executable::freeSlotPool() {
@@ -1082,19 +1122,19 @@ void Executable::freeSlotPool() {
 
 void Executable::ensureSlotPool(int64_t timeLength, int64_t numStocks) {
   if (timeLength == cachedT_ && numStocks == cachedS_ &&
-      static_cast<int>(slotBufs_.size()) == plan_->peakIntermediateSlots)
+      static_cast<int>(slotBufs_.size()) == loaded_->plan.peakIntermediateSlots)
     return;
   freeSlotPool();
-  if (plan_->peakIntermediateSlots == 0) {
+  if (loaded_->plan.peakIntermediateSlots == 0) {
     cachedT_ = timeLength;
     cachedS_ = numStocks;
     return;
   }
   size_t bytesPerSlot = static_cast<size_t>(timeLength) *
                           static_cast<size_t>(numStocks) *
-                          bytesPerElem(data_.dtype);
-  slotBufs_.resize(plan_->peakIntermediateSlots, 0);
-  for (int i = 0; i < plan_->peakIntermediateSlots; ++i) {
+                          bytesPerElem(data_->dtype);
+  slotBufs_.resize(loaded_->plan.peakIntermediateSlots, 0);
+  for (int i = 0; i < loaded_->plan.peakIntermediateSlots; ++i) {
     CUdeviceptr p = 0;
     checkCu(cuMemAlloc(&p, bytesPerSlot), "cuMemAlloc(intermediate slot)");
     slotBufs_[i] = static_cast<uintptr_t>(p);
@@ -1108,11 +1148,11 @@ void Executable::ensureSlotPool(int64_t timeLength, int64_t numStocks) {
 //===----------------------------------------------------------------------===//
 
 const std::vector<int> &Executable::launchOrder() const noexcept {
-  return plan_->launchOrder;
+  return loaded_->plan.launchOrder;
 }
-int Executable::numBuffers() const noexcept { return plan_->numBuffers; }
+int Executable::numBuffers() const noexcept { return loaded_->plan.numBuffers; }
 int Executable::peakIntermediateSlots() const noexcept {
-  return plan_->peakIntermediateSlots;
+  return loaded_->plan.peakIntermediateSlots;
 }
 
 void Executable::launchOnStream(
@@ -1127,7 +1167,7 @@ void Executable::launchOnStream(
     throw std::runtime_error(
         "kun_cuda::launchOnStream: Executor pointer is null");
 
-  validateLaunchInputs(data_, timeLength, numStocks, mask);
+  validateLaunchInputs(*data_, timeLength, numStocks, mask);
 
   if (mode == LaunchMode::CudaGraph) {
     launchCudaGraphOnStream(exec, timeLength, numStocks, args,
@@ -1144,10 +1184,10 @@ void Executable::launchOnStream(
 
   // ── Map user args + slot pool into a flat buffer-index → ptr ─────
   const std::vector<uintptr_t> bufPtrs =
-      resolveBufferPointers(*plan_, data_, args, slotBufs_);
+      resolveBufferPointers(loaded_->plan, *data_, args, slotBufs_);
 
   std::vector<KernelLaunchDesc> descs =
-      buildKernelLaunchDescs(*plan_, data_, cuFuncs_,
+      buildKernelLaunchDescs(loaded_->plan, *data_, loaded_->cuFuncs,
                              timeLength, numStocks, bufPtrs,
                              mask, minChunkWarmupFactor, smFillFactor,
                              devMaxSmemBytes, numSMs);
diff --git a/mlir/lib/KunCuda/RuntimeCudaGraph.cpp b/mlir/lib/KunCuda/RuntimeCudaGraph.cpp
index 8fcb285..8162aaa 100644
--- a/mlir/lib/KunCuda/RuntimeCudaGraph.cpp
+++ b/mlir/lib/KunCuda/RuntimeCudaGraph.cpp
@@ -341,16 +341,18 @@ void Executable::launchCudaGraphOnStream(
     ensureNoInFlight(*cudaGraphState_, "rebuilding");
     resetCudaGraphState();
     cudaGraphState_ = std::make_unique<CudaGraphLaunchState>();
-    buildCudaGraphState(*plan_, data_, cuFuncs_, *cudaGraphState_,
+    buildCudaGraphState(loaded_->plan, *data_, loaded_->cuFuncs,
+                        *cudaGraphState_,
                         exec, timeLength, numStocks, args,
                         mask, minChunkWarmupFactor, smFillFactor);
   } else {
     ensureNoInFlight(*cudaGraphState_, "updating");
     CudaGraphLaunchParams launch = makeLaunchParams(
-        *plan_, data_, *cudaGraphState_, exec, timeLength, numStocks, args,
+        loaded_->plan, *data_, *cudaGraphState_, exec, timeLength, numStocks, args,
         mask, minChunkWarmupFactor, smFillFactor);
     if (!sameLaunchParams(*cudaGraphState_->cachedLaunchParams, launch)) {
-      updateCudaGraphKernelParams(*plan_, data_, cuFuncs_, *cudaGraphState_,
+      updateCudaGraphKernelParams(loaded_->plan, *data_, loaded_->cuFuncs,
+                                  *cudaGraphState_,
                                   *cudaGraphState_->cachedLaunchParams,
                                   launch);
       cudaGraphState_->cachedLaunchParams = std::move(launch);
diff --git a/mlir/lib/KunCuda/RuntimeUtil.h b/mlir/lib/KunCuda/RuntimeUtil.h
index a8606c9..c827883 100644
--- a/mlir/lib/KunCuda/RuntimeUtil.h
+++ b/mlir/lib/KunCuda/RuntimeUtil.h
@@ -12,6 +12,7 @@
 #include <cuda.h>
 
 #include <cstdint>
+#include <memory>
 #include <optional>
 #include <string>
 #include <unordered_map>
@@ -46,6 +47,27 @@ struct GraphPlan {
   int peakIntermediateSlots = 0;
 };
 
+// Context-local immutable runtime state shared by cloned Executables.
+// Per-Executable mutable state (intermediate slot buffers and CUDA Graph
+// launch cache) intentionally stays on Executable.
+struct LoadedExecutable {
+  explicit LoadedExecutable(std::shared_ptr<const ExecutableData> data);
+  ~LoadedExecutable() noexcept;
+
+  LoadedExecutable(const LoadedExecutable &) = delete;
+  LoadedExecutable &operator=(const LoadedExecutable &) = delete;
+  LoadedExecutable(LoadedExecutable &&) = delete;
+  LoadedExecutable &operator=(LoadedExecutable &&) = delete;
+
+  std::shared_ptr<const ExecutableData> data;
+  GraphPlan plan;
+
+  CUmodule cuModule = nullptr;
+  CUmodule csRankModule = nullptr;
+  CUmodule csScaleModule = nullptr;
+  std::vector<CUfunction> cuFuncs;  ///< parallel to data->kernels
+};
+
 struct ChunkPlan {
   int64_t chunkSize = 0;
   unsigned numChunks = 1;
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index ffa6fc6..12afda2 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -416,17 +416,17 @@ pyCompile(PyModule &pm,
   opts.optLevel    = optLevel;
   opts.toolkitPath = toolkitPath;
 
-  kun_cuda::ExecutableData data;
-  if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), opts, data)))
+  auto data = std::make_shared<kun_cuda::ExecutableData>();
+  if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), opts, *data)))
     throw std::runtime_error("KunMLIR.compile failed");
 
   // Append external (pre-compiled, runtime-dispatched) kernels.  The
   // MLIR pipeline never saw them; they're fabricated here from the
   // descriptor list the Python frontend collected.
   for (nb::handle obj : externalKernels)
-    data.kernels.push_back(parseExternalKernel(obj));
+    data->kernels.push_back(parseExternalKernel(obj));
 
-  if (data.kernels.empty())
+  if (data->kernels.empty())
     throw std::runtime_error(
         "KunMLIR.compile: no kernels (neither MLIR-emitted nor "
         "external) — refusing to build an empty Executable");
@@ -437,24 +437,24 @@ pyCompile(PyModule &pm,
   // kernels they fix warpsPerCta via their kungpu.target_spec, and we
   // trust that over the parameter (and ignore the parameter).
   bool anyJit = false;
-  for (const auto &k : data.kernels)
+  for (const auto &k : data->kernels)
     if (k.kind == kun_cuda::KernelKind::Jit) { anyJit = true; break; }
   if (!anyJit) {
     if (warpsPerCta <= 0)
       throw std::runtime_error(
           "KunMLIR.compile: warps_per_cta must be positive when every "
           "kernel is external; got " + std::to_string(warpsPerCta));
-    data.warpsPerCta = warpsPerCta;
+    data->warpsPerCta = warpsPerCta;
   }
 
   // Graph topology is a runtime concern — fill it in here, just before
   // handing off to Executable's ctor (which validates + plans).
-  data.graphInputs  = graphInputs;
-  data.graphOutputs = graphOutputs;
+  data->graphInputs  = graphInputs;
+  data->graphOutputs = graphOutputs;
   for (auto item : outputUnreliable) {
     auto name = nb::cast<std::string>(item.first);
     auto val  = nb::cast<int64_t>(item.second);
-    data.outputUnreliable[name] = val;
+    data->outputUnreliable[name] = val;
   }
   return std::make_unique<kun_cuda::Executable>(std::move(data));
 }
@@ -529,6 +529,12 @@ NB_MODULE(KunMLIR, m) {
               const auto &b = e.data().cubin;
               return nb::bytes(b.data(), b.size());
             })
+      .def("clone",
+            [](const kun_cuda::Executable &e) {
+              return e.clone();
+            },
+            "Return a new Executable with independent launch state while "
+            "sharing immutable compile data and loaded CUDA modules.")
       .def("getOutputUnreliableCount",
             &kun_cuda::Executable::outputUnreliable,
             nb::rv_policy::reference_internal,
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index 2b31f96..1c4a9f1 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -93,6 +93,11 @@ def main() -> int:
     assert exe.warps_per_cta == 4
     assert exe.vector_size   == 1
 
+    clone = exe.clone()
+    assert clone.kernel_names == exe.kernel_names
+    assert clone.input_names  == exe.input_names
+    assert clone.output_names == exe.output_names
+
     # Run the kernel for two num_stocks values:
     #  - one that's a multiple of (warps_per_cta * 32 * vector_size) — no
     #    tail block;
@@ -101,9 +106,10 @@ def main() -> int:
     block_x = exe.warps_per_cta * 32 * exe.vector_size
     rng = np.random.default_rng(0)
     rc = 0
-    for label, S in [("aligned", args.num_stocks),
-                      ("unaligned (tail block)",
-                       args.num_stocks + (block_x // 2 + 7))]:
+    for run_exe, label, S in [
+            (exe, "aligned", args.num_stocks),
+            (clone, "unaligned clone (tail block)",
+             args.num_stocks + (block_x // 2 + 7))]:
         T = args.time_length
         print()
         is_aligned = (S % block_x == 0)
@@ -116,7 +122,7 @@ def main() -> int:
         b   = cp.asarray(b_h)
         out = cp.zeros((T, S), dtype=cp.float32)
         executor = KunMLIR.Executor()
-        executor.runGraph(exe,
+        executor.runGraph(run_exe,
                           inputs={"a": a, "b": b},
                           outputs={"sum": out})
         # No explicit synchronize: default-stream Executor + cupy's

From d328fd925f0a80cbb76e94154348b456ae917711 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Sun, 24 May 2026 23:20:56 -0700
Subject: [PATCH 51/59] dump to file

---
 .gitmodules                         |   3 +
 3rdparty/nlohmann_json              |   1 +
 mlir/include/KunCuda/Runtime.h      |  16 +-
 mlir/lib/KunCuda/CMakeLists.txt     |  12 +-
 mlir/lib/KunCuda/ExecutableData.cpp | 449 ++++++++++++++++++++++++++++
 mlir/lib/Python/MlirBinding.cpp     |  19 +-
 mlir/test/python/test_kun_mlir.py   |  25 +-
 7 files changed, 515 insertions(+), 10 deletions(-)
 create mode 160000 3rdparty/nlohmann_json
 create mode 100644 mlir/lib/KunCuda/ExecutableData.cpp

diff --git a/.gitmodules b/.gitmodules
index 5557432..463cb32 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "3rdparty/nanobind"]
 	path = 3rdparty/nanobind
 	url = https://github.com/wjakob/nanobind
+[submodule "3rdparty/nlohmann_json"]
+	path = 3rdparty/nlohmann_json
+	url = https://github.com/nlohmann/json
diff --git a/3rdparty/nlohmann_json b/3rdparty/nlohmann_json
new file mode 160000
index 0000000..484483a
--- /dev/null
+++ b/3rdparty/nlohmann_json
@@ -0,0 +1 @@
+Subproject commit 484483acad6d562306efc9b3c6d413404f1b1f8a
diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
index a070f7f..7c82037 100644
--- a/mlir/include/KunCuda/Runtime.h
+++ b/mlir/include/KunCuda/Runtime.h
@@ -35,9 +35,9 @@
 #pragma once
 
 #include <cstdint>
-#include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -150,7 +150,16 @@ struct ExecutableData {
   /// leading time steps of each Output buffer to skip.  Populated by
   /// the Python frontend (which has the pre-partition `infer_window`
   /// snapshot); empty when not supplied.
-  std::map<std::string, int64_t> outputUnreliable;
+  std::unordered_map<std::string, int64_t> outputUnreliable;
+
+  /// Write this artifact as `<dir>/<name>.json` plus `<dir>/<name>.cubin`.
+  /// The JSON metadata stores only the sibling cubin filename, never an
+  /// arbitrary cubin path.
+  void saveToFiles(const std::string &dir, const std::string &name) const;
+
+  /// Load `<dir>/<name>.json` and `<dir>/<name>.cubin` into a new data object.
+  static std::shared_ptr<ExecutableData>
+  loadFromFiles(const std::string &dir, const std::string &name);
 };
 
 //===----------------------------------------------------------------------===//
@@ -200,7 +209,8 @@ class Executable {
   int64_t vectorSize()  const noexcept { return data_->vectorSize; }
   Datatype dtype()      const noexcept { return data_->dtype; }
   size_t  numKernels()  const noexcept { return data_->kernels.size(); }
-  const std::map<std::string, int64_t> &outputUnreliable() const noexcept {
+  const std::unordered_map<std::string, int64_t> &
+  outputUnreliable() const noexcept {
     return data_->outputUnreliable;
   }
 
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index 9b2374b..4fc23d0 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -1,7 +1,5 @@
-# libKunCudaRuntime.so — pure CUDA runtime, decoupled from the MLIR
-# compiler library and the Python binding.  Holds the ExecutableData
-# struct, the Executable class (cuModuleLoadData / cuLaunchKernel) and
-# nothing else.
+# libKunCudaRuntime.so — CUDA runtime, decoupled from the MLIR compiler
+# library and the Python binding.
 
 # ── Locate the CUDA toolkit ──────────────────────────────────────────
 # Standard CMake CUDA discovery.  Honours, in order:
@@ -95,6 +93,7 @@ foreach(_cu IN LISTS _kun_cu_sources)
 endforeach()
 
 add_library(KunCudaRuntime SHARED
+    ExecutableData.cpp
     Runtime.cpp
     RuntimeCudaGraph.cpp
     ${_kun_ptx_embed_includes})
@@ -120,6 +119,11 @@ target_include_directories(KunCudaRuntime PUBLIC
 target_include_directories(KunCudaRuntime PRIVATE
     "${CMAKE_CURRENT_BINARY_DIR}")
 
+# ExecutableData.cpp uses nlohmann/json for artifact metadata.  This is a
+# header-only third-party dependency kept private to the runtime library.
+target_include_directories(KunCudaRuntime SYSTEM PRIVATE
+    "${PROJECT_SOURCE_DIR}/3rdparty/nlohmann_json/include")
+
 # CUDA Driver API.  `CUDA::cuda_driver` is FindCUDAToolkit's imported
 # target wrapping `lib64/stubs/libcuda.so` with the right INCLUDE
 # INTERFACE — gets cuda.h + the link-time stub in one go.  This dep
diff --git a/mlir/lib/KunCuda/ExecutableData.cpp b/mlir/lib/KunCuda/ExecutableData.cpp
new file mode 100644
index 0000000..b47c333
--- /dev/null
+++ b/mlir/lib/KunCuda/ExecutableData.cpp
@@ -0,0 +1,449 @@
+//===- ExecutableData.cpp - serialize kun_cuda executable artifacts -------===//
+//
+// Stores ExecutableData as a JSON metadata file plus a sibling cubin binary.
+// The public API is name-based: callers provide only a directory and artifact
+// name, and the implementation owns the `<name>.json` / `<name>.cubin`
+// convention.
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunCuda/Runtime.h"
+
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <filesystem>
+#include <fstream>
+#include <iterator>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace kun_cuda {
+namespace {
+
+using json = nlohmann::ordered_json;
+namespace fs = std::filesystem;
+
+constexpr const char *kFormat = "kun_cuda_executable_data";
+constexpr int64_t kVersion = 1;
+
+// Throws std::runtime_error if `name` is empty or contains '/' or a backslash.
+static void validateArtifactName(const std::string &name) {
+  if (name.empty())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: artifact name must be non-empty");
+  if (name.find_first_of("/\\") != std::string::npos)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: artifact name must not contain path "
+        "separators");
+}
+
+static std::string joinArtifactPath(const std::string &dir,
+                                    const std::string &fileName) {
+  fs::path path(dir);
+  path /= fileName;
+  return path.string();
+}
+
+static std::string jsonFileName(const std::string &name) {
+  return name + ".json";
+}
+
+static std::string cubinFileName(const std::string &name) {
+  return name + ".cubin";
+}
+
+static void ensureDirectory(const std::string &dir) {
+  if (dir.empty())
+    return;
+
+  std::error_code ec;
+  fs::create_directories(dir, ec);
+  if (ec)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to create directory '" + dir +
+        "': " + ec.message());
+}
+
+// Writes `text` verbatim; flushes so buffered write errors throw here.
+static void writeTextFile(const std::string &path, const std::string &text) {
+  std::ofstream os(path, std::ios::binary);
+  if (!os)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path + "' for writing");
+  os.write(text.data(), static_cast<std::streamsize>(text.size()));
+  if (!os.flush())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to write '" + path + "'");
+}
+
+// Writes `bytes` verbatim; flushes so buffered write errors throw here.
+static void writeBinaryFile(const std::string &path,
+                            const std::vector<char> &bytes) {
+  std::ofstream os(path, std::ios::binary);
+  if (!os)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path + "' for writing");
+  if (!bytes.empty())
+    os.write(bytes.data(), static_cast<std::streamsize>(bytes.size()));
+  if (!os.flush())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to write '" + path + "'");
+}
+
+static std::string readTextFile(const std::string &path) {
+  std::ifstream is(path, std::ios::binary);
+  if (!is)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path +
+        "' for reading");
+  std::string text((std::istreambuf_iterator<char>(is)),
+                   std::istreambuf_iterator<char>());
+  if (!is.eof() && !is)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to read '" + path + "'");
+  return text;
+}
+
+static std::vector<char> readBinaryFile(const std::string &path) {
+  std::ifstream is(path, std::ios::binary);
+  if (!is)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path +
+        "' for reading");
+  std::vector<char> bytes((std::istreambuf_iterator<char>(is)),
+                          std::istreambuf_iterator<char>());
+  if (!is.eof() && !is)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to read '" + path + "'");
+  return bytes;
+}
+
+static const char *toString(Datatype dtype) {
+  switch (dtype) {
+  case Datatype::Float:
+    return "f32";
+  case Datatype::Double:
+    return "f64";
+  }
+  throw std::runtime_error("kun_cuda::ExecutableData: unknown datatype");
+}
+
+static Datatype parseDatatype(const std::string &text,
+                              const std::string &jsonPath) {
+  if (text == "f32")
+    return Datatype::Float;
+  if (text == "f64")
+    return Datatype::Double;
+  throw std::runtime_error(
+      "kun_cuda::ExecutableData: unsupported dtype '" + text +
+      "' in '" + jsonPath + "'");
+}
+
+static const char *toString(KernelKind kind) {
+  switch (kind) {
+  case KernelKind::Jit:
+    return "jit";
+  case KernelKind::ExtCsRankF32:
+    return "ext_cs_rank_f32";
+  case KernelKind::ExtCsRankF64:
+    return "ext_cs_rank_f64";
+  case KernelKind::ExtCsScaleF32:
+    return "ext_cs_scale_f32";
+  case KernelKind::ExtCsScaleF64:
+    return "ext_cs_scale_f64";
+  }
+  throw std::runtime_error("kun_cuda::ExecutableData: unknown kernel kind");
+}
+
+static KernelKind parseKernelKind(const std::string &text,
+                                  const std::string &jsonPath,
+                                  const std::string &fieldPath) {
+  if (text == "jit")
+    return KernelKind::Jit;
+  if (text == "ext_cs_rank_f32")
+    return KernelKind::ExtCsRankF32;
+  if (text == "ext_cs_rank_f64")
+    return KernelKind::ExtCsRankF64;
+  if (text == "ext_cs_scale_f32")
+    return KernelKind::ExtCsScaleF32;
+  if (text == "ext_cs_scale_f64")
+    return KernelKind::ExtCsScaleF64;
+  throw std::runtime_error(
+      "kun_cuda::ExecutableData: unsupported kernel kind '" + text +
+      "' at " + fieldPath + " in '" + jsonPath + "'");
+}
+
+static const json &requireObject(const json &value,
+                                 const std::string &jsonPath,
+                                 const std::string &fieldPath) {
+  if (!value.is_object())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected object at " + fieldPath +
+        " in '" + jsonPath + "'");
+  return value;
+}
+
+static const json &requireField(const json &object,
+                                const std::string &jsonPath,
+                                const std::string &fieldPath,
+                                const char *fieldName) {
+  requireObject(object, jsonPath, fieldPath);
+  auto it = object.find(fieldName);
+  if (it == object.end())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: missing field " + fieldPath + "." +
+        fieldName + " in '" + jsonPath + "'");
+  return *it;
+}
+
+static std::string getString(const json &value,
+                             const std::string &jsonPath,
+                             const std::string &fieldPath) {
+  if (!value.is_string())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected string at " + fieldPath +
+        " in '" + jsonPath + "'");
+  return value.get<std::string>();
+}
+
+static int64_t getInt64(const json &value,
+                        const std::string &jsonPath,
+                        const std::string &fieldPath) {
+  if (!value.is_number_integer())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected integer at " + fieldPath +
+        " in '" + jsonPath + "'");
+  return value.get<int64_t>();
+}
+
+static std::vector<std::string>
+getStringArray(const json &value, const std::string &jsonPath,
+               const std::string &fieldPath) {
+  if (!value.is_array())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected array at " + fieldPath +
+        " in '" + jsonPath + "'");
+
+  std::vector<std::string> result;
+  result.reserve(value.size());
+  for (size_t i = 0; i < value.size(); ++i)
+    result.push_back(getString(value[i], jsonPath,
+                               fieldPath + "[" + std::to_string(i) + "]"));
+  return result;
+}
+
+// Converts a JSON object with integer members to a name -> int64 map.
+// Throws std::runtime_error naming the field path on any type mismatch.
+static std::unordered_map<std::string, int64_t>
+getStringIntMap(const json &value, const std::string &jsonPath,
+                const std::string &fieldPath) {
+  // Shared helper raises the "expected object at <fieldPath>" error.
+  requireObject(value, jsonPath, fieldPath);
+
+  std::unordered_map<std::string, int64_t> parsed;
+  for (auto member = value.begin(); member != value.end(); ++member) {
+    const std::string path = fieldPath + "." + member.key();
+    parsed.emplace(member.key(), getInt64(member.value(), jsonPath, path));
+  }
+  return parsed;
+}
+
+static json toJSONArray(const std::vector<std::string> &strings) {
+  json array = json::array();
+  for (const std::string &s : strings)
+    array.push_back(s);
+  return array;
+}
+
+static json toJSON(const KernelMeta &kernel) {
+  json obj = json::object();
+  obj["name"] = kernel.kernelName;
+  obj["kind"] = toString(kernel.kind);
+  obj["inputs"] = toJSONArray(kernel.inputNames);
+  obj["outputs"] = toJSONArray(kernel.outputNames);
+  obj["unreliable_count"] = kernel.unreliableCount;
+  return obj;
+}
+
+static KernelMeta parseKernelMeta(const json &value,
+                                  const std::string &jsonPath,
+                                  const std::string &fieldPath) {
+  requireObject(value, jsonPath, fieldPath);
+
+  KernelMeta kernel;
+  kernel.kernelName =
+      getString(requireField(value, jsonPath, fieldPath, "name"),
+                jsonPath, fieldPath + ".name");
+  const std::string kind =
+      getString(requireField(value, jsonPath, fieldPath, "kind"),
+                jsonPath, fieldPath + ".kind");
+  kernel.kind = parseKernelKind(kind, jsonPath, fieldPath + ".kind");
+  kernel.inputNames =
+      getStringArray(requireField(value, jsonPath, fieldPath, "inputs"),
+                     jsonPath, fieldPath + ".inputs");
+  kernel.outputNames =
+      getStringArray(requireField(value, jsonPath, fieldPath, "outputs"),
+                     jsonPath, fieldPath + ".outputs");
+  kernel.unreliableCount =
+      getInt64(requireField(value, jsonPath, fieldPath, "unreliable_count"),
+               jsonPath, fieldPath + ".unreliable_count");
+  return kernel;
+}
+
+struct Metadata {
+  std::string format;
+  int64_t version = 0;
+  std::string cubin;
+  int64_t warpsPerCta = 1;
+  int64_t vectorSize = 1;
+  std::string dtype;
+  std::vector<KernelMeta> kernels;
+  std::vector<std::string> graphInputs;
+  std::vector<std::string> graphOutputs;
+  std::unordered_map<std::string, int64_t> outputUnreliable;
+};
+
+static json metadataToJSON(const ExecutableData &data,
+                           const std::string &cubinName) {
+  json kernels = json::array();
+  for (const KernelMeta &kernel : data.kernels)
+    kernels.push_back(toJSON(kernel));
+
+  std::vector<std::string> outputNames;
+  outputNames.reserve(data.outputUnreliable.size());
+  for (const auto &item : data.outputUnreliable)
+    outputNames.push_back(item.first);
+  std::sort(outputNames.begin(), outputNames.end());
+
+  json outputUnreliable = json::object();
+  for (const std::string &name : outputNames)
+    outputUnreliable[name] = data.outputUnreliable.at(name);
+
+  json obj = json::object();
+  obj["format"] = kFormat;
+  obj["version"] = kVersion;
+  obj["cubin"] = cubinName;
+  obj["warps_per_cta"] = data.warpsPerCta;
+  obj["vector_size"] = data.vectorSize;
+  obj["dtype"] = toString(data.dtype);
+  obj["kernels"] = std::move(kernels);
+  obj["graph_inputs"] = toJSONArray(data.graphInputs);
+  obj["graph_outputs"] = toJSONArray(data.graphOutputs);
+  obj["output_unreliable"] = std::move(outputUnreliable);
+  return obj;
+}
+
+static Metadata parseMetadata(const std::string &jsonPath,
+                              const std::string &jsonText) {
+  json root;
+  try {
+    root = json::parse(jsonText);
+  } catch (const json::parse_error &e) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to parse '" + jsonPath +
+        "': " + e.what());
+  }
+
+  requireObject(root, jsonPath, "$");
+
+  Metadata metadata;
+  metadata.format =
+      getString(requireField(root, jsonPath, "$", "format"),
+                jsonPath, "$.format");
+  metadata.version =
+      getInt64(requireField(root, jsonPath, "$", "version"),
+               jsonPath, "$.version");
+  metadata.cubin =
+      getString(requireField(root, jsonPath, "$", "cubin"),
+                jsonPath, "$.cubin");
+  metadata.warpsPerCta =
+      getInt64(requireField(root, jsonPath, "$", "warps_per_cta"),
+               jsonPath, "$.warps_per_cta");
+  metadata.vectorSize =
+      getInt64(requireField(root, jsonPath, "$", "vector_size"),
+               jsonPath, "$.vector_size");
+  metadata.dtype =
+      getString(requireField(root, jsonPath, "$", "dtype"),
+                jsonPath, "$.dtype");
+  metadata.graphInputs =
+      getStringArray(requireField(root, jsonPath, "$", "graph_inputs"),
+                     jsonPath, "$.graph_inputs");
+  metadata.graphOutputs =
+      getStringArray(requireField(root, jsonPath, "$", "graph_outputs"),
+                     jsonPath, "$.graph_outputs");
+  metadata.outputUnreliable =
+      getStringIntMap(requireField(root, jsonPath, "$", "output_unreliable"),
+                      jsonPath, "$.output_unreliable");
+
+  const json &kernelsValue = requireField(root, jsonPath, "$", "kernels");
+  if (!kernelsValue.is_array())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected array at $.kernels in '" +
+        jsonPath + "'");
+  metadata.kernels.reserve(kernelsValue.size());
+  for (size_t i = 0; i < kernelsValue.size(); ++i)
+    metadata.kernels.push_back(
+        parseKernelMeta(kernelsValue[i], jsonPath,
+                        "$.kernels[" + std::to_string(i) + "]"));
+
+  return metadata;
+}
+
+} // namespace
+
+void ExecutableData::saveToFiles(const std::string &dir,
+                                 const std::string &name) const {
+  validateArtifactName(name);
+  ensureDirectory(dir);
+
+  const std::string jsonName = jsonFileName(name);
+  const std::string cubinName = cubinFileName(name);
+  const std::string jsonPath = joinArtifactPath(dir, jsonName);
+  const std::string cubinPath = joinArtifactPath(dir, cubinName);
+
+  writeBinaryFile(cubinPath, cubin);
+  writeTextFile(jsonPath, metadataToJSON(*this, cubinName).dump(2) + "\n");
+}
+
+std::shared_ptr<ExecutableData>
+ExecutableData::loadFromFiles(const std::string &dir,
+                              const std::string &name) {
+  validateArtifactName(name);
+
+  const std::string jsonName = jsonFileName(name);
+  const std::string cubinName = cubinFileName(name);
+  const std::string jsonPath = joinArtifactPath(dir, jsonName);
+  const std::string cubinPath = joinArtifactPath(dir, cubinName);
+
+  Metadata metadata = parseMetadata(jsonPath, readTextFile(jsonPath));
+  if (metadata.format != kFormat)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: unsupported metadata format '" +
+        metadata.format + "' in '" + jsonPath + "'");
+  if (metadata.version != kVersion)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: unsupported metadata version " +
+        std::to_string(metadata.version) + " in '" + jsonPath + "'");
+  if (metadata.cubin != cubinName)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: metadata cubin field in '" + jsonPath +
+        "' must be '" + cubinName + "'");
+
+  auto data = std::make_shared<ExecutableData>();
+  data->cubin = readBinaryFile(cubinPath);
+  data->warpsPerCta = metadata.warpsPerCta;
+  data->vectorSize = metadata.vectorSize;
+  data->dtype = parseDatatype(metadata.dtype, jsonPath);
+  data->kernels = std::move(metadata.kernels);
+  data->graphInputs = std::move(metadata.graphInputs);
+  data->graphOutputs = std::move(metadata.graphOutputs);
+  data->outputUnreliable = std::move(metadata.outputUnreliable);
+  return data;
+}
+
+} // namespace kun_cuda
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
index 12afda2..7a4f279 100644
--- a/mlir/lib/Python/MlirBinding.cpp
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -16,8 +16,8 @@
 
 #include <nanobind/nanobind.h>
 #include <nanobind/ndarray.h>
-#include <nanobind/stl/map.h>
 #include <nanobind/stl/string.h>
+#include <nanobind/stl/unordered_map.h>
 #include <nanobind/stl/vector.h>
 #include <nanobind/stl/unique_ptr.h>
 
@@ -459,6 +459,12 @@ pyCompile(PyModule &pm,
   return std::make_unique<kun_cuda::Executable>(std::move(data));
 }
 
+static std::unique_ptr<kun_cuda::Executable>
+pyLoadExecutable(const std::string &dir, const std::string &name) {
+  return std::make_unique<kun_cuda::Executable>(
+      kun_cuda::ExecutableData::loadFromFiles(dir, name));
+}
+
 } // namespace
 
 NB_MODULE(KunMLIR, m) {
@@ -491,6 +497,10 @@ NB_MODULE(KunMLIR, m) {
          "compile path goes straight to cubin.");
 
   nb::class_<kun_cuda::Executable>(m, "Executable")
+      .def_static("load_from_files", &pyLoadExecutable,
+            nb::arg("dir"), nb::arg("name"),
+            "Load an Executable from `<dir>/<name>.json` and "
+            "`<dir>/<name>.cubin`.")
       .def_prop_ro("input_names",   &kun_cuda::Executable::graphInputs,
             "Graph-level input names — match this against the keys of the "
             "args dict you pass to launch().")
@@ -535,6 +545,13 @@ NB_MODULE(KunMLIR, m) {
             },
             "Return a new Executable with independent launch state while "
             "sharing immutable compile data and loaded CUDA modules.")
+      .def("save_to_files",
+            [](const kun_cuda::Executable &e, const std::string &dir,
+               const std::string &name) {
+              e.data().saveToFiles(dir, name);
+            },
+            nb::arg("dir"), nb::arg("name"),
+            "Write `<dir>/<name>.json` and `<dir>/<name>.cubin`.")
       .def("getOutputUnreliableCount",
             &kun_cuda::Executable::outputUnreliable,
             nb::rv_policy::reference_internal,
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index 1c4a9f1..c771b08 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -10,8 +10,11 @@
 
 from __future__ import annotations
 import argparse
+import json
 import sys
+import tempfile
 import textwrap
+from pathlib import Path
 
 
 SAMPLE_KUNIR = textwrap.dedent("""
@@ -98,7 +101,24 @@ def main() -> int:
     assert clone.input_names  == exe.input_names
     assert clone.output_names == exe.output_names
 
-    # Run the kernel for two num_stocks values:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        exe.save_to_files(tmpdir, "addsum")
+        metadata_path = Path(tmpdir) / "addsum.json"
+        cubin_path = Path(tmpdir) / "addsum.cubin"
+        assert metadata_path.exists()
+        assert cubin_path.exists()
+        with metadata_path.open("r", encoding="utf-8") as f:
+            metadata = json.load(f)
+        assert metadata["format"] == "kun_cuda_executable_data"
+        assert metadata["version"] == 1
+        assert metadata["cubin"] == "addsum.cubin"
+        loaded = KunMLIR.Executable.load_from_files(tmpdir, "addsum")
+    assert loaded.kernel_names == exe.kernel_names
+    assert loaded.input_names  == exe.input_names
+    assert loaded.output_names == exe.output_names
+
+    # Run original, cloned, and file-loaded executables over two num_stocks
+    # values:
     #  - one that's a multiple of (warps_per_cta * 32 * vector_size) — no
     #    tail block;
     #  - one that isn't — exercises the active-thread guard inserted by
@@ -109,7 +129,8 @@ def main() -> int:
     for run_exe, label, S in [
             (exe, "aligned", args.num_stocks),
             (clone, "unaligned clone (tail block)",
-             args.num_stocks + (block_x // 2 + 7))]:
+             args.num_stocks + (block_x // 2 + 7)),
+            (loaded, "loaded from files", args.num_stocks)]:
         T = args.time_length
         print()
         is_aligned = (S % block_x == 0)

From fb8c8cb2e031fbf740e0a3da253e0fb5374dd9c5 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 25 May 2026 00:44:42 -0700
Subject: [PATCH 52/59] lit driven tests

---
 CMakeLists.txt                           | 13 ++++-
 KunQuant/jit/env.py                      | 62 ++++++++++++++++++++++++
 mlir/lib/KunCuda/CMakeLists.txt          | 24 ++-------
 mlir/lib/Python/CMakeLists.txt           |  9 +---
 mlir/test/lit.cfg.py                     | 58 +++++++++++++++++++++-
 mlir/test/lit.site.cfg.py.in             |  4 ++
 mlir/test/python/lit.local.cfg           |  2 +
 mlir/test/python/test_cs_rank_cuda.py    |  6 ++-
 mlir/test/python/test_kun_mlir.py        | 24 ++++++---
 mlir/test/python/test_kun_to_cuda.py     |  6 ++-
 mlir/test/python/test_multi_kernel.py    | 13 +++--
 mlir/test/python/test_validation_cuda.py |  6 ++-
 mlir/test/python/test_windowed_temp.py   | 10 +++-
 mlir/test/python/utils.py                | 21 ++++++++
 14 files changed, 212 insertions(+), 46 deletions(-)
 create mode 100644 mlir/test/python/lit.local.cfg
 create mode 100644 mlir/test/python/utils.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8374b2b..bc68754 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,9 +116,20 @@ add_custom_target(TestingTargets DEPENDS KunCApiTest KunTest KunRunner)
 option(KUN_BUILD_MLIR "Build MLIR backend with kunir/kungpu dialects" OFF)
 if(KUN_BUILD_MLIR)
   find_package(MLIR REQUIRED CONFIG)
+  find_package(CUDAToolkit REQUIRED)
   message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
   message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
 
+  if(NOT CMAKE_CUDA_COMPILER)
+    set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}" CACHE FILEPATH
+        "nvcc used by CMake's CUDA-language support.")
+  endif()
+  get_filename_component(KUN_CUDA_TOOLKIT_ROOT
+                         "${CUDAToolkit_BIN_DIR}" DIRECTORY)
+  message(STATUS
+      "KunQuant MLIR CUDA toolkit = ${KUN_CUDA_TOOLKIT_ROOT} "
+      "(version ${CUDAToolkit_VERSION})")
+
   set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/bin)
   set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/lib)
   set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
@@ -132,4 +143,4 @@ if(KUN_BUILD_MLIR)
   include(HandleLLVMOptions)
 
   add_subdirectory(mlir)
-endif()
\ No newline at end of file
+endif()
diff --git a/KunQuant/jit/env.py b/KunQuant/jit/env.py
index d0975c3..c40cc6d 100644
--- a/KunQuant/jit/env.py
+++ b/KunQuant/jit/env.py
@@ -104,3 +104,65 @@ def get_compiler_env():
                 print("Reset env", "PATH+=", extra_path, "INCLUDE=", env['INCLUDE'], "LIB=", env['LIB'])
     _env = env
     return env
+
+
+def _format_cuda_sm(major: int, minor: int) -> str:
+    return f"sm_{int(major)}{int(minor)}"
+
+
+def _format_cuda_sm_from_capability(capability) -> str:
+    """Normalize (major, minor) pairs and '75' / '7.5' / 'sm_75' to `sm_xx`."""
+    if isinstance(capability, (tuple, list)):
+        if len(capability) != 2:
+            raise ValueError(f"unexpected CUDA capability tuple: {capability!r}")
+        return _format_cuda_sm(capability[0], capability[1])
+    text = str(capability).strip().lower()
+    if text.startswith("sm_"):
+        text = text[3:]
+    text = text.replace(".", "")
+    if not text or not text.isdigit():
+        raise ValueError(f"unexpected CUDA capability value: {capability!r}")
+    return f"sm_{text}"
+
+
+def get_cuda_compute_capability() -> str:
+    """Return the current CUDA device architecture as `sm_xx`.
+
+    CuPy is preferred because the CUDA JIT path already uses CuPy arrays in
+    tests and examples.  PyTorch is used as a fallback when CuPy is not
+    available or cannot query a CUDA device.
+    """
+    errors = []
+
+    try:
+        import cupy as cp
+        dev = cp.cuda.Device()
+        capability = getattr(dev, "compute_capability", None)
+        if capability is not None:
+            return _format_cuda_sm_from_capability(capability)
+
+        props = cp.cuda.runtime.getDeviceProperties(dev.id)
+        return _format_cuda_sm(props["major"], props["minor"])
+    except Exception as e:
+        errors.append(f"cupy: {type(e).__name__}: {e}")
+
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            raise RuntimeError("torch.cuda.is_available() is false")
+        major, minor = torch.cuda.get_device_capability()
+        return _format_cuda_sm(major, minor)
+    except Exception as e:
+        errors.append(f"torch: {type(e).__name__}: {e}")
+
+    raise RuntimeError(
+        "Could not determine CUDA compute capability from CuPy or PyTorch: "
+        + "; ".join(errors))
+
+
+def has_cuda_device() -> bool:
+    try:
+        get_cuda_compute_capability()
+        return True
+    except RuntimeError:
+        return False
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index 4fc23d0..f289d2f 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -1,22 +1,9 @@
 # libKunCudaRuntime.so — CUDA runtime, decoupled from the MLIR compiler
 # library and the Python binding.
 
-# ── Locate the CUDA toolkit ──────────────────────────────────────────
-# Standard CMake CUDA discovery.  Honours, in order:
-#   1. -DCUDAToolkit_ROOT=<path> on the cmake command line
-#   2. $CUDAToolkit_ROOT, $CUDA_PATH, $CUDA_HOME env vars
-#   3. Standard install locations (/usr/local/cuda, …)
-# See the CMake docs for FindCUDAToolkit — same module the rest of the
-# ecosystem uses, no custom validation needed.
-#
-# We then point CMake's CUDA-language support at the same nvcc the
-# find_package result exposed, so `enable_language(CUDA)` doesn't pick
-# up a different toolkit off PATH.
-find_package(CUDAToolkit REQUIRED)
-if(NOT CMAKE_CUDA_COMPILER)
-  set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}" CACHE FILEPATH
-      "nvcc used by CMake's CUDA-language support.")
-endif()
+# ── CUDA language setup ──────────────────────────────────────────────
+# The root CMakeLists discovers CUDAToolkit once when KUN_BUILD_MLIR is
+# enabled.  Reuse its CMAKE_CUDA_COMPILER / CUDA::cuda_driver target here.
 enable_language(CUDA)
 set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
@@ -26,11 +13,6 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 # the PTX via driver-JIT (cached system-wide in ~/.nv/ComputeCache).
 set(CMAKE_CUDA_ARCHITECTURES 75)
 
-get_filename_component(_kun_toolkit_root "${CUDAToolkit_BIN_DIR}" DIRECTORY)
-message(STATUS
-    "KunCudaRuntime: CUDA toolkit = ${_kun_toolkit_root} "
-    "(version ${CUDAToolkit_VERSION})")
-
 # ── Recipe: compile `.cu` → `.ptx` → embed as a C array ──────────────
 # Drop a new `kernels/<name>.cu` and the foreach below picks it up —
 # the kernel becomes available to Runtime.cpp via
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index 23a2401..4e3aef5 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -10,11 +10,6 @@
 string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
 string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}")
 
-# Re-run FindCUDAToolkit here so CUDA::cuda_driver is an imported target
-# in this directory's scope.  KunCuda already calls it but the IMPORTED
-# target it creates is scoped to that subdirectory.
-find_package(CUDAToolkit REQUIRED)
-
 # STABLE_ABI: single .abi3.so on CPython ≥ 3.12; falls back to per-version
 # on older Pythons.  Matches the runner binding (cpp/Python).
 nanobind_add_module(KunMLIR STABLE_ABI
@@ -35,8 +30,8 @@ set_target_properties(KunMLIR PROPERTIES
 target_link_libraries(KunMLIR PRIVATE
   # cuda.h + libcuda stub — the binding's runGraph wrapper calls
   # cuMemAlloc / cuMemFree directly to back caller-omitted output
-  # buffers.  KunCudaRuntime links cuda_driver PRIVATE so we have to
-  # repeat the dependency here.
+  # buffers.  The CUDA::cuda_driver target is provided by the root
+  # KUN_BUILD_MLIR CUDAToolkit discovery.
   CUDA::cuda_driver
 
   # Compiler side
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index df3c65b..9aba20b 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -1,18 +1,72 @@
 import os
+import subprocess
 import lit.formats
 
 config.name = "KunQuant MLIR Tests"
 config.test_format = lit.formats.ShTest(True)
-config.suffixes = [".mlir"]
+config.suffixes = [".mlir", ".py"]
 
 config.test_source_root = os.path.dirname(__file__)
 config.test_exec_root = config.obj_root
 
+def prepend_env(name, entries):  # put `entries` ahead of the current value of env var `name`
+    entries = [entry for entry in entries if entry]  # new list; drops falsy (empty/None) entries
+    old = config.environment.get(name, "")
+    if old:
+        entries.append(old)
+    config.environment[name] = os.pathsep.join(entries)
+
+# Python GPU tests import the in-tree KunQuant package and load the freshly
+# built extension module from KunQuant/jit.
+prepend_env("PYTHONPATH", [config.project_source_dir])
+
+# KunMLIR.abi3.so links against the downloaded LLVM/MLIR shared libraries.
+# The CUDA toolkit path is also made explicit so both CuPy and the MLIR
+# libdevice/ptxas discovery use the same installation as CMake.
+config.environment["CUDA_PATH"] = config.cuda_toolkit_root
+config.environment["CUDA_HOME"] = config.cuda_toolkit_root
+prepend_env("PATH", [os.path.join(config.cuda_toolkit_root, "bin")])
+prepend_env("LD_LIBRARY_PATH", [
+    config.llvm_lib_dir,
+    os.path.join(config.cuda_toolkit_root, "lib"),
+    os.path.join(config.cuda_toolkit_root, "lib64"),
+    os.path.join(config.cuda_toolkit_root, "lib64", "stubs"),
+])
+
+def detect_cuda_device():  # probe in a child Python process: True iff it exits 0
+    try:
+        result = subprocess.run(
+            [config.python_executable, "-c",
+             "from KunQuant.jit.env import get_cuda_compute_capability; "
+             "print(get_cuda_compute_capability())"],
+            env=config.environment,  # includes the PYTHONPATH / CUDA paths set above
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            timeout=20,  # seconds; expiry raises TimeoutExpired, handled below
+            check=True)  # non-zero exit raises CalledProcessError, handled below
+        arch = result.stdout.strip()
+        if arch:
+            lit_config.note("CUDA device detected for Python tests: " + arch)
+        return True  # probe exited 0; reached even when it printed nothing
+    except Exception as exc:  # any probe failure is reported as "no device"
+        lit_config.note("No CUDA device detected for Python tests: " + str(exc))
+        return False
+
+if detect_cuda_device():
+    config.available_features.add("cuda-device")
+
 # Tool substitutions
 config.substitutions.append(("%kun-opt", config.kun_opt))
+config.substitutions.append(("%python", config.python_executable))
 config.substitutions.append(
     ("%FileCheck", os.path.join(config.llvm_tools_dir, "FileCheck"))
 )
 
 # Exclude non-test directories from discovery
-config.excludes = ["CMakeLists.txt", "lit.cfg.py", "lit.site.cfg.py.in"]
+config.excludes = [
+    "CMakeLists.txt",
+    "lit.cfg.py",
+    "lit.site.cfg.py.in",
+    "utils.py",
+]
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index af0d5be..b43cd5f 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -6,5 +6,9 @@ config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_BINARY_DIR@")
 config.mlir_tools_dir = lit_config.substitute("@MLIR_TOOLS_DIR@")
 config.kun_opt = lit_config.substitute("@KUN_OPT_BINARY@")
 config.obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.project_source_dir = lit_config.substitute("@PROJECT_SOURCE_DIR@")
+config.python_executable = lit_config.substitute("@PYTHON_EXECUTABLE@")
+config.llvm_lib_dir = lit_config.substitute("@LLVM_LIBRARY_DIRS@")
+config.cuda_toolkit_root = lit_config.substitute("@KUN_CUDA_TOOLKIT_ROOT@")
 
 lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")
diff --git a/mlir/test/python/lit.local.cfg b/mlir/test/python/lit.local.cfg
new file mode 100644
index 0000000..ee28185
--- /dev/null
+++ b/mlir/test/python/lit.local.cfg
@@ -0,0 +1,2 @@
+lit_config.parallelism_groups["kun_cuda_python"] = 1
+config.parallelism_group = "kun_cuda_python"
diff --git a/mlir/test/python/test_cs_rank_cuda.py b/mlir/test/python/test_cs_rank_cuda.py
index 57c49c0..fa3a26c 100644
--- a/mlir/test/python/test_cs_rank_cuda.py
+++ b/mlir/test/python/test_cs_rank_cuda.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
 """End-to-end test for the cs_rank GPU dispatch path.
 
 Cross-sectional rank (`KunQuant.Op.Rank`) is special on the GPU: it
@@ -222,12 +224,14 @@ def _run_cs_rank_mixed(target: str, T: int, S: int, *, seed: int) -> int:
 
 def main() -> int:
     ap = argparse.ArgumentParser()
-    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("--target", default=None)
     ap.add_argument("-T", "--time-length", type=int, default=8)
     ap.add_argument("-S", "--num-stocks", type=int, default=257)
     args = ap.parse_args()
 
     import cupy as cp
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
 
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
index c771b08..2486630 100644
--- a/mlir/test/python/test_kun_mlir.py
+++ b/mlir/test/python/test_kun_mlir.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python3
+# RUN: %python %s
 """End-to-end test for the `KunMLIR` Python bindings.
 
   parse → to_string → lower_to_ptx (debug only) → compile → launch
 
 Usage:
     PATH=$CUDA_BIN:$PATH PYTHONPATH=<build>/mlir/lib/Python \
-        kun python test_kun_mlir.py [--target sm_120]
+        kun python test_kun_mlir.py [--target sm_xx]
 """
 
 from __future__ import annotations
@@ -33,21 +34,18 @@
 
 def main() -> int:
     ap = argparse.ArgumentParser()
-    ap.add_argument("--target", default="sm_120",
+    ap.add_argument("--target", default=None,
                      help="GPU compute capability (e.g. sm_120, sm_90, sm_80)")
     ap.add_argument("-T", "--time-length", type=int, default=64)
     ap.add_argument("-S", "--num-stocks", type=int, default=2048)
     args = ap.parse_args()
 
     from KunQuant.jit import KunMLIR
-    import cupy as cp
     import numpy as np
     from KunQuant.jit.cuda import find_cuda_toolkit
+    from utils import resolve_cuda_compute_capability
 
-    # Force-initialise the CUDA driver + create the primary context now,
-    # so subsequent KunMLIR.compile() / Executor.runGraph() find one.
-    cp.cuda.Device(0).use()
-    _ = cp.zeros((1,), dtype=cp.float32)
+    args.target, has_cuda_device = resolve_cuda_compute_capability(args.target)
 
     print(f"=== parse + to_string ===")
     mod = KunMLIR.parse(SAMPLE_KUNIR)
@@ -68,6 +66,18 @@ def main() -> int:
     assert "test_addsum" in ptx
     print(f"ok — produced {len(ptx)} bytes of PTX text")
 
+    if not has_cuda_device:
+        print()
+        print("skip — no CUDA device is visible; skipping Executable "
+              "construction and runGraph checks")
+        return 0
+
+    import cupy as cp
+    # Force-initialise the CUDA driver + create the primary context now,
+    # so subsequent KunMLIR.compile() / Executor.runGraph() find one.
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
     print()
     print(f"=== compile (all-in-one) ===")
     mod2 = KunMLIR.parse(SAMPLE_KUNIR)
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
index 6831e07..bc31a87 100644
--- a/mlir/test/python/test_kun_to_cuda.py
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
 """End-to-end test for the KunQuant Python-IR → MLIR → CUDA path.
 
 Builds a KunQuant Function with the high-level Op API, runs the same
@@ -740,7 +742,7 @@ def run_library(target: str, T: int, S: int) -> int:
 
 def main() -> int:
     ap = argparse.ArgumentParser()
-    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("--target", default=None)
     # Defaults sized to comfortably trigger multi-chunk: T=128 with
     # warmup=5 (N) gives `cap_warmup = 128/(4*5) = 6` chunks; S=1024
     # gives `stock_tiles = 1024/(4*32) = 8`, so even on a small GPU
@@ -751,6 +753,8 @@ def main() -> int:
     args = ap.parse_args()
 
     import cupy as cp
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
 
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
index b87c1ad..a4cea7a 100644
--- a/mlir/test/python/test_multi_kernel.py
+++ b/mlir/test/python/test_multi_kernel.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# RUN: %python %s
+# RUN: %python %s --use-cuda-graph
+# REQUIRES: cuda-device
 """End-to-end test for the v0 multi-kernel pipeline.
 
 Builds a graph with two kernels chained through one intermediate buffer:
@@ -47,16 +50,20 @@
 
 def main() -> int:
     ap = argparse.ArgumentParser()
-    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("--target", default=None)
     ap.add_argument("-T", "--time-length", type=int, default=64)
     ap.add_argument("-S", "--num-stocks", type=int, default=2048)
     ap.add_argument("--use-cuda-graph", action="store_true")
     args = ap.parse_args()
 
     from KunQuant.jit import KunMLIR
-    import cupy as cp
     from KunQuant.jit.cuda import find_cuda_toolkit
+    from KunQuant.jit.env import get_cuda_compute_capability
+
+    args.target = args.target or get_cuda_compute_capability()
+    toolkit = find_cuda_toolkit()
 
+    import cupy as cp
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
 
@@ -66,7 +73,7 @@ def main() -> int:
                             graph_inputs=["a", "b", "c"],
                             graph_outputs=["out"],
                             gpu_arch=args.target, opt_level=3,
-                            toolkit_path=find_cuda_toolkit())
+                            toolkit_path=toolkit)
 
     print(f"  kernel_names           = {exe.kernel_names}")
     print(f"  num_kernels            = {exe.num_kernels}")
diff --git a/mlir/test/python/test_validation_cuda.py b/mlir/test/python/test_validation_cuda.py
index d1daf1c..e516e4b 100644
--- a/mlir/test/python/test_validation_cuda.py
+++ b/mlir/test/python/test_validation_cuda.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
 """Negative tests for the KunMLIR launch-time validation path.
 
 The runtime consumes every input/output via DLPack (the protocol
@@ -257,10 +259,12 @@ def run_smem_cap_tests(target):
 
 def main() -> int:
     ap = argparse.ArgumentParser()
-    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("--target", default=None)
     args = ap.parse_args()
 
     import cupy as cp
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
 
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
index 8db6034..8b83483 100644
--- a/mlir/test/python/test_windowed_temp.py
+++ b/mlir/test/python/test_windowed_temp.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
 """End-to-end test for the windowed_temp lowering across both placements
 the memory-planning pass can choose:
 
@@ -89,7 +91,6 @@ def run_one(N: int, expected_placement: str, target: str,
               warps_per_cta: int = 4, smem_size: int = 49152,
               T: int = 64, S: int = 2048) -> int:
     from KunQuant.jit import KunMLIR
-    import cupy as cp
     from KunQuant.jit.cuda import find_cuda_toolkit
 
     print(f"=== N = {N}  ({expected_placement} temp buffer) ===")
@@ -97,6 +98,7 @@ def run_one(N: int, expected_placement: str, target: str,
 
     ir = build_ir(N, warps_per_cta=warps_per_cta, smem_size=smem_size)
     mod = KunMLIR.parse(ir)
+
     exe = KunMLIR.compile(mod,
                             graph_inputs=["a", "b"],
                             graph_outputs=["out"],
@@ -105,6 +107,7 @@ def run_one(N: int, expected_placement: str, target: str,
     print(f"  kernels={exe.kernel_names}  warps_per_cta={exe.warps_per_cta}  "
            f"vector_size={exe.vector_size}  cubin={len(exe.cubin)} bytes")
 
+    import cupy as cp
     # Random input.  T must be > N so we have at least one valid window.
     if T <= N:
         T = N + 32
@@ -143,11 +146,14 @@ def run_one(N: int, expected_placement: str, target: str,
 
 def main() -> int:
     ap = argparse.ArgumentParser(description=__doc__)
-    ap.add_argument("--target", default="sm_120")
+    ap.add_argument("--target", default=None)
     ap.add_argument("-T", "--time-length", type=int, default=64)
     ap.add_argument("-S", "--num-stocks", type=int, default=2048)
     args = ap.parse_args()
 
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()
+
     import cupy as cp
     cp.cuda.Device(0).use()
     _ = cp.zeros((1,), dtype=cp.float32)
diff --git a/mlir/test/python/utils.py b/mlir/test/python/utils.py
new file mode 100644
index 0000000..603f82e
--- /dev/null
+++ b/mlir/test/python/utils.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from typing import Optional, Tuple
+
+from KunQuant.jit.env import get_cuda_compute_capability
+
+
+def resolve_cuda_compute_capability(explicit_target: Optional[str] = None,
+                                    fallback: str = "sm_80"
+                                    ) -> Tuple[str, bool]:
+    """Return `(gpu_arch, has_device)` for MLIR Python tests.
+
+    `gpu_arch` is `explicit_target` if non-empty, else the detected capability,
+    else `fallback` when detection raises `RuntimeError`.  `has_device` is
+    False only in that case; callers can then skip `Executable`/`runGraph` checks.
+    """
+    try:
+        detected = get_cuda_compute_capability()
+        return explicit_target or detected, True
+    except RuntimeError:
+        return explicit_target or fallback, False

From 360f67ad3a6f299c3f6a887beca07bf4ec7d716f Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 25 May 2026 01:22:32 -0700
Subject: [PATCH 53/59] enable ci

---
 .github/workflows/ccpp.yml | 84 +++++++++++++++++++++++++++++++++++++-
 mlir/test/CMakeLists.txt   |  1 +
 2 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index 7852876..45186da 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -110,4 +110,86 @@ jobs:
     - name: Alpha158 test
       working-directory: ./
       run: |
-        python ./tests/test_alpha158.py --inputs ./input.npz --ref ./alpha158.npz --action run_avx2
\ No newline at end of file
+        python ./tests/test_alpha158.py --inputs ./input.npz --ref ./alpha158.npz --action run_avx2
+  cuda-mlir:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+        cache: 'pip'
+    - name: Install CUDA 13.2 and build dependencies
+      run: |
+        set -eux
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends \
+          build-essential \
+          ca-certificates \
+          cmake \
+          curl \
+          git \
+          libxml2-dev \
+          libzstd-dev \
+          ninja-build \
+          pkg-config \
+          zlib1g-dev
+        curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin \
+          -o /tmp/cuda-ubuntu2404.pin
+        sudo install -m 0644 /tmp/cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600
+        curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404-keyring.gpg \
+          -o /tmp/cuda-archive-keyring.gpg
+        sudo install -m 0644 /tmp/cuda-archive-keyring.gpg /usr/share/keyrings/cuda-archive-keyring.gpg
+        echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ /" \
+          | sudo tee /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends cuda-toolkit-13-2
+        sudo ln -sf libcuda.so /usr/local/cuda-13.2/lib64/stubs/libcuda.so.1
+        /usr/local/cuda-13.2/bin/nvcc --version
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install numpy==1.26.4 lit cupy-cuda13x
+        lit --version
+    - name: Read LLVM tag
+      id: llvm_tag
+      run: |
+        tag="$(sed -e 's/#.*//' -e '/^[[:space:]]*$/d' mlir/llvm_commit.txt | head -n1 | tr -d '[:space:]')"
+        test -n "$tag"
+        echo "tag=$tag" >> "$GITHUB_OUTPUT"
+        echo "LLVM tag: $tag"
+    - name: Download prebuilt LLVM/MLIR
+      env:
+        LLVM_TAG: ${{ steps.llvm_tag.outputs.tag }}
+      run: |
+        set -eux
+        mkdir -p "$RUNNER_TEMP/llvm-mlir"
+        curl -fL --retry 3 \
+          "https://github.com/Menooker/KunQuant/releases/download/llvm-mlir-${LLVM_TAG}/llvm-mlir-install-static-${LLVM_TAG}.tar.gz" \
+          -o "$RUNNER_TEMP/llvm-mlir.tar.gz"
+        tar -xzf "$RUNNER_TEMP/llvm-mlir.tar.gz" -C "$RUNNER_TEMP/llvm-mlir" --strip-components=1
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/mlir/MLIRConfig.cmake"
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/llvm/LLVMConfig.cmake"
+        echo "LLVM_PREFIX=$RUNNER_TEMP/llvm-mlir" >> "$GITHUB_ENV"
+    - name: Configure MLIR backend
+      env:
+        CUDA_PATH: /usr/local/cuda-13.2
+        CUDA_HOME: /usr/local/cuda-13.2
+      run: |
+        cmake -S . -B build/mlir-ci -G Ninja \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DKUN_BUILD_MLIR=ON \
+          -DLLVM_DIR="$LLVM_PREFIX/lib/cmake/llvm" \
+          -DMLIR_DIR="$LLVM_PREFIX/lib/cmake/mlir" \
+          -DCUDAToolkit_ROOT=/usr/local/cuda-13.2 \
+          -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc \
+          -DPython_EXECUTABLE="$(python -c 'import sys; print(sys.executable)')" \
+          -DPYTHON_EXECUTABLE="$(python -c 'import sys; print(sys.executable)')" \
+          -DLLVM_EXTERNAL_LIT="$(command -v lit)"
+    - name: Run KunQuant MLIR tests
+      env:
+        CUDA_PATH: /usr/local/cuda-13.2
+        CUDA_HOME: /usr/local/cuda-13.2
+      run: cmake --build build/mlir-ci --target check-kun-mlir --parallel 4
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 212d030..eff8412 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -16,6 +16,7 @@ configure_lit_site_cfg(
 
 set(KUN_MLIR_TEST_DEPENDS
   kun-opt
+  KunMLIR
   FileCheck
 )
 

From 1f6127c6e4fedb828c3fecb1f67009818c9a8b66 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 25 May 2026 02:16:42 -0700
Subject: [PATCH 54/59] cuda cache

---
 .github/workflows/ccpp.yml | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index 45186da..0818769 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -121,7 +121,7 @@ jobs:
       with:
         python-version: '3.12'
         cache: 'pip'
-    - name: Install CUDA 13.2 and build dependencies
+    - name: Install build dependencies
       run: |
         set -eux
         sudo apt-get update
@@ -136,6 +136,16 @@ jobs:
           ninja-build \
           pkg-config \
           zlib1g-dev
+    - name: Cache CUDA 13.2 toolkit
+      id: cache-cuda
+      uses: actions/cache@v4
+      with:
+        path: .cache/cuda-13.2
+        key: ${{ runner.os }}-${{ runner.arch }}-cuda-toolkit-13.2-v1
+    - name: Install CUDA 13.2
+      if: steps.cache-cuda.outputs.cache-hit != 'true'
+      run: |
+        set -eux
         curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin \
           -o /tmp/cuda-ubuntu2404.pin
         sudo install -m 0644 /tmp/cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600
@@ -146,6 +156,15 @@ jobs:
           | sudo tee /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends cuda-toolkit-13-2
+        mkdir -p .cache
+        sudo tar -C /usr/local -cf "$RUNNER_TEMP/cuda-13.2.tar" cuda-13.2
+        tar -C .cache -xf "$RUNNER_TEMP/cuda-13.2.tar"
+    - name: Prepare CUDA 13.2
+      run: |
+        set -eux
+        if [ ! -d /usr/local/cuda-13.2 ]; then
+          sudo ln -s "$GITHUB_WORKSPACE/.cache/cuda-13.2" /usr/local/cuda-13.2
+        fi
         sudo ln -sf libcuda.so /usr/local/cuda-13.2/lib64/stubs/libcuda.so.1
         /usr/local/cuda-13.2/bin/nvcc --version
     - name: Install Python dependencies

From 6abd039f81a12581cd6fe348e4e96e527179005e Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 25 May 2026 03:12:27 -0700
Subject: [PATCH 55/59] split packages

---
 .github/workflows/ccpp.yml          |  15 +
 .gitignore                          |   4 +-
 CMakeLists.txt                      |  57 ++--
 KunQuant/jit/KunMLIR.py             |  16 ++
 KunQuant/jit/cuda.py                | 384 +------------------------
 KunQuant/passes/CodegenMLIR.py      | 427 +---------------------------
 KunQuantMLIR/__init__.py            |   0
 KunQuantMLIR/codegen_mlir.py        | 416 +++++++++++++++++++++++++++
 KunQuantMLIR/jit_cuda.py            | 373 ++++++++++++++++++++++++
 mlir/lib/KunCuda/CMakeLists.txt     |   9 +-
 mlir/lib/Python/CMakeLists.txt      |  12 +-
 mlir/test/lit.cfg.py                |   2 +-
 python/kunquant_mlir/pyproject.toml |   3 +
 python/kunquant_mlir/setup.py       | 143 ++++++++++
 14 files changed, 1032 insertions(+), 829 deletions(-)
 create mode 100644 KunQuant/jit/KunMLIR.py
 create mode 100644 KunQuantMLIR/__init__.py
 create mode 100644 KunQuantMLIR/codegen_mlir.py
 create mode 100644 KunQuantMLIR/jit_cuda.py
 create mode 100644 python/kunquant_mlir/pyproject.toml
 create mode 100644 python/kunquant_mlir/setup.py

diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index 0818769..c271b3a 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -199,6 +199,7 @@ jobs:
       run: |
         cmake -S . -B build/mlir-ci -G Ninja \
           -DCMAKE_BUILD_TYPE=Release \
+          -DKUN_BUILD_CPU_RUNNER=OFF \
           -DKUN_BUILD_MLIR=ON \
           -DLLVM_DIR="$LLVM_PREFIX/lib/cmake/llvm" \
           -DMLIR_DIR="$LLVM_PREFIX/lib/cmake/mlir" \
@@ -212,3 +213,17 @@ jobs:
         CUDA_PATH: /usr/local/cuda-13.2
         CUDA_HOME: /usr/local/cuda-13.2
       run: cmake --build build/mlir-ci --target check-kun-mlir --parallel 4
+    - name: Check KunQuant-MLIR imports
+      env:
+        CUDA_PATH: /usr/local/cuda-13.2
+        CUDA_HOME: /usr/local/cuda-13.2
+      run: |
+        export LD_LIBRARY_PATH="$LLVM_PREFIX/lib:/usr/local/cuda-13.2/lib64/stubs:${LD_LIBRARY_PATH:-}"
+        python - <<'PY'
+        import KunQuantMLIR.KunMLIR as direct
+        from KunQuant.jit import KunMLIR as compat
+        import KunQuant.jit.KunMLIR as submodule
+        assert direct.__file__ == compat.__file__ == submodule.__file__
+        assert "/KunQuantMLIR/" in direct.__file__
+        print(direct.__file__)
+        PY
diff --git a/.gitignore b/.gitignore
index e0dcd8f..4f534f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,6 @@ dist/*
 *.pyd
 *.dll
 *.lib
-.codex
\ No newline at end of file
+.codex
+python/kunquant_mlir/build/*
+python/kunquant_mlir/KunQuant_MLIR.egg-info/*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc68754..ca22587 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ option(KUN_AVX512DQ "Enable AVX512DQ instruction set" OFF)
 option(KUN_AVX512VL "Enable AVX512VL instruction set" OFF)
 option(KUN_NO_AVX2 "Disable AVX2 and FMA instruction set" OFF)
 option(KUN_SANITIZER "Enable sanitizer" OFF)
+option(KUN_BUILD_CPU_RUNNER "Build the CPU KunRunner extension/runtime" ON)
 
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "(Clang|GNU|AppleClang)")
@@ -72,49 +73,53 @@ else()
     endif()
 endif()
 
-
-file(GLOB_RECURSE KunRuntimeSrc ${PROJECT_SOURCE_DIR}/cpp/Kun/*.cpp
-    ${PROJECT_SOURCE_DIR}/cpp/KunSIMD/*.cpp)
-add_library(KunRuntime SHARED ${KunRuntimeSrc})
-target_compile_definitions(KunRuntime PRIVATE KUN_CORE_LIB=1)
-if (NOT WIN32)
-    target_link_libraries(KunRuntime PRIVATE dl)
+if(NOT DEFINED PYTHON_EXECUTABLE)
+    set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
 endif()
 
-file(GLOB_RECURSE KunPythonSrc ${PROJECT_SOURCE_DIR}/cpp/Python/*.cpp)
-# STABLE_ABI: build a single Python-version-independent .abi3.so on
-# CPython ≥ 3.12; nanobind silently disables this and falls back to
-# regular ABI on older Pythons.  Saves us one rebuild per Python
-# minor we want to ship.
-nanobind_add_module(KunRunner STABLE_ABI ${KunPythonSrc})
+message(STATUS "PYTHON_EXECUTABLE = ${PYTHON_EXECUTABLE}")
 
-target_link_libraries(KunRunner PUBLIC KunRuntime)
+if(KUN_BUILD_CPU_RUNNER)
+    file(GLOB_RECURSE KunRuntimeSrc ${PROJECT_SOURCE_DIR}/cpp/Kun/*.cpp
+        ${PROJECT_SOURCE_DIR}/cpp/KunSIMD/*.cpp)
+    add_library(KunRuntime SHARED ${KunRuntimeSrc})
+    target_compile_definitions(KunRuntime PRIVATE KUN_CORE_LIB=1)
+    if (NOT WIN32)
+        target_link_libraries(KunRuntime PRIVATE dl)
+    endif()
 
+    file(GLOB_RECURSE KunPythonSrc ${PROJECT_SOURCE_DIR}/cpp/Python/*.cpp)
+    # STABLE_ABI: build a single Python-version-independent .abi3.so on
+    # CPython ≥ 3.12; nanobind silently disables this and falls back to
+    # regular ABI on older Pythons.  Saves us one rebuild per Python
+    # minor we want to ship.
+    nanobind_add_module(KunRunner STABLE_ABI ${KunPythonSrc})
 
+    target_link_libraries(KunRunner PUBLIC KunRuntime)
 
-file(GLOB_RECURSE KunTestSrc ${PROJECT_SOURCE_DIR}/tests/cpp/*.cpp)
-add_library(KunTest SHARED EXCLUDE_FROM_ALL ${KunTestSrc})
-target_link_libraries(KunTest KunRuntime)
 
 
-file(GLOB_RECURSE KunCApiTestSrc ${PROJECT_SOURCE_DIR}/tests/capi/*.cpp)
-add_executable(KunCApiTest EXCLUDE_FROM_ALL ${KunCApiTestSrc})
-target_link_libraries(KunCApiTest KunRuntime)
-add_dependencies(KunCApiTest KunTest)
+    file(GLOB_RECURSE KunTestSrc ${PROJECT_SOURCE_DIR}/tests/cpp/*.cpp)
+    add_library(KunTest SHARED EXCLUDE_FROM_ALL ${KunTestSrc})
+    target_link_libraries(KunTest KunRuntime)
 
-if(NOT DEFINED PYTHON_EXECUTABLE)
-    set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
-endif()
 
-message(STATUS "PYTHON_EXECUTABLE = ${PYTHON_EXECUTABLE}")
+    file(GLOB_RECURSE KunCApiTestSrc ${PROJECT_SOURCE_DIR}/tests/capi/*.cpp)
+    add_executable(KunCApiTest EXCLUDE_FROM_ALL ${KunCApiTestSrc})
+    target_link_libraries(KunCApiTest KunRuntime)
+    add_dependencies(KunCApiTest KunTest)
 
-add_custom_target(TestingTargets DEPENDS KunCApiTest KunTest KunRunner)
+    add_custom_target(TestingTargets DEPENDS KunCApiTest KunTest KunRunner)
+endif()
 
 #===------------------------------------------------------------------------===#
 # Optional MLIR backend (kun-opt + kunir/kungpu dialects)
 #===------------------------------------------------------------------------===#
 option(KUN_BUILD_MLIR "Build MLIR backend with kunir/kungpu dialects" OFF)
 if(KUN_BUILD_MLIR)
+  set(KUN_MLIR_PYTHON_PACKAGE_DIR "${PROJECT_SOURCE_DIR}/KunQuantMLIR"
+      CACHE PATH "Output directory for the KunQuant-MLIR Python package binaries")
+
   find_package(MLIR REQUIRED CONFIG)
   find_package(CUDAToolkit REQUIRED)
   message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
diff --git a/KunQuant/jit/KunMLIR.py b/KunQuant/jit/KunMLIR.py
new file mode 100644
index 0000000..a519d61
--- /dev/null
+++ b/KunQuant/jit/KunMLIR.py
@@ -0,0 +1,16 @@
+"""Compatibility shim for the optional KunQuant-MLIR extension module."""
+
+from importlib import import_module as _import_module
+import sys as _sys
+
+try:
+    _KunMLIR = _import_module("KunQuantMLIR.KunMLIR")
+except ModuleNotFoundError as e:
+    if e.name and e.name.startswith("KunQuantMLIR"):  # the optional package itself is absent
+        raise ImportError(
+            "KunQuant MLIR extension is not installed. "
+            "Install KunQuant-MLIR to use KunQuant.jit.KunMLIR."
+        ) from e
+    raise  # a different module is missing: propagate the original error unchanged
+
+_sys.modules[__name__] = _KunMLIR  # make this module name resolve to the real extension module
diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
index eda0d05..7d81d5a 100644
--- a/KunQuant/jit/cuda.py
+++ b/KunQuant/jit/cuda.py
@@ -1,373 +1,11 @@
-"""GPU JIT entry point for KunQuant.
-
-Mirror of `KunQuant.jit.cfake.compileit` but targets a CUDA backend
-through the KunMLIR / kunir pipeline.  Reuses the existing Driver pass
-list (`Driver.optimize`) so any IR rewrites the CPU path benefits from
-also apply here — only the codegen layer is replaced.
-
-Two-tier config split matches the CPU path:
-
-  * Per-Function knobs live in `KunCompilerConfig` (the CPU-shared
-    dataclass): `dtype`, `blocking_len`, `partition_factor`,
-    `input_layout` / `output_layout` (TS only on GPU), `options`.
-  * Compile-/link-time knobs live in `CudaCompilerConfig`: `gpu_arch`,
-    `warps_per_cta`, `smem_size`, `occupancy`, `opt_level`,
-    `toolkit_path`.  Shared across every Function in a `Library`.
-
-Single-Function compile::
-
-    from KunQuant.jit import KunMLIR
-    from KunQuant.jit.cuda import compile_func, CudaCompilerConfig
-    from KunQuant.Driver import KunCompilerConfig
-
-    exe = compile_func(f,
-                        KunCompilerConfig(input_layout="TS",
-                                            output_layout="TS"),
-                        CudaCompilerConfig(gpu_arch="sm_80"))
-    executor = KunMLIR.Executor()                       # default stream
-    out = executor.runGraph(exe, {"a": cp_a, "b": cp_b})  # length auto-inferred
-    executor.synchronize()
-
-Multi-Function compile (CPU `cfake.compileit` shape)::
-
-    from KunQuant.jit.cuda import compileit, CudaCompilerConfig
-    from KunQuant.Driver import KunCompilerConfig
-
-    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS")
-    ccfg = CudaCompilerConfig(gpu_arch="sm_80")
-    lib = compileit([("mod1", f1, kcfg), ("mod2", f2, kcfg)],
-                     "my_lib", ccfg)
-    exe = lib.getModule("mod1")
-"""
-
-from __future__ import annotations
-import os
-from dataclasses import dataclass
-from typing import List, Tuple
-
-from KunQuant.jit import KunMLIR
-
-from KunQuant.Driver import KunCompilerConfig, optimize, post_optimize
-from KunQuant.Op import Input, Output, MayRequireWholeTime
-from KunQuant.passes import do_partition
-from KunQuant.passes.InferWindow import infer_window
-from KunQuant.Stage import Function
-from KunQuant.passes.CodegenMLIR import TargetSpec, translate_function
-
-
-# Sentinel passed via kunir.func's `unreliable_count` attribute to mean
-# "this partition needs the full time history; the runtime must launch
-# it as a single chunk".  Kept in sync with the kunir verifier (which
-# only allows -1 or non-negative) and the CUDA runtime's `computeChunkPlan`.
-_WHOLE_TIME_UNRELIABLE = -1
-
-
-# Standard locations searched when CudaCompilerConfig.toolkit_path is left
-# empty.  A toolkit dir must contain `nvvm/libdevice/libdevice.10.bc` (the
-# upstream `gpu-module-to-binary` pass links libdevice into the LLVM
-# module) and `bin/ptxas` (PTX → cubin).
-_TOOLKIT_ENV_VARS  = ("CUDA_HOME", "CUDA_PATH", "CUDA_TOOLKIT_PATH",
-                       "CUDA_ROOT")
-_TOOLKIT_FALLBACKS = ("/usr/local/cuda", "/opt/cuda", "/opt/nvidia/cuda")
-
-
-def _is_toolkit_dir(path: str) -> bool:
-    return (path
-            and os.path.isfile(os.path.join(path, "nvvm", "libdevice",
-                                              "libdevice.10.bc"))
-            and os.path.isfile(os.path.join(path, "bin", "ptxas")))
-
-
-def find_cuda_toolkit(override: str = "") -> str:
-    """Locate a CUDA toolkit root suitable for `gpu-module-to-binary`.
-
-    Search order:
-      1. `override` (if non-empty and looks like a toolkit dir)
-      2. $CUDA_HOME / $CUDA_PATH / $CUDA_TOOLKIT_PATH / $CUDA_ROOT
-      3. Standard install paths (/usr/local/cuda, /opt/cuda, …)
-
-    Raises FileNotFoundError if nothing usable is found — the message
-    lists every location consulted so the caller can fix the env.
-    """
-    tried = []
-    if override:
-        tried.append(f"override={override!r}")
-        if _is_toolkit_dir(override):
-            return override
-    for env in _TOOLKIT_ENV_VARS:
-        val = os.environ.get(env, "")
-        if val:
-            tried.append(f"${env}={val!r}")
-            if _is_toolkit_dir(val):
-                return val
-    for fallback in _TOOLKIT_FALLBACKS:
-        tried.append(f"fallback={fallback!r}")
-        if _is_toolkit_dir(fallback):
-            return fallback
-    raise FileNotFoundError(
-        "Could not locate a CUDA toolkit (need "
-        "<root>/nvvm/libdevice/libdevice.10.bc and <root>/bin/ptxas). "
-        "Searched: " + ", ".join(tried) +
-        ". Set CUDA_PATH or pass toolkit_path explicitly.")
-
-
-@dataclass
-class CudaCompilerConfig:
-    """Compile- / link-time knobs that are shared across every Function
-    in a `Library`.  Per-Function graph-rewriting knobs (dtype,
-    blocking_len, partition_factor, layout, pass options) live in
-    `KunQuant.Driver.KunCompilerConfig` instead — the same dataclass
-    the CPU path uses.
-    """
-    gpu_arch:    str = "sm_80"
-
-    # kunir.target_spec — graph-wide for v0.  `vector_size` is taken
-    # from the per-Function `KunCompilerConfig.blocking_len` at compile
-    # time (the two are the same concept on GPU).
-    occupancy:     int = 1
-    warps_per_cta: int = 4
-    smem_size:     int = 49152
-
-    # LLVM optimization level (forwarded to #nvvm.target<O = ...>).
-    opt_level:     int  = 3
-    # Path to the CUDA toolkit (where libdevice.10.bc + ptxas live).
-    # Empty → upstream search: CUDA_HOME / CUDA_PATH / standard locations.
-    toolkit_path:  str  = ""
-
-
-def _resolve_vector_size(kcfg: KunCompilerConfig) -> int:
-    """On GPU `vector_size` (kunir target_spec) is the same as
-    `blocking_len` from the per-Function config.  Default to 1 (scalar
-    kunir) if the user didn't specify."""
-    return 1 if kcfg.blocking_len is None else int(kcfg.blocking_len)
-
-
-def _gpu_pass_options(kcfg: KunCompilerConfig) -> dict:
-    """`Driver.optimize`'s `options` dict for the GPU path.
-
-    `blocking_len` is needed by some decompose paths (it's also the
-    skip-list / naive cost-model knob).  `kcfg.options` flows through
-    first — including `no_fast_stat`, `opt_reduce`, `fast_log`, all of
-    which the GPU lowering now supports.
-
-    `no_skip_list=True` is forced unconditionally and overrides any
-    user-provided value: the kunir codegen has no lowering for
-    `SkipList*` ops, so the naive `ForeachBackWindow + Reduce*` path
-    is the only one that lowers on GPU.
-
-    `may_slice_time=True` is the safe GPU default because the runtime can
-    split a single graph launch into multiple time chunks.  Users who
-    guarantee single-chunk launches may explicitly set it to False.
-    """
-    opts: dict = {"blocking_len": _resolve_vector_size(kcfg)}
-    if kcfg.options:
-        opts.update(kcfg.options)
-    opts.setdefault("may_slice_time", True)
-    opts["no_skip_list"] = True
-    # Pipeline lowering doesn't know about ExpMovingAvg or the
-    # WindowedLinearRegression* family — turn on the Accumulator-based
-    # expansion pass instead.
-    opts["experimental_expand"] = True
-    return opts
-
-
-def _to_dtype_token(dtype: str) -> str:
-    if dtype == "float":  return "f32"
-    if dtype == "double": return "f64"
-    raise ValueError(f"compile_func: unsupported dtype '{dtype}' "
-                       f"(supported: float, double — kunir today only "
-                       f"lowers float on GPU)")
-
-
-def _validate_kun_cfg(kcfg: KunCompilerConfig) -> None:
-    """GPU path only supports TS layout on both input and output (kunir
-    runtime is TS-major).  dtype must be a kunir-supported token."""
-    if kcfg.input_layout != "TS":
-        raise ValueError(
-            f"GPU backend only supports input_layout='TS', got "
-            f"{kcfg.input_layout!r}")
-    if kcfg.output_layout != "TS":
-        raise ValueError(
-            f"GPU backend only supports output_layout='TS', got "
-            f"{kcfg.output_layout!r}")
-    if kcfg.dtype not in ("float", "double"):
-        raise ValueError(
-            f"KunCompilerConfig.dtype must be 'float' or 'double', got "
-            f"{kcfg.dtype!r}")
-
-
-def _graph_io_names(f: Function):
-    """User-facing graph inputs/outputs.  Captured BEFORE optimize +
-    do_partition because those passes mutate `f` and may scatter the
-    Input/Output ops across multiple sub-Functions (some of which then
-    look like 'TEMP' from the partition's POV but stay user-visible at
-    the graph boundary)."""
-    ins  = [op.attrs["name"] for op in f.ops if isinstance(op, Input)]
-    outs = [op.attrs["name"] for op in f.ops if isinstance(op, Output)]
-    if not ins:
-        raise ValueError("compile_func: function has no Input ops")
-    if not outs:
-        raise ValueError("compile_func: function has no Output ops")
-    return ins, outs
-
-
-def _run_full_pipeline(f: Function, kcfg: KunCompilerConfig):
-    """Run optimize / partition / post_optimize.  Returns
-    `(impl, global_unreliable)`; the second is a pre-partition
-    `infer_window` snapshot keyed by Output name.  Mutates `f`.
-    """
-    options = _gpu_pass_options(kcfg)
-    optimize(f, options)
-    global_unreliable = infer_window(f, options)
-    _mainf, impl = do_partition(f, kcfg.partition_factor, options)
-    post_optimize(impl, options)
-    return impl, global_unreliable
-
-
-def _translate_partitions(impl, kcfg: KunCompilerConfig,
-                            ccfg: CudaCompilerConfig):
-    """Emit one kunir.func per partitioned Function into a single
-    KunMLIR module (single `gpu.module` with N siblings).  Cross-
-    partition buffers stitch up automatically because each impl's
-    Input/Output names match the producing/consuming partition's
-    Output/Input names.
-
-    Cross-sectional partitions (currently: cs_rank) bypass the kunir
-    pipeline entirely — `translate_function` returns a descriptor and
-    we collect those into `external_kernels`, which the C++ side
-    appends to the executable's kernel list without ever generating
-    LLVM IR / PTX for them.
-
-    Returns (ModuleOp, list[dict]) — the second element is the list
-    of external-kernel descriptors to forward to KunMLIR.compile.
-    """
-    target = TargetSpec(occupancy=ccfg.occupancy,
-                          warps_per_cta=ccfg.warps_per_cta,
-                          smem_size=ccfg.smem_size,
-                          vector_size=_resolve_vector_size(kcfg))
-    ir = KunMLIR.IRBuilder()
-    dtype = _to_dtype_token(kcfg.dtype)
-    externals = []
-    for sub in impl:
-        # Per-kernel warmup is partition-local: the runtime serialises
-        # kernel launches so an upstream kernel's reliable writes are
-        # already in place by the time a downstream kernel reads.  Each
-        # kernel's chunk grid only needs to cover its own local warmup.
-        if any(isinstance(op, MayRequireWholeTime)
-                and op.is_whole_time_required()
-                for op in sub.ops):
-            per_kernel_unreliable = _WHOLE_TIME_UNRELIABLE
-        else:
-            per_kernel_unreliable = max(infer_window(sub).values(), default=0)
-        ext = translate_function(sub, target, ir, dtype=dtype,
-                                   unreliable_count=per_kernel_unreliable)
-        if ext is not None:
-            externals.append(ext)
-    return ir.finish(), externals
-
-
-def compile_func(f: Function, kcfg: KunCompilerConfig,
-                   ccfg: CudaCompilerConfig) -> KunMLIR.Executable:
-    """Compile a single KunQuant Function to a GPU `KunMLIR.Executable`.
-
-    Pipeline mirrors `KunQuant.jit.cfake.compileit` on the CPU path:
-
-      1. Capture user-facing Input/Output names (graph_inputs/outputs).
-      2. Run Driver.optimize on `f` in place.
-      3. do_partition splits `f` into one or more sub-Functions.
-      4. post_optimize per sub-Function (TempWindowElim + MergeLoops + …).
-      5. Translate each sub-Function into a kunir.func (siblings in one
-         gpu.module).
-      6. Hand off to KunMLIR.compile, which generates the cubin and
-         resolves cross-kernel data flow via I/O names.
-    """
-    _validate_kun_cfg(kcfg)
-
-    toolkit_path = find_cuda_toolkit(ccfg.toolkit_path)
-
-    graph_inputs, graph_outputs = _graph_io_names(f)
-    impl, global_unreliable = _run_full_pipeline(f, kcfg)
-    mod, externals = _translate_partitions(impl, kcfg, ccfg)
-
-    return KunMLIR.compile(
-        mod,
-        graph_inputs=graph_inputs,
-        graph_outputs=graph_outputs,
-        gpu_arch=ccfg.gpu_arch,
-        opt_level=ccfg.opt_level,
-        toolkit_path=toolkit_path,
-        external_kernels=externals,
-        # Forwarded for the no-JIT-kernel case: when every partition
-        # is external (e.g. a graph that is just `cs_rank(a)`), the
-        # MLIR module is empty and `data.warpsPerCta` would otherwise
-        # default to 1 — but the cs_rank launch uses it to size
-        # blockDim, so feed the config value through.
-        warps_per_cta=ccfg.warps_per_cta,
-        output_unreliable=global_unreliable,
-    )
-
-
-class Library:
-    """Bag of named `KunMLIR.Executable`s, mirroring the CPU `kr.Library`
-    shape so callers can compile multiple Functions in one go and look
-    them up by name.  Returned by the multi-Function `compileit` below.
-    """
-    def __init__(self, libname: str = "") -> None:
-        self.libname = libname
-        self._modules: dict = {}
-
-    def getModule(self, name: str) -> KunMLIR.Executable:
-        if name not in self._modules:
-            raise RuntimeError(
-                f"Library.getModule: no module named '{name}' "
-                f"(have: {sorted(self._modules)})")
-        return self._modules[name]
-
-    @property
-    def names(self):
-        """All compiled module names in registration order."""
-        return list(self._modules.keys())
-
-    def _add(self, name: str, exe: KunMLIR.Executable) -> None:
-        if name in self._modules:
-            raise RuntimeError(
-                f"Library: duplicate module name '{name}'")
-        self._modules[name] = exe
-
-
-def compileit(
-    funclist: List[Tuple[str, Function, KunCompilerConfig]],
-    libname: str,
-    compiler_config: CudaCompilerConfig,
-) -> Library:
-    """Compile a list of `(name, Function, KunCompilerConfig)` tuples
-    into a `Library`, mirroring the shape of
-    `KunQuant.jit.cfake.compileit(func, libname, compiler_config)`.
-
-    Each entry's third element is the per-Function `KunCompilerConfig`
-    (dtype / blocking_len / partition_factor / layout / pass options);
-    `compiler_config` is the GPU-wide `CudaCompilerConfig` applied to
-    every entry.  cfake's other arguments (`tempdir`, `keep_files`,
-    `load`) don't apply to the GPU path and are intentionally absent.
-
-    Returns a `Library` keyed by the tuple's `name`; look up individual
-    kernels via `lib.getModule(name)`.
-    """
-    lib = Library(libname=libname)
-    for name, f, kcfg in funclist:
-        lib._add(name, compile_func(f, kcfg, compiler_config))
-    return lib
-
-
-def to_mlir(f: Function, kcfg: KunCompilerConfig,
-              ccfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
-    """Run the same passes + translator as `compile_func`, but return
-    the KunMLIR module before PTX/CUBIN.  External (cs_rank) partitions
-    are absent from the returned module — they never become kunir
-    ops.  Useful for debugging the IR.  Mutates `f` in place (same
-    as `compile_func`)."""
-    _validate_kun_cfg(kcfg)
-    _graph_io_names(f)              # raises if no Input / Output ops
-    impl, _global_unreliable = _run_full_pipeline(f, kcfg)
-    mod, _externals = _translate_partitions(impl, kcfg, ccfg)
-    return mod
+"""Compatibility shim for the optional KunQuant-MLIR CUDA backend."""
+
+try:
+    from KunQuantMLIR.jit_cuda import *  # noqa: F401,F403
+except ModuleNotFoundError as e:
+    if e.name and e.name.startswith("KunQuantMLIR"):
+        raise ImportError(
+            "KunQuant MLIR/CUDA backend is not installed. "
+            "Install KunQuant-MLIR to use KunQuant.jit.cuda."
+        ) from e
+    raise
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
index 6cef15f..f578fc2 100644
--- a/KunQuant/passes/CodegenMLIR.py
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -1,416 +1,11 @@
-"""Translate a (post-optimize) KunQuant Function into a KunMLIR module
-holding a single kunir.func inside a gpu.module.
-
-This is the GPU-side counterpart to passes.CodegenCpp.codegen_cpp; it
-runs after the same Driver.optimize() pipeline the CPU path uses, then
-walks the lowered IR and emits kunir ops via the KunMLIR.IRBuilder
-pybind class.
-
-Scope (v0): only the ops kunir currently supports.
-  - Elemwise binary: Add, Sub, Mul, Div, Max, Min
-  - Elemwise unary:  Abs, Log, Sign
-  - Cross-sectional: Rank, Scale
-  - Windowed:        WindowedTempOutput, ForeachBackWindow + IterValue,
-                      ReduceAdd / ReduceMul / ReduceMax / ReduceMin
-  - Boundaries:      Input, Output
-
-Anything else raises NotImplementedError with the offending op printed.
-"""
-
-from __future__ import annotations
-from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    # KunMLIR is a compiled extension built alongside the MLIR support,
-    # only imported here for type checking — no runtime dependency added
-    # to the codegen path itself.
-    from KunQuant.jit import KunMLIR
-
-from KunQuant.Op import (
-    OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
-    WindowLoopIndex, ReductionOp, SimpleCrossSectionalOp, ConstantOp,
-    WindowedTrait, Rank, Scale,
-)
-from KunQuant.ops.ElewiseOp import (
-    Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
-    AddConst, SubConst, MulConst, DivConst,
-    GreaterThanConst, LessThanConst,
-    GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
-    And, Or, Not, Select,
-)
-from KunQuant.ops.ReduceOp import (
-    ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
-    ReduceArgMax, ReduceArgMin, ReduceRank,
-)
-from KunQuant.ops.MiscOp import (
-    BackRef, FastWindowedSum,
-    Accumulator, SetAccumulator, ReturnFirstValue,
-)
-from KunQuant.Stage import Function
-
-
-# ── Op-class → IRBuilder method dispatch ────────────────────────────
-
-_BINARY = {
-    Add: "add", Sub: "sub", Mul: "mul", Div: "div",
-    Max: "max", Min: "min",
-    GreaterThan:  "gt", GreaterEqual: "ge",
-    LessThan:     "lt", LessEqual:    "le",
-    Equals:       "eq",
-    And:          "and_", Or:         "or_",
-}
-# Const-on-one-side variants — emit ConstantOp + the matching binary op.
-# `swap=True` puts the scalar on the LEFT (e.g. SubConst(x, v, swap=True)
-# means `v - x`, where for plain SubConst it would mean `x - v`).
-_BINARY_CONST = {
-    AddConst: "add", SubConst: "sub", MulConst: "mul", DivConst: "div",
-    GreaterThanConst: "gt", LessThanConst: "lt",
-}
-_UNARY = {
-    Abs: "abs", Log: "log", Exp: "exp", Sqrt: "sqrt", Sign: "sign",
-    Not: "not_",
-    # NOTE: cross-sectional ops are intentionally absent.
-    # partitions are routed to a pre-compiled CUmodule by
-    # `_maybe_external_partition` below; they never become kunir ops.
-}
-_REDUCE = {
-    ReduceAdd: "reduce_add", ReduceMul: "reduce_mul",
-    ReduceMax: "reduce_max", ReduceMin: "reduce_min",
-    ReduceArgMin: "reduce_argmin", ReduceArgMax: "reduce_argmax",
-}
-# Reduces that need a 2nd input (the outer-scope "current" value).
-# `ReduceRank(iter_val, current)` is the only one today; kept as a separate
-# table so `_emit_reduction` can dispatch without conflating arity.
-_REDUCE_WITH_CURRENT = {
-    ReduceRank: "reduce_rank",
-}
-
-
-# ── Target spec carrier ─────────────────────────────────────────────
-
-class TargetSpec:
-    """GPU launch parameters mirrored from kunir.target_spec."""
-    def __init__(self, *, occupancy: int = 1, warps_per_cta: int = 4,
-                 smem_size: int = 49152, vector_size: int = 1):
-        self.occupancy     = occupancy
-        self.warps_per_cta = warps_per_cta
-        self.smem_size     = smem_size
-        self.vector_size   = vector_size
-
-
-# ── Helpers ─────────────────────────────────────────────────────────
-
-def _kunir_symbol(name: str) -> str:
-    """Coerce a partition name into a valid kunir / PTX symbol.
-
-    The partitioner derives a partition's name from the names of its
-    Output ops; when a partition is "intermediate-only" (every output
-    is consumed by a downstream partition, none is a user-facing
-    Output), those names come from `OpBase.hash_hex` which starts with
-    a digit half the time.  Digits are fine for buffer-table keys
-    (CPU runtime indexes by name) but ptxas rejects them as
-    `.entry` symbols.
-
-    Prefix any such name with a single `_` so the kunir.func symbol
-    is always a valid identifier, while leaving `input_names` /
-    `output_names` (the public buffer-table keys) untouched.
-    """
-    if name and name[0].isdigit():
-        return "_" + name
-    return name
-
-def _index_loop_members(f: Function) -> Tuple[
-        Dict[ForeachBackWindow, List[OpBase]],
-        Dict[ForeachBackWindow, List[ReductionOp]]]:
-    """For each ForeachBackWindow in `f`, collect the body ops (those
-    whose `_parent_loop` is the loop) and the reduction ops (whose
-    `get_loop()` is the loop).  Both lists keep f.ops topo order."""
-    body_ops: Dict[ForeachBackWindow, List[OpBase]] = {}
-    reductions: Dict[ForeachBackWindow, List[ReductionOp]] = {}
-    for op in f.ops:
-        if isinstance(op, ReductionOp):
-            loop = op.get_loop()
-            reductions.setdefault(loop, []).append(op)
-        elif op.get_parent() is not None:
-            body_ops.setdefault(op.get_parent(), []).append(op)
-    return body_ops, reductions
-
-
-def _emit_simple(op: OpBase,
-                  ir: KunMLIR.IRBuilder,
-                  val_map: Dict[OpBase, KunMLIR.Value],
-                  ts_1: KunMLIR.Type) -> KunMLIR.Value:
-    """Emit a non-control-flow op via IRBuilder dispatch.  `ts_1` is the
-    kunir ts type with maxLookback=1, used by ops whose result has no
-    input to infer the element type from (currently only ConstantOp)."""
-    cls = type(op)
-    if cls in _BINARY:
-        getattr(ir, _BINARY[cls])
-        return getattr(ir, _BINARY[cls])(val_map[op.inputs[0]],
-                                           val_map[op.inputs[1]])
-    if cls in _BINARY_CONST:
-        # Materialize the scalar attr as a kunir.constant, then emit
-        # the matching binary op.  `swap=True` puts the scalar on the
-        # left-hand side (matters for Sub/Div, no-op for Add/Mul).
-        scalar = float(op.attrs["value"])
-        const_val = ir.constant(scalar, ts_1)
-        x = val_map[op.inputs[0]]
-        ir_op = getattr(ir, _BINARY_CONST[cls])
-        if op.attrs.get("swap", False):
-            return ir_op(const_val, x)
-        return ir_op(x, const_val)
-    if cls in _UNARY:
-        return getattr(ir, _UNARY[cls])(val_map[op.inputs[0]])
-    if isinstance(op, WindowedTempOutput):
-        return ir.windowed_output(val_map[op.inputs[0]],
-                                    int(op.attrs["window"]))
-    if isinstance(op, BackRef):
-        return ir.back_ref(val_map[op.inputs[0]], int(op.attrs["window"]))
-    if isinstance(op, FastWindowedSum):
-        return ir.fast_windowed_sum(val_map[op.inputs[0]],
-                                      int(op.attrs["window"]))
-    if isinstance(op, Select):
-        return ir.select(val_map[op.inputs[0]],
-                          val_map[op.inputs[1]],
-                          val_map[op.inputs[2]])
-    if isinstance(op, ConstantOp):
-        v = op.attrs["value"]
-        fv = float("nan") if v == "nan" else float(v)
-        return ir.constant(fv, ts_1)
-    if isinstance(op, WindowLoopIndex):
-        # Resolved by the kunir → kungpu pass to the enclosing
-        # for_each_back_window's induction variable.
-        return ir.window_loop_index(ts_1)
-    if isinstance(op, Accumulator):
-        # The Python op's `inputs[0]` is a keep-alive in the graph IR;
-        # it does NOT feed the slot.  The `name` attr is informational;
-        # each op identifies a distinct slot (kunir.accumulator is not
-        # Pure, so MLIR CSE will not dedup two accumulators).
-        init_v = op.attrs["init_val"]
-        init_f = float("nan") if init_v == "nan" else float(init_v)
-        return ir.accumulator(op.attrs["name"], ts_1, init_f)
-    if isinstance(op, SetAccumulator):
-        # Side-effecting (writes the slot) but also returns the slot's
-        # new value for the current step (`mask ? value : prev`), so
-        # downstream consumers can use the SetAccumulator's SSA result
-        # directly — matches the CPU C++ SetAccumulator semantics.
-        return ir.set_accumulator(val_map[op.inputs[0]],
-                                   val_map[op.inputs[1]],
-                                   val_map[op.inputs[2]])
-    if isinstance(op, ReturnFirstValue):
-        # In the Python graph IR, ReturnFirstValue's only job is to keep
-        # side-effecting siblings (SetAccumulator etc.) reachable from a
-        # graph output so the GC does not drop them.  In SSA-MLIR the
-        # side-effect ops are preserved by their own MemWrite semantics;
-        # ReturnFirstValue carries no new MLIR-level meaning, so we just
-        # forward the first input's Value.  Other inputs were already
-        # emitted in topo order before we got here.
-        return val_map[op.inputs[0]]
-    raise NotImplementedError(
-        f"CodegenMLIR: op type {cls.__name__} is not supported by the "
-        f"GPU backend yet (op = {op})")
-
-
-def _emit_reduction(op: ReductionOp,
-                     ir: KunMLIR.IRBuilder,
-                     val_map: Dict[OpBase, KunMLIR.Value]) -> KunMLIR.Value:
-    cls = type(op)
-    if cls in _REDUCE_WITH_CURRENT:
-        # ReduceRank(iter_val, current): 2 inputs.
-        if len(op.inputs) != 2:
-            raise NotImplementedError(
-                f"CodegenMLIR: {cls.__name__} expects 2 inputs (iter, "
-                f"current); got {len(op.inputs)} (op = {op})")
-        return getattr(ir, _REDUCE_WITH_CURRENT[cls])(
-            val_map[op.inputs[0]], val_map[op.inputs[1]])
-    if cls not in _REDUCE:
-        raise NotImplementedError(
-            f"CodegenMLIR: reduction {cls.__name__} not supported yet "
-            f"(op = {op})")
-    if len(op.inputs) != 1:
-        raise NotImplementedError(
-            f"CodegenMLIR: reductions with init_val are not supported "
-            f"yet (op = {op})")
-    return getattr(ir, _REDUCE[cls])(val_map[op.inputs[0]])
-
-
-# ── Main entry point ────────────────────────────────────────────────
-
-def _maybe_external_partition(f: Function, dtype: str) -> Optional[dict]:
-    """If `f` is a partition the GPU runtime handles as a pre-compiled
-    external kernel (bundled PTX loaded as a separate CUmodule), return
-    a descriptor dict that KunMLIR.compile() should append to the
-    executable's kernel list.  Otherwise return None.
-
-    The descriptor matches what KunMLIR.compile's `external_kernels=`
-    parameter expects:
-        {"name": <str>, "kind": <str>,
-         "inputs": [<str>...], "outputs": [<str>...]}
-
-    Detection mirrors CodegenCpp's "simple cross-sectional fast path"
-    (CodegenCpp.codegen_cpp's `len(f.ops) == 3` check): a partition
-    whose only compute op is a supported `SimpleCrossSectionalOp`
-    (currently Rank or Scale).  The partitioner places every CrossSectionalOp into its own
-    partition without other compute, so this shape is what we get.
-
-    The `kind` string is `cs_<op>_f{32,64}`.  Do not fabricate kinds for
-    cross-sectional ops unless the C++ runtime has a matching bundled
-    external kernel.
-    """
-    compute = [op for op in f.ops
-                if not isinstance(op, (Input, Output))]
-    if len(compute) != 1 or not isinstance(compute[0], SimpleCrossSectionalOp):
-        return None
-    if not isinstance(compute[0], (Rank, Scale)):
-        return None
-    inputs  = [op for op in f.ops if isinstance(op, Input)]
-    outputs = [op for op in f.ops if isinstance(op, Output)]
-    if len(inputs) != 1 or len(outputs) != 1:
-        return None  # surprising shape, let the regular path emit an error
-    if dtype not in ("f32", "f64"):
-        return None
-    op_kind = compute[0].__class__.__name__.lower()
-    return {
-        "name":    f.name or f"cs_{op_kind}",
-        "kind":    f"cs_{op_kind}_{dtype}",
-        "inputs":  [op.attrs["name"] for op in inputs],
-        "outputs": [op.attrs["name"] for op in outputs],
-    }
-
-
-def translate_function(f: Function, target: TargetSpec,
-                        ir: KunMLIR.IRBuilder,
-                        dtype: str = "f32",
-                        unreliable_count: int = 0) -> Optional[dict]:
-    """Emit `f` as a single kunir.func into the open `ir` (KunMLIR.IRBuilder).
-
-    If `f` is an externally-dispatched partition (e.g. a single cs_rank
-    op handled by the bundled cs_rank.ptx CUmodule), emit nothing into
-    the IRBuilder and return its descriptor dict so the caller can pass
-    it to KunMLIR.compile()'s `external_kernels=` list.  Otherwise
-    return `None` after emitting a kunir.func.
-
-    `unreliable_count` is the partition-local warmup depth — the caller
-    (`KunQuant.jit.cuda`) computes it via `infer_window(f)` on this
-    post-partition Function and feeds it in.
-    """
-    ext = _maybe_external_partition(f, dtype)
-    if ext is not None:
-        return ext
-
-    # 1.  Boundary ops in topo order — the kunir.func's I/O.
-    inputs:  List[Input]  = [op for op in f.ops if isinstance(op, Input)]
-    outputs: List[Output] = [op for op in f.ops if isinstance(op, Output)]
-    if not inputs:
-        raise ValueError("CodegenMLIR: function has no Input ops")
-    if not outputs:
-        raise ValueError("CodegenMLIR: function has no Output ops")
-
-    in_names  = [op.attrs["name"] for op in inputs]
-    out_names = [op.attrs["name"] for op in outputs]
-
-    # 2.  Pre-index loop members so we can emit each loop's body +
-    #     reductions contiguously (regardless of topo interleaving with
-    #     other loops).
-    body_ops_by_loop, reductions_by_loop = _index_loop_members(f)
-
-    # 3.  Open the kunir.func.  All inputs are ts<dtype, inf>; all
-    #     graph results are ts<dtype, 1>.
-    ts_inf = ir.ts_type(dtype, 0)
-    ts_1   = ir.ts_type(dtype, 1)
-
-    func_args = ir.begin_func(
-        name=_kunir_symbol(f.name or "kernel"),
-        input_types=[ts_inf] * len(inputs),
-        input_names=in_names,
-        output_names=out_names,
-        occupancy=target.occupancy, warps_per_cta=target.warps_per_cta,
-        smem_size=target.smem_size, vector_size=target.vector_size,
-        unreliable_count=unreliable_count,
-        result_types=[ts_1] * len(outputs),
-    )
-
-    val_map: Dict[OpBase, KunMLIR.Value] = {}
-    emitted = set()
-    for inp, val in zip(inputs, func_args):
-        val_map[inp] = val
-        emitted.add(inp)
-
-    # 4.  Walk f.ops in topo order, emitting one op (or one whole loop)
-    #     at a time.
-    for op in f.ops:
-        if op in emitted:
-            continue
-        if isinstance(op, Input):
-            continue                      # already mapped from func_args
-        if isinstance(op, Output):
-            # An Output may also be read as a windowed source within the
-            # same partition; emit a kunir.output_ref so downstream sees
-            # its gmem buffer as a ts handle.
-            if any(isinstance(u, WindowedTrait)
-                    for u in f.op_to_id[op].uses):
-                val_map[op] = ir.output_ref(op.attrs["name"],
-                                              val_map[op.inputs[0]])
-            continue                      # handled at the end via Return
-        if isinstance(op, ForeachBackWindow):
-            _emit_loop(op, ir, val_map, ts_1,
-                        body_ops_by_loop.get(op, []),
-                        reductions_by_loop.get(op, []),
-                        emitted)
-            continue
-        if isinstance(op, ReductionOp) or op.get_parent() is not None:
-            # Should have been emitted as part of its enclosing loop;
-            # if we hit it here, the loop never appeared first — that's
-            # a bug in topo sort or in this translator's iteration.
-            raise RuntimeError(
-                f"CodegenMLIR: reduction/body op visited before its "
-                f"enclosing loop ({op})")
-        val_map[op] = _emit_simple(op, ir, val_map, ts_1)
-        emitted.add(op)
-
-    # 5.  Close the function with Outputs in declared order.
-    return_values = [val_map[o.inputs[0]] for o in outputs]
-    ir.end_func(return_values)
-    return None
-
-
-def _emit_loop(loop: ForeachBackWindow,
-                ir: KunMLIR.IRBuilder,
-                val_map: Dict[OpBase, KunMLIR.Value],
-                ts_1: KunMLIR.Type,
-                body_ops: List[OpBase],
-                reductions: List[ReductionOp],
-                emitted: set) -> None:
-    loop_input_vals = [val_map[i] for i in loop.inputs]
-    n_results = len(reductions)
-    if n_results == 0:
-        raise NotImplementedError(
-            f"CodegenMLIR: ForeachBackWindow with no reductions "
-            f"(loop = {loop})")
-
-    block_args = ir.begin_for_each_back_window(
-        inputs=loop_input_vals,
-        window=int(loop.attrs["window"]),
-        result_types=[ts_1] * n_results,
-    )
-    # Block args mirror loop.inputs positionally.  Map the source-op
-    # → block-arg so IterValue can be resolved to the right one.
-    block_arg_by_src = {src: block_args[i]
-                          for i, src in enumerate(loop.inputs)}
-
-    # Body ops: IterValue → block arg; everything else uses _emit_simple.
-    for body_op in body_ops:
-        if isinstance(body_op, IterValue):
-            val_map[body_op] = block_arg_by_src[body_op.inputs[1]]
-        else:
-            val_map[body_op] = _emit_simple(body_op, ir, val_map, ts_1)
-        emitted.add(body_op)
-
-    # Reductions accumulate yield values, in topo order.
-    yield_vals = [_emit_reduction(r, ir, val_map) for r in reductions]
-    loop_results = ir.end_for_each_back_window(yield_vals)
-    for r, lr in zip(reductions, loop_results):
-        val_map[r] = lr
-        emitted.add(r)
-
-    emitted.add(loop)
+"""Compatibility shim for the optional KunQuant-MLIR codegen backend."""
+
+try:
+    from KunQuantMLIR.codegen_mlir import *  # noqa: F401,F403
+except ModuleNotFoundError as e:
+    if e.name and e.name.startswith("KunQuantMLIR"):
+        raise ImportError(
+            "KunQuant MLIR codegen backend is not installed. "
+            "Install KunQuant-MLIR to use KunQuant.passes.CodegenMLIR."
+        ) from e
+    raise
diff --git a/KunQuantMLIR/__init__.py b/KunQuantMLIR/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/KunQuantMLIR/codegen_mlir.py b/KunQuantMLIR/codegen_mlir.py
new file mode 100644
index 0000000..f92b643
--- /dev/null
+++ b/KunQuantMLIR/codegen_mlir.py
@@ -0,0 +1,416 @@
+"""Translate a (post-optimize) KunQuant Function into a KunMLIR module
+holding a single kunir.func inside a gpu.module.
+
+This is the GPU-side counterpart to passes.CodegenCpp.codegen_cpp; it
+runs after the same Driver.optimize() pipeline the CPU path uses, then
+walks the lowered IR and emits kunir ops via the KunMLIR.IRBuilder
+pybind class.
+
+Scope (v0): only the ops kunir currently supports, including:
+  - Elemwise binary: Add, Sub, Mul, Div, Max, Min, comparisons, And, Or
+  - Elemwise unary:  Abs, Log, Exp, Sqrt, Sign, Not
+  - Cross-sectional: Rank, Scale (external kernels, not kunir ops)
+  - Windowed:        WindowedTempOutput, ForeachBackWindow + IterValue,
+                      ReduceAdd / ReduceMul / ReduceMax / ReduceMin
+  - Boundaries:      Input, Output
+
+Anything else raises NotImplementedError with the offending op printed.
+"""
+
+from __future__ import annotations
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    # KunMLIR is a compiled extension built alongside the MLIR support,
+    # only imported here for type checking — no runtime dependency added
+    # to the codegen path itself.
+    from KunQuantMLIR import KunMLIR
+
+from KunQuant.Op import (
+    OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
+    WindowLoopIndex, ReductionOp, SimpleCrossSectionalOp, ConstantOp,
+    WindowedTrait, Rank, Scale,
+)
+from KunQuant.ops.ElewiseOp import (
+    Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
+    AddConst, SubConst, MulConst, DivConst,
+    GreaterThanConst, LessThanConst,
+    GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
+    And, Or, Not, Select,
+)
+from KunQuant.ops.ReduceOp import (
+    ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
+    ReduceArgMax, ReduceArgMin, ReduceRank,
+)
+from KunQuant.ops.MiscOp import (
+    BackRef, FastWindowedSum,
+    Accumulator, SetAccumulator, ReturnFirstValue,
+)
+from KunQuant.Stage import Function
+
+
+# ── Op-class → IRBuilder method dispatch ────────────────────────────
+
+_BINARY = {
+    Add: "add", Sub: "sub", Mul: "mul", Div: "div",
+    Max: "max", Min: "min",
+    GreaterThan:  "gt", GreaterEqual: "ge",
+    LessThan:     "lt", LessEqual:    "le",
+    Equals:       "eq",
+    And:          "and_", Or:         "or_",
+}
+# Const-on-one-side variants — emit ConstantOp + the matching binary op.
+# `swap=True` puts the scalar on the LEFT (e.g. SubConst(x, v, swap=True)
+# means `v - x`, where for plain SubConst it would mean `x - v`).
+_BINARY_CONST = {
+    AddConst: "add", SubConst: "sub", MulConst: "mul", DivConst: "div",
+    GreaterThanConst: "gt", LessThanConst: "lt",
+}
+_UNARY = {
+    Abs: "abs", Log: "log", Exp: "exp", Sqrt: "sqrt", Sign: "sign",
+    Not: "not_",
+    # NOTE: cross-sectional ops are intentionally absent.  Their
+    # partitions are routed to a pre-compiled CUmodule via
+    # `_maybe_external_partition` below; they never become kunir ops.
+}
+_REDUCE = {
+    ReduceAdd: "reduce_add", ReduceMul: "reduce_mul",
+    ReduceMax: "reduce_max", ReduceMin: "reduce_min",
+    ReduceArgMin: "reduce_argmin", ReduceArgMax: "reduce_argmax",
+}
+# Reduces that need a 2nd input (the outer-scope "current" value).
+# `ReduceRank(iter_val, current)` is the only one today; kept as a separate
+# table so `_emit_reduction` can dispatch without conflating arity.
+_REDUCE_WITH_CURRENT = {
+    ReduceRank: "reduce_rank",
+}
+
+
+# ── Target spec carrier ─────────────────────────────────────────────
+
+class TargetSpec:
+    """Plain holder for the kunir.target_spec GPU launch parameters."""
+    def __init__(self, *, occupancy: int = 1, warps_per_cta: int = 4,
+                 smem_size: int = 49152, vector_size: int = 1):
+        self.occupancy, self.warps_per_cta = occupancy, warps_per_cta
+        self.smem_size, self.vector_size = smem_size, vector_size
+        # translate_function reads all four fields back and passes them
+        # to ir.begin_func as keyword arguments.
+
+
+# ── Helpers ─────────────────────────────────────────────────────────
+
+def _kunir_symbol(name: str) -> str:
+    """Return `name` made safe for use as a kunir.func / PTX symbol.
+
+    A partition is named after its Output ops.  For an
+    "intermediate-only" partition (all outputs feed a downstream
+    partition and none is user-facing) those names derive from
+    `OpBase.hash_hex`, which frequently begins with a digit.  A
+    leading digit is harmless for buffer-table keys (the CPU runtime
+    looks buffers up by name) but ptxas refuses it for an `.entry`
+    symbol.
+
+    Only the symbol is rewritten: a name with a leading digit gains a
+    single `_` prefix, anything else is returned as is.  The public
+    `input_names` / `output_names` buffer-table keys are not touched
+    by this helper.
+    """
+    needs_prefix = bool(name) and name[0].isdigit()
+    return "_" + name if needs_prefix else name
+
+def _index_loop_members(f: Function) -> Tuple[
+        Dict[ForeachBackWindow, List[OpBase]],
+        Dict[ForeachBackWindow, List[ReductionOp]]]:
+    """Bucket the ops of `f` per ForeachBackWindow and return
+    `(body_ops, reductions)`.  A ReductionOp goes under its `get_loop()`;
+    any other op with a non-None `get_parent()` goes under that parent.
+    Every bucket preserves the order of `f.ops`."""
+    loop_bodies: Dict[ForeachBackWindow, List[OpBase]] = {}
+    loop_reductions: Dict[ForeachBackWindow, List[ReductionOp]] = {}
+    for member in f.ops:
+        if isinstance(member, ReductionOp):
+            loop_reductions.setdefault(member.get_loop(), []).append(member)
+        elif member.get_parent() is not None:
+            loop_bodies.setdefault(member.get_parent(), []).append(member)
+    return loop_bodies, loop_reductions
+
+
+def _emit_simple(op: OpBase,
+                  ir: KunMLIR.IRBuilder,
+                  val_map: Dict[OpBase, KunMLIR.Value],
+                  ts_1: KunMLIR.Type) -> KunMLIR.Value:
+    """Emit one non-control-flow op through the IRBuilder and return its
+    Value (NotImplementedError if unsupported).  `ts_1` (kunir ts type,
+    maxLookback=1) types the constant / loop-index / accumulator ops."""
+    cls = type(op)
+    if cls in _BINARY:
+        emit_binary = getattr(ir, _BINARY[cls])
+        return emit_binary(val_map[op.inputs[0]],
+                            val_map[op.inputs[1]])
+    if cls in _BINARY_CONST:
+        # Materialize the scalar attr as a kunir.constant, then emit
+        # the matching binary op.  `swap=True` puts the scalar on the
+        # left-hand side (matters for Sub/Div, no-op for Add/Mul).
+        scalar = float(op.attrs["value"])
+        const_val = ir.constant(scalar, ts_1)
+        x = val_map[op.inputs[0]]
+        ir_op = getattr(ir, _BINARY_CONST[cls])
+        if op.attrs.get("swap", False):
+            return ir_op(const_val, x)
+        return ir_op(x, const_val)
+    if cls in _UNARY:
+        return getattr(ir, _UNARY[cls])(val_map[op.inputs[0]])
+    if isinstance(op, WindowedTempOutput):
+        return ir.windowed_output(val_map[op.inputs[0]],
+                                    int(op.attrs["window"]))
+    if isinstance(op, BackRef):
+        return ir.back_ref(val_map[op.inputs[0]], int(op.attrs["window"]))
+    if isinstance(op, FastWindowedSum):
+        return ir.fast_windowed_sum(val_map[op.inputs[0]],
+                                      int(op.attrs["window"]))
+    if isinstance(op, Select):
+        return ir.select(val_map[op.inputs[0]],
+                          val_map[op.inputs[1]],
+                          val_map[op.inputs[2]])
+    if isinstance(op, ConstantOp):
+        v = op.attrs["value"]
+        fv = float("nan") if v == "nan" else float(v)
+        return ir.constant(fv, ts_1)
+    if isinstance(op, WindowLoopIndex):
+        # Resolved by the kunir → kungpu pass to the enclosing
+        # for_each_back_window's induction variable.
+        return ir.window_loop_index(ts_1)
+    if isinstance(op, Accumulator):
+        # The Python op's `inputs[0]` is a keep-alive in the graph IR;
+        # it does NOT feed the slot.  The `name` attr is informational;
+        # each op identifies a distinct slot (kunir.accumulator is not
+        # Pure, so MLIR CSE will not dedup two accumulators).
+        init_v = op.attrs["init_val"]
+        init_f = float("nan") if init_v == "nan" else float(init_v)
+        return ir.accumulator(op.attrs["name"], ts_1, init_f)
+    if isinstance(op, SetAccumulator):
+        # Side-effecting (writes the slot) but also returns the slot's
+        # new value for the current step (`mask ? value : prev`), so
+        # downstream consumers can use the SetAccumulator's SSA result
+        # directly — matches the CPU C++ SetAccumulator semantics.
+        return ir.set_accumulator(val_map[op.inputs[0]],
+                                   val_map[op.inputs[1]],
+                                   val_map[op.inputs[2]])
+    if isinstance(op, ReturnFirstValue):
+        # In the Python graph IR, ReturnFirstValue's only job is to keep
+        # side-effecting siblings (SetAccumulator etc.) reachable from a
+        # graph output so the GC does not drop them.  In SSA-MLIR the
+        # side-effect ops are preserved by their own MemWrite semantics;
+        # ReturnFirstValue carries no new MLIR-level meaning, so we just
+        # forward the first input's Value.  Other inputs were already
+        # emitted in topo order before we got here.
+        return val_map[op.inputs[0]]
+    raise NotImplementedError(
+        f"CodegenMLIR: op type {cls.__name__} is not supported by the "
+        f"GPU backend yet (op = {op})")
+
+
+def _emit_reduction(op: ReductionOp,
+                     ir: KunMLIR.IRBuilder,
+                     val_map: Dict[OpBase, KunMLIR.Value]) -> KunMLIR.Value:
+    """Emit the kunir reduce op matching `op`; returns its result Value."""
+    kind, arity = type(op), len(op.inputs)
+    if kind in _REDUCE_WITH_CURRENT:
+        if arity != 2:
+            raise NotImplementedError(
+                f"CodegenMLIR: {kind.__name__} expects 2 inputs (iter, "
+                f"current); got {arity} (op = {op})")
+        return getattr(ir, _REDUCE_WITH_CURRENT[kind])(
+            val_map[op.inputs[0]], val_map[op.inputs[1]])
+    if kind not in _REDUCE:
+        raise NotImplementedError(
+            f"CodegenMLIR: reduction {kind.__name__} not supported yet "
+            f"(op = {op})")
+    if arity != 1:
+        raise NotImplementedError(
+            "CodegenMLIR: reductions with init_val are not supported "
+            f"yet (op = {op})")
+    return getattr(ir, _REDUCE[kind])(val_map[op.inputs[0]])
+
+
+# ── Main entry point ────────────────────────────────────────────────
+
+def _maybe_external_partition(f: Function, dtype: str) -> Optional[dict]:
+    """Describe `f` as an external kernel, or return None.
+
+    A non-None result means the GPU runtime handles this partition
+    with a pre-compiled kernel (bundled PTX loaded as a separate
+    CUmodule) instead of a kunir.func.  The dict is meant for
+    KunMLIR.compile's `external_kernels=` parameter:
+        {"name": <str>, "kind": <str>,
+         "inputs": [<str>...], "outputs": [<str>...]}
+
+    The accepted shape mirrors CodegenCpp's "simple cross-sectional
+    fast path" (the `len(f.ops) == 3` check in
+    CodegenCpp.codegen_cpp): exactly one Input, one Output and one
+    compute op, which must be a `SimpleCrossSectionalOp` that is a
+    Rank or a Scale.  The partitioner gives every CrossSectionalOp a
+    partition of its own, so this is the shape that arrives here.
+
+    `kind` is `cs_<op>_f{32,64}`.  Do not fabricate kinds for other
+    cross-sectional ops unless the C++ runtime bundles a matching
+    external kernel."""
+    inputs, outputs, compute = [], [], []
+    for op in f.ops:
+        bucket = (inputs if isinstance(op, Input)
+                  else outputs if isinstance(op, Output) else compute)
+        bucket.append(op)
+    if (len(compute), len(inputs), len(outputs)) != (1, 1, 1):
+        return None  # not the Input -> single op -> Output shape
+    cs_op = compute[0]
+    if not (isinstance(cs_op, SimpleCrossSectionalOp)
+            and isinstance(cs_op, (Rank, Scale))):
+        return None
+    if dtype not in ("f32", "f64"):
+        return None
+    op_kind = cs_op.__class__.__name__.lower()
+    return {
+        "name":    f.name or f"cs_{op_kind}",
+        "kind":    f"cs_{op_kind}_{dtype}",
+        "inputs":  [op.attrs["name"] for op in inputs],
+        "outputs": [op.attrs["name"] for op in outputs],
+    }
+
+
+def translate_function(f: Function, target: TargetSpec,
+                        ir: KunMLIR.IRBuilder,
+                        dtype: str = "f32",
+                        unreliable_count: int = 0) -> Optional[dict]:
+    """Emit `f` as a single kunir.func into the open `ir` (KunMLIR.IRBuilder).
+
+    If `f` is an externally-dispatched partition (e.g. a single cs_rank
+    op handled by the bundled cs_rank.ptx CUmodule), emit nothing into
+    the IRBuilder and return its descriptor dict so the caller can pass
+    it to KunMLIR.compile()'s `external_kernels=` list.  Otherwise
+    return `None` after emitting a kunir.func.
+
+    `unreliable_count` is the partition-local warmup depth — the caller
+    computes it (e.g. via `infer_window` on this post-partition
+    Function) and it is forwarded unchanged to `ir.begin_func`.
+    """
+    ext = _maybe_external_partition(f, dtype)
+    if ext is not None:
+        return ext
+
+    # 1.  Boundary ops in topo order — the kunir.func's I/O.
+    inputs:  List[Input]  = [op for op in f.ops if isinstance(op, Input)]
+    outputs: List[Output] = [op for op in f.ops if isinstance(op, Output)]
+    if not inputs:
+        raise ValueError("CodegenMLIR: function has no Input ops")
+    if not outputs:
+        raise ValueError("CodegenMLIR: function has no Output ops")
+
+    in_names  = [op.attrs["name"] for op in inputs]
+    out_names = [op.attrs["name"] for op in outputs]
+
+    # 2.  Pre-index loop members so we can emit each loop's body +
+    #     reductions contiguously (regardless of topo interleaving with
+    #     other loops).
+    body_ops_by_loop, reductions_by_loop = _index_loop_members(f)
+
+    # 3.  Open the kunir.func.  All inputs are ts<dtype, inf>; all
+    #     graph results are ts<dtype, 1>.
+    ts_inf = ir.ts_type(dtype, 0)  # presumably 0 encodes "inf" lookback — confirm in ts_type
+    ts_1   = ir.ts_type(dtype, 1)
+
+    func_args = ir.begin_func(
+        name=_kunir_symbol(f.name or "kernel"),
+        input_types=[ts_inf] * len(inputs),
+        input_names=in_names,
+        output_names=out_names,
+        occupancy=target.occupancy, warps_per_cta=target.warps_per_cta,
+        smem_size=target.smem_size, vector_size=target.vector_size,
+        unreliable_count=unreliable_count,
+        result_types=[ts_1] * len(outputs),
+    )
+
+    val_map: Dict[OpBase, KunMLIR.Value] = {}
+    emitted = set()
+    for inp, val in zip(inputs, func_args):
+        val_map[inp] = val
+        emitted.add(inp)
+
+    # 4.  Walk f.ops in topo order, emitting one op (or one whole loop)
+    #     at a time.
+    for op in f.ops:
+        if op in emitted:
+            continue
+        if isinstance(op, Input):
+            continue                      # already mapped from func_args
+        if isinstance(op, Output):
+            # An Output may also be read as a windowed source within the
+            # same partition; emit a kunir.output_ref so downstream sees
+            # its gmem buffer as a ts handle.
+            if any(isinstance(u, WindowedTrait)
+                    for u in f.op_to_id[op].uses):
+                val_map[op] = ir.output_ref(op.attrs["name"],
+                                              val_map[op.inputs[0]])
+            continue                      # handled at the end via Return
+        if isinstance(op, ForeachBackWindow):
+            _emit_loop(op, ir, val_map, ts_1,
+                        body_ops_by_loop.get(op, []),
+                        reductions_by_loop.get(op, []),
+                        emitted)
+            continue
+        if isinstance(op, ReductionOp) or op.get_parent() is not None:
+            # Should have been emitted as part of its enclosing loop;
+            # if we hit it here, the loop never appeared first — that's
+            # a bug in topo sort or in this translator's iteration.
+            raise RuntimeError(
+                f"CodegenMLIR: reduction/body op visited before its "
+                f"enclosing loop ({op})")
+        val_map[op] = _emit_simple(op, ir, val_map, ts_1)
+        emitted.add(op)
+
+    # 5.  Close the function with Outputs in declared order.
+    return_values = [val_map[o.inputs[0]] for o in outputs]
+    ir.end_func(return_values)
+    return None
+
+
+def _emit_loop(loop: ForeachBackWindow,
+                ir: KunMLIR.IRBuilder,
+                val_map: Dict[OpBase, KunMLIR.Value],
+                ts_1: KunMLIR.Type,
+                body_ops: List[OpBase],
+                reductions: List[ReductionOp],
+                emitted: set) -> None:
+    """Emit one for_each_back_window region: body ops first, then the
+    reductions as yield values.  Records every op it handles (and the
+    loop itself) in `val_map` / `emitted`."""
+    outer_vals = [val_map[src] for src in loop.inputs]
+    if not reductions:
+        raise NotImplementedError(
+            f"CodegenMLIR: ForeachBackWindow with no reductions "
+            f"(loop = {loop})")
+
+    block_args = ir.begin_for_each_back_window(
+        inputs=outer_vals,
+        window=int(loop.attrs["window"]),
+        result_types=[ts_1] * len(reductions),
+    )
+    # Block args are positional mirrors of loop.inputs; key them by
+    # source op so an IterValue can look up the one it iterates.
+    arg_of = {src: block_args[pos] for pos, src in enumerate(loop.inputs)}
+
+    for inner in body_ops:
+        # IterValue is not emitted: it aliases the block arg of its source.
+        if isinstance(inner, IterValue):
+            val_map[inner] = arg_of[inner.inputs[1]]
+        else:
+            val_map[inner] = _emit_simple(inner, ir, val_map, ts_1)
+        emitted.add(inner)
+
+    # Yield one value per reduction, in the order they were given.
+    yields = [_emit_reduction(red, ir, val_map) for red in reductions]
+    results = ir.end_for_each_back_window(yields)
+    for red, res in zip(reductions, results):
+        val_map[red] = res
+        emitted.add(red)
+    emitted.add(loop)
diff --git a/KunQuantMLIR/jit_cuda.py b/KunQuantMLIR/jit_cuda.py
new file mode 100644
index 0000000..7e57cb8
--- /dev/null
+++ b/KunQuantMLIR/jit_cuda.py
@@ -0,0 +1,373 @@
+"""GPU JIT entry point for KunQuant-MLIR.
+
+Mirror of `KunQuant.jit.cfake.compileit` but targets a CUDA backend
+through the KunMLIR / kunir pipeline.  Reuses the existing Driver pass
+list (`Driver.optimize`) so any IR rewrites the CPU path benefits from
+also apply here — only the codegen layer is replaced.
+
+Two-tier config split matches the CPU path:
+
+  * Per-Function knobs live in `KunCompilerConfig` (the CPU-shared
+    dataclass): `dtype`, `blocking_len`, `partition_factor`,
+    `input_layout` / `output_layout` (TS only on GPU), `options`.
+  * Compile-/link-time knobs live in `CudaCompilerConfig`: `gpu_arch`,
+    `warps_per_cta`, `smem_size`, `occupancy`, `opt_level`,
+    `toolkit_path`.  Shared across every Function in a `Library`.
+
+Single-Function compile::
+
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import compile_func, CudaCompilerConfig
+    from KunQuant.Driver import KunCompilerConfig
+
+    exe = compile_func(f,
+                        KunCompilerConfig(input_layout="TS",
+                                            output_layout="TS"),
+                        CudaCompilerConfig(gpu_arch="sm_80"))
+    executor = KunMLIR.Executor()                       # default stream
+    out = executor.runGraph(exe, {"a": cp_a, "b": cp_b})  # length auto-inferred
+    executor.synchronize()
+
+Multi-Function compile (CPU `cfake.compileit` shape)::
+
+    from KunQuant.jit.cuda import compileit, CudaCompilerConfig
+    from KunQuant.Driver import KunCompilerConfig
+
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS")
+    ccfg = CudaCompilerConfig(gpu_arch="sm_80")
+    lib = compileit([("mod1", f1, kcfg), ("mod2", f2, kcfg)],
+                     "my_lib", ccfg)
+    exe = lib.getModule("mod1")
+"""
+
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from KunQuantMLIR import KunMLIR
+
+from KunQuant.Driver import KunCompilerConfig, optimize, post_optimize
+from KunQuant.Op import Input, Output, MayRequireWholeTime
+from KunQuant.passes import do_partition
+from KunQuant.passes.InferWindow import infer_window
+from KunQuant.Stage import Function
+from KunQuantMLIR.codegen_mlir import TargetSpec, translate_function
+
+
+# Sentinel passed via kunir.func's `unreliable_count` attribute to mean
+# "this partition needs the full time history; the runtime must launch
+# it as a single chunk".  Kept in sync with the kunir verifier (which
+# only allows -1 or non-negative) and the CUDA runtime's `computeChunkPlan`.
+_WHOLE_TIME_UNRELIABLE = -1
+
+
+# Standard locations searched when CudaCompilerConfig.toolkit_path is left
+# empty.  A toolkit dir must contain `nvvm/libdevice/libdevice.10.bc` (the
+# upstream `gpu-module-to-binary` pass links libdevice into the LLVM
+# module) and `bin/ptxas` (PTX → cubin).
+_TOOLKIT_ENV_VARS  = ("CUDA_HOME", "CUDA_PATH", "CUDA_TOOLKIT_PATH",
+                       "CUDA_ROOT")
+_TOOLKIT_FALLBACKS = ("/usr/local/cuda", "/opt/cuda", "/opt/nvidia/cuda")
+
+
+def _is_toolkit_dir(path: str) -> bool:
+    """True iff `path` has nvvm/libdevice/libdevice.10.bc and bin/ptxas."""
+    required = (("nvvm", "libdevice", "libdevice.10.bc"), ("bin", "ptxas"))
+    # bool(...) so an empty `path` yields False rather than the "" itself.
+    return bool(path) and all(os.path.isfile(os.path.join(path, *p)) for p in required)
+
+
+def find_cuda_toolkit(override: str = "") -> str:
+    """Return the first CUDA toolkit root usable by `gpu-module-to-binary`.
+
+    Candidates, in order (each must pass `_is_toolkit_dir`):
+      1. `override`, when non-empty
+      2. $CUDA_HOME / $CUDA_PATH / $CUDA_TOOLKIT_PATH / $CUDA_ROOT
+      3. Standard install paths (/usr/local/cuda, /opt/cuda, …)
+
+    Raises FileNotFoundError when none qualifies; the message names
+    every candidate that was examined.
+    """
+    # Gather every candidate first, then probe them in priority order.
+    candidates = []
+    if override:
+        candidates.append((f"override={override!r}", override))
+    for env in _TOOLKIT_ENV_VARS:
+        val = os.environ.get(env, "")
+        if val:
+            candidates.append((f"${env}={val!r}", val))
+    candidates.extend((f"fallback={fb!r}", fb) for fb in _TOOLKIT_FALLBACKS)
+
+    tried = []
+    for label, root in candidates:
+        tried.append(label)
+        if _is_toolkit_dir(root):
+            return root
+    raise FileNotFoundError(
+        "Could not locate a CUDA toolkit (need "
+        "<root>/nvvm/libdevice/libdevice.10.bc and <root>/bin/ptxas). "
+        "Searched: " + ", ".join(tried) +
+        ". Set CUDA_PATH or pass toolkit_path explicitly.")
+
+
+@dataclass
+class CudaCompilerConfig:
+    """Compile- / link-time knobs that are shared across every Function
+    in a `Library`.  Per-Function graph-rewriting knobs (dtype,
+    blocking_len, partition_factor, layout, pass options) live in
+    `KunQuant.Driver.KunCompilerConfig` instead — the same dataclass
+    the CPU path uses.
+    """
+    gpu_arch:    str = "sm_80"  # passed to KunMLIR.compile(gpu_arch=...)
+
+    # kunir.target_spec — graph-wide for v0.  `vector_size` is taken
+    # from the per-Function `KunCompilerConfig.blocking_len` at compile
+    # time (the two are the same concept on GPU).
+    occupancy:     int = 1
+    warps_per_cta: int = 4
+    smem_size:     int = 49152
+
+    # LLVM optimization level (forwarded to #nvvm.target<O = ...>).
+    opt_level:     int  = 3
+    # Path to the CUDA toolkit (where libdevice.10.bc + ptxas live).
+    # Empty → `find_cuda_toolkit` tries the CUDA_* env vars, then standard paths.
+    toolkit_path:  str  = ""
+
+
+def _resolve_vector_size(kcfg: KunCompilerConfig) -> int:
+    """Map the per-Function `blocking_len` to kunir's `vector_size`
+    (the same knob on GPU); an unset value means scalar kunir, i.e. 1."""
+    blocking_len = kcfg.blocking_len
+    return int(blocking_len) if blocking_len is not None else 1
+
+
+def _gpu_pass_options(kcfg: KunCompilerConfig) -> dict:
+    """Build the `options` dict handed to `Driver.optimize` on GPU.
+
+    Contents, in the order they are applied:
+      * `blocking_len` — needed by some decompose paths (it's also the
+        skip-list / naive cost-model knob);
+      * everything in `kcfg.options`, including `no_fast_stat`,
+        `opt_reduce`, `fast_log`, all of which the GPU lowering now
+        supports;
+      * `may_slice_time=True` unless the user set it.  It is the safe
+        GPU default because the runtime can split a single graph
+        launch into multiple time chunks; users who guarantee
+        single-chunk launches may explicitly set it to False;
+      * `no_skip_list=True`, forced unconditionally: the kunir codegen
+        has no lowering for `SkipList*` ops, so the naive
+        `ForeachBackWindow + Reduce*` path is the only one that
+        lowers on GPU;
+      * `experimental_expand=True`, likewise forced."""
+    opts: dict = {"blocking_len": _resolve_vector_size(kcfg)}
+    opts.update(kcfg.options or {})
+    opts.setdefault("may_slice_time", True)
+    # Forced last so they win over anything in kcfg.options.  The
+    # pipeline lowering doesn't know about ExpMovingAvg or the
+    # WindowedLinearRegression* family, hence the Accumulator-based
+    # expansion pass.
+    opts.update(no_skip_list=True, experimental_expand=True)
+    return opts
+
+
+def _to_dtype_token(dtype: str) -> str:
+    """Map 'float' -> 'f32' and 'double' -> 'f64'; else ValueError."""
+    for name, token in (("float", "f32"), ("double", "f64")):
+        if dtype == name: return token
+    raise ValueError(f"compile_func: unsupported dtype '{dtype}' (supported: "
+                     f"float, double — kunir today only lowers float on GPU)")
+
+
+def _validate_kun_cfg(kcfg: KunCompilerConfig) -> None:
+    """Reject configs the GPU path cannot honour: both layouts must be
+    'TS' (kunir runtime is TS-major) and dtype 'float' or 'double'.
+    Raises ValueError naming the first offending field."""
+    for field in ("input_layout", "output_layout"):
+        layout = getattr(kcfg, field)
+        if layout != "TS":
+            raise ValueError(
+                f"GPU backend only supports {field}='TS', got "
+                f"{layout!r}")
+    if kcfg.dtype in ("float", "double"):
+        return
+    raise ValueError(
+        f"KunCompilerConfig.dtype must be 'float' or 'double', got "
+        f"{kcfg.dtype!r}")
+
+
+def _graph_io_names(f: Function):
+    """Return `(input_names, output_names)` of the user-facing graph.
+    Call it BEFORE optimize + do_partition: those passes mutate `f` and
+    may scatter the Input/Output ops across multiple sub-Functions.
+    Raises ValueError when `f` has no Input or no Output op."""
+    def names_of(kind):
+        return [op.attrs["name"] for op in f.ops if isinstance(op, kind)]
+    ins, outs = names_of(Input), names_of(Output)
+    if not ins:
+        raise ValueError("compile_func: function has no Input ops")
+    if not outs:
+        raise ValueError("compile_func: function has no Output ops")
+    return ins, outs
+
+
+def _run_full_pipeline(f: Function, kcfg: KunCompilerConfig):
+    """Run optimize -> infer_window -> do_partition -> post_optimize on
+    `f` (mutated in place).  Returns `(impl, global_unreliable)`, where
+    `global_unreliable` is the `infer_window` result taken before
+    partitioning (keyed by Output name)."""
+    options = _gpu_pass_options(kcfg)
+    optimize(f, options)
+    global_unreliable = infer_window(f, options)
+    _mainf, impl = do_partition(f, kcfg.partition_factor, options)
+    post_optimize(impl, options)
+    return impl, global_unreliable
+
+
+def _translate_partitions(impl, kcfg: KunCompilerConfig,
+                            ccfg: CudaCompilerConfig):
+    """Emit one kunir.func per partitioned Function into a single
+    KunMLIR module (single `gpu.module` with N siblings).  Cross-
+    partition buffers stitch up automatically because each impl's
+    Input/Output names match the producing/consuming partition's
+    Output/Input names.
+
+    Cross-sectional partitions (currently: Rank / Scale) bypass the kunir
+    pipeline entirely — `translate_function` returns a descriptor and
+    we collect those into `external_kernels`, which the C++ side
+    appends to the executable's kernel list without ever generating
+    LLVM IR / PTX for them.
+
+    Returns (ModuleOp, list[dict]) — the second element is the list
+    of external-kernel descriptors to forward to KunMLIR.compile.
+    """
+    target = TargetSpec(occupancy=ccfg.occupancy,
+                          warps_per_cta=ccfg.warps_per_cta,
+                          smem_size=ccfg.smem_size,
+                          vector_size=_resolve_vector_size(kcfg))
+    ir = KunMLIR.IRBuilder()
+    dtype = _to_dtype_token(kcfg.dtype)
+    externals = []
+    for sub in impl:
+        # Per-kernel warmup is partition-local: the runtime serialises
+        # kernel launches so an upstream kernel's reliable writes are
+        # already in place by the time a downstream kernel reads.  Each
+        # kernel's chunk grid only needs to cover its own local warmup.
+        if any(isinstance(op, MayRequireWholeTime)
+                and op.is_whole_time_required()
+                for op in sub.ops):
+            per_kernel_unreliable = _WHOLE_TIME_UNRELIABLE
+        else:
+            per_kernel_unreliable = max(infer_window(sub).values(), default=0)
+        ext = translate_function(sub, target, ir, dtype=dtype,
+                                   unreliable_count=per_kernel_unreliable)
+        if ext is not None:
+            externals.append(ext)
+    return ir.finish(), externals
+
+
+def compile_func(f: Function, kcfg: KunCompilerConfig,
+                   ccfg: CudaCompilerConfig) -> KunMLIR.Executable:
+    """Compile a single KunQuant Function to a GPU `KunMLIR.Executable`.
+
+    Validates `kcfg` and finds the CUDA toolkit first (raising ValueError
+    / FileNotFoundError), then mirrors `KunQuant.jit.cfake.compileit`:
+      1. Capture user-facing Input/Output names (graph_inputs/outputs).
+      2. Run Driver.optimize on `f` in place.
+      3. do_partition splits `f` into one or more sub-Functions.
+      4. post_optimize per sub-Function (TempWindowElim + MergeLoops + …).
+      5. Translate each sub-Function into a kunir.func (siblings in one
+         gpu.module).
+      6. Hand off to KunMLIR.compile, which generates the cubin and
+         resolves cross-kernel data flow via I/O names.
+    """
+    _validate_kun_cfg(kcfg)
+
+    toolkit_path = find_cuda_toolkit(ccfg.toolkit_path)
+
+    graph_inputs, graph_outputs = _graph_io_names(f)
+    impl, global_unreliable = _run_full_pipeline(f, kcfg)
+    mod, externals = _translate_partitions(impl, kcfg, ccfg)
+
+    return KunMLIR.compile(
+        mod,
+        graph_inputs=graph_inputs,
+        graph_outputs=graph_outputs,
+        gpu_arch=ccfg.gpu_arch,
+        opt_level=ccfg.opt_level,
+        toolkit_path=toolkit_path,
+        external_kernels=externals,
+        # Forwarded for the no-JIT-kernel case: when every partition
+        # is external (e.g. a graph that is just `cs_rank(a)`), the
+        # MLIR module is empty and `data.warpsPerCta` would otherwise
+        # default to 1 — but the cs_rank launch uses it to size
+        # blockDim, so feed the config value through.
+        warps_per_cta=ccfg.warps_per_cta,
+        output_unreliable=global_unreliable,
+    )
+
+
+class Library:
+    """Named collection of `KunMLIR.Executable`s shaped like the CPU
+    `kr.Library`, so several Functions can be compiled together and then
+    fetched by name.  The multi-Function `compileit` below returns one.
+    """
+    def __init__(self, libname: str = "") -> None:
+        self.libname = libname
+        self._modules: dict = {}
+
+    def getModule(self, name: str) -> KunMLIR.Executable:
+        if name in self._modules:
+            return self._modules[name]
+        raise RuntimeError(
+            f"Library.getModule: no module named '{name}' "
+            f"(have: {sorted(self._modules)})")
+
+    @property
+    def names(self):
+        """All compiled module names in registration order."""
+        return list(self._modules)
+
+    def _add(self, name: str, exe: KunMLIR.Executable) -> None:
+        if name not in self._modules:
+            self._modules[name] = exe
+            return
+        raise RuntimeError(f"Library: duplicate module name '{name}'")
+
+
+def compileit(
+    funclist: List[Tuple[str, Function, KunCompilerConfig]],
+    libname: str,
+    compiler_config: CudaCompilerConfig,
+) -> Library:
+    """Build a `Library` from `(name, Function, KunCompilerConfig)`
+    entries, following the call shape of
+    `KunQuant.jit.cfake.compileit(func, libname, compiler_config)`.
+
+    The third tuple element carries the per-Function settings (dtype /
+    blocking_len / partition_factor / layout / pass options), whereas
+    `compiler_config` is the one GPU-wide `CudaCompilerConfig` used for
+    all entries.  cfake's remaining arguments (`tempdir`, `keep_files`,
+    `load`) do not apply to the GPU path and are deliberately omitted.
+
+    Entries are compiled in list order with `compile_func`; fetch a
+    result with `lib.getModule(name)`.
+    """
+    library = Library(libname=libname)
+    for entry_name, func, func_cfg in funclist:
+        library._add(entry_name, compile_func(func, func_cfg, compiler_config))
+    return library
+
+
+def to_mlir(f: Function, kcfg: KunCompilerConfig,
+              ccfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
+    """Run the same passes + translator as `compile_func`, but return
+    the KunMLIR module before PTX/CUBIN.  External (cross-sectional)
+    partitions are absent from the returned module — they never become
+    kunir ops.  Useful for debugging the IR.  Mutates `f` in place
+    (same as `compile_func`)."""
+    _validate_kun_cfg(kcfg)
+    _graph_io_names(f)              # raises if no Input / Output ops
+    impl, _global_unreliable = _run_full_pipeline(f, kcfg)
+    mod, _externals = _translate_partitions(impl, kcfg, ccfg)
+    return mod
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
index f289d2f..ffea52e 100644
--- a/mlir/lib/KunCuda/CMakeLists.txt
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -85,14 +85,13 @@ add_library(KunCudaRuntime SHARED
 # export its public class methods so downstream .so's (KunMLIR, host
 # runners, …) can resolve them at load time.
 #
-# We also put the .so next to the KunMLIR Python module, mirroring the
-# existing project pattern (INSTALL_RPATH=$ORIGIN at the top level): all
-# co-distributed shared libs live in one directory and find each other
-# as siblings.
+# We also put the .so next to the KunMLIR Python module
+# (INSTALL_RPATH=$ORIGIN at the top level): all co-distributed shared libs
+# live in one directory and find each other as siblings.
 set_target_properties(KunCudaRuntime PROPERTIES
     CXX_VISIBILITY_PRESET default
     VISIBILITY_INLINES_HIDDEN OFF
-    LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/KunQuant/jit")
+    LIBRARY_OUTPUT_DIRECTORY "${KUN_MLIR_PYTHON_PACKAGE_DIR}")
 
 target_include_directories(KunCudaRuntime PUBLIC
     "${PROJECT_SOURCE_DIR}/mlir/include")
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
index 4e3aef5..a11c2b6 100644
--- a/mlir/lib/Python/CMakeLists.txt
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -18,14 +18,12 @@ nanobind_add_module(KunMLIR STABLE_ABI
   PyModule.cpp
 )
 
-# Drop the .so directly into the source tree's KunQuant/jit/ so it
-# imports as `KunQuant.jit.KunMLIR` (alongside cuda.py) without any
-# PYTHONPATH gymnastics.  libKunCudaRuntime.so is co-located there too
-# (see mlir/lib/KunCuda/CMakeLists.txt) so the $ORIGIN rpath resolves
-# the sibling at load time — same pattern KunRunner ↔ KunRuntime use
-# under KunQuant/runner/.
+# Drop the .so directly into the KunQuant-MLIR Python package.  The
+# libKunCudaRuntime.so runtime is co-located there too (see
+# mlir/lib/KunCuda/CMakeLists.txt) so the $ORIGIN rpath resolves the sibling
+# at load time.
 set_target_properties(KunMLIR PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/KunQuant/jit")
+    LIBRARY_OUTPUT_DIRECTORY "${KUN_MLIR_PYTHON_PACKAGE_DIR}")
 
 target_link_libraries(KunMLIR PRIVATE
   # cuda.h + libcuda stub — the binding's runGraph wrapper calls
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 9aba20b..13295e0 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -17,7 +17,7 @@ def prepend_env(name, entries):
     config.environment[name] = os.pathsep.join(entries)
 
 # Python GPU tests import the in-tree KunQuant package and load the freshly
-# built extension module from KunQuant/jit.
+# built KunQuant-MLIR extension module from KunQuantMLIR/.
 prepend_env("PYTHONPATH", [config.project_source_dir])
 
 # KunMLIR.abi3.so links against the downloaded LLVM/MLIR shared libraries.
diff --git a/python/kunquant_mlir/pyproject.toml b/python/kunquant_mlir/pyproject.toml
new file mode 100644
index 0000000..4a26837
--- /dev/null
+++ b/python/kunquant_mlir/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=69", "wheel", "cmake>=3.18"]
+build-backend = "setuptools.build_meta"
diff --git a/python/kunquant_mlir/setup.py b/python/kunquant_mlir/setup.py
new file mode 100644
index 0000000..d8e55ab
--- /dev/null
+++ b/python/kunquant_mlir/setup.py
@@ -0,0 +1,143 @@
+import datetime
+import os
+import platform
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+from setuptools import Extension, setup
+from setuptools.command.build_ext import build_ext
+
+
+_STABLE_ABI_MIN = (3, 12)
+_HAS_STABLE_ABI = (
+    sys.version_info >= _STABLE_ABI_MIN
+    and platform.python_implementation() == "CPython"
+)
+
+_PKG_ROOT = Path(__file__).resolve().parent
+_REPO_ROOT = _PKG_ROOT.parents[1]
+_VERSION_BASE = "0.1.10"
+
+
+class CMakeExtension(Extension):
+    def __init__(self, name: str, sourcedir: Path):
+        super().__init__(name, sources=[], py_limited_api=_HAS_STABLE_ABI)
+        self.sourcedir = str(sourcedir)
+
+
+class CMakeBuildExtension(build_ext):
+    def build_extension(self, ext):
+        ext_dir = Path(self.get_ext_fullpath(ext.name)).resolve().parent
+        build_temp = Path(self.build_temp).resolve()
+        build_temp.mkdir(parents=True, exist_ok=True)
+        ext_dir.mkdir(parents=True, exist_ok=True)
+
+        build_type = os.environ.get("KUN_BUILD_TYPE", "Release")
+        python_exe = sys.executable
+        cmake_args = [
+            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={ext_dir}",
+            f"-DKUN_MLIR_PYTHON_PACKAGE_DIR={ext_dir}",
+            "-DKUN_BUILD_CPU_RUNNER=OFF",
+            "-DKUN_BUILD_MLIR=ON",
+            f"-DPython_EXECUTABLE={python_exe}",
+            f"-DPYTHON_EXECUTABLE={python_exe}",
+            f"-DCMAKE_BUILD_TYPE={build_type}",
+        ]
+
+        if os.environ.get("KUN_SANITIZER", "0") != "0":
+            cmake_args.append("-DKUN_SANITIZER=ON")
+        else:
+            cmake_args.append("-DKUN_SANITIZER=OFF")
+
+        if os.environ.get("KUN_NO_AVX2", "0") != "0":
+            cmake_args.append("-DKUN_NO_AVX2=ON")
+        else:
+            cmake_args.append("-DKUN_NO_AVX2=OFF")
+
+        for var in (
+            "LLVM_DIR",
+            "MLIR_DIR",
+            "CUDAToolkit_ROOT",
+            "CMAKE_CUDA_COMPILER",
+            "LLVM_EXTERNAL_LIT",
+        ):
+            value = os.environ.get(var)
+            if value:
+                cmake_args.append(f"-D{var}={value}")
+
+        generator = os.environ.get("CMAKE_GENERATOR")
+        if not generator and shutil.which("ninja"):
+            cmake_args.extend(["-G", "Ninja"])
+
+        if "PLAT" in os.environ:
+            del os.environ["PLAT"]
+
+        subprocess.check_call(
+            ["cmake", "-S", ext.sourcedir, "-B", str(build_temp)] + cmake_args
+        )
+
+        build_args = [
+            "cmake",
+            "--build",
+            str(build_temp),
+            "--target",
+            "KunMLIR",
+        ]
+        if platform.system() == "Windows":
+            build_args += ["--config", build_type]
+        else:
+            build_args += ["--parallel"]
+        subprocess.check_call(build_args)
+
+
+try:
+    from setuptools.command.bdist_wheel import bdist_wheel
+except ImportError:
+    from wheel.bdist_wheel import bdist_wheel  # type: ignore
+
+
+class BdistWheelABI3(bdist_wheel):
+    def finalize_options(self):
+        super().finalize_options()
+        if _HAS_STABLE_ABI:
+            self.py_limited_api = "cp{}{}".format(*_STABLE_ABI_MIN)
+            self.root_is_pure = False
+
+
+if os.environ.get("KUN_USE_GIT_VERSION", "0") != "0":
+    git_ver = "." + datetime.datetime.now().strftime("%Y%m%d")
+else:
+    git_ver = ""
+
+version = _VERSION_BASE + git_ver
+package_dir = os.path.relpath(_REPO_ROOT / "KunQuantMLIR", _PKG_ROOT)
+
+
+setup(
+    name="KunQuant-MLIR",
+    version=version,
+    description="Optional MLIR/CUDA backend for KunQuant",
+    long_description=(_REPO_ROOT / "Readme.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    author="Menooker",
+    author_email="menooker@live.com",
+    packages=["KunQuantMLIR"],
+    package_dir={"KunQuantMLIR": package_dir},
+    package_data={"KunQuantMLIR": ["*.so", "*.pyd", "*.dll", "*.dylib"]},
+    include_package_data=True,
+    ext_modules=[
+        CMakeExtension("KunQuantMLIR.KunMLIR", _REPO_ROOT),
+    ],
+    cmdclass={
+        "build_ext": CMakeBuildExtension,
+        "bdist_wheel": BdistWheelABI3,
+    },
+    python_requires=">=3.9",
+    install_requires=[
+        f"KunQuant=={version}",
+        "numpy",
+    ],
+    zip_safe=False,
+)

From bbd2b8f1e40f8a7dcbdcb49b553d36b34a8a3ef2 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 25 May 2026 19:04:31 -0700
Subject: [PATCH 56/59] publish

---
 .github/workflows/Dockerfile.mlir           |  30 +++++
 .github/workflows/ccpp.yml                  |  14 ++-
 .github/workflows/docker.yml                |  30 ++++-
 .github/workflows/publish-kunquant-mlir.yml | 125 ++++++++++++++++++++
 python/kunquant_mlir/setup.py               |   1 -
 5 files changed, 194 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/Dockerfile.mlir
 create mode 100644 .github/workflows/publish-kunquant-mlir.yml

diff --git a/.github/workflows/Dockerfile.mlir b/.github/workflows/Dockerfile.mlir
new file mode 100644
index 0000000..9980418
--- /dev/null
+++ b/.github/workflows/Dockerfile.mlir
@@ -0,0 +1,30 @@
+FROM quay.io/pypa/manylinux_2_28_x86_64:2025.05.16-1
+
+RUN rm -rf /opt/_internal/pipx/venvs/cmake \
+    && dnf install -y \
+        ca-certificates \
+        cmake \
+        curl \
+        dnf-plugins-core \
+        libxml2-devel \
+        libzstd-devel \
+        ninja-build \
+        pkgconf-pkg-config \
+        zlib-devel \
+    && dnf config-manager --add-repo \
+        https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \
+    && dnf install -y cuda-toolkit-13-2 \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf
+
+RUN /opt/python/cp312-cp312/bin/python -m pip install --no-cache-dir lit \
+    && ln -s /opt/python/cp312-cp312/bin/lit /usr/local/bin/lit
+
+ENV CUDA_PATH=/usr/local/cuda-13.2
+ENV CUDA_HOME=/usr/local/cuda-13.2
+ENV CUDAToolkit_ROOT=/usr/local/cuda-13.2
+ENV CMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc
+ENV PATH=/usr/local/cuda-13.2/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/lib64:/usr/local/cuda-13.2/lib64/stubs
+
+RUN ln -sf libcuda.so /usr/local/cuda-13.2/lib64/stubs/libcuda.so.1
diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index c271b3a..e1393d2 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -141,7 +141,7 @@ jobs:
       uses: actions/cache@v4
       with:
         path: .cache/cuda-13.2
-        key: ${{ runner.os }}-${{ runner.arch }}-cuda-toolkit-13.2-v1
+        key: ${{ runner.os }}-${{ runner.arch }}-cuda-mlir-minimal-13.2-v1
     - name: Install CUDA 13.2
       if: steps.cache-cuda.outputs.cache-hit != 'true'
       run: |
@@ -155,7 +155,11 @@ jobs:
         echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ /" \
           | sudo tee /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list
         sudo apt-get update
-        sudo apt-get install -y --no-install-recommends cuda-toolkit-13-2
+        sudo apt-get install -y --no-install-recommends \
+          cuda-nvcc-13-2 \
+          cuda-cudart-dev-13-2 \
+          cuda-driver-dev-13-2 \
+          libnvvm-13-2
         mkdir -p .cache
         sudo tar -C /usr/local -cf "$RUNNER_TEMP/cuda-13.2.tar" cuda-13.2
         tar -C .cache -xf "$RUNNER_TEMP/cuda-13.2.tar"
@@ -166,6 +170,12 @@ jobs:
           sudo ln -s "$GITHUB_WORKSPACE/.cache/cuda-13.2" /usr/local/cuda-13.2
         fi
         sudo ln -sf libcuda.so /usr/local/cuda-13.2/lib64/stubs/libcuda.so.1
+        test -x /usr/local/cuda-13.2/bin/nvcc
+        test -x /usr/local/cuda-13.2/bin/ptxas
+        test -f /usr/local/cuda-13.2/include/cuda.h
+        test -f /usr/local/cuda-13.2/include/cuda_runtime.h
+        test -f /usr/local/cuda-13.2/lib64/stubs/libcuda.so
+        test -f /usr/local/cuda-13.2/nvvm/libdevice/libdevice.10.bc
         /usr/local/cuda-13.2/bin/nvcc --version
     - name: Install Python dependencies
       run: |
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index a60b5b2..d1332ca 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -4,16 +4,34 @@ name: Create and publish a Docker image
 # Configures this workflow to run every time a change is pushed to the branch called `release`.
 on:
   workflow_dispatch:
+    inputs:
+      image:
+        description: Image to build
+        required: true
+        default: both
+        type: choice
+        options:
+        - core
+        - mlir
+        - both
 
 # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
 env:
   REGISTRY: ghcr.io
-  IMAGE_NAME: ${{ github.repository }}
 
 # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
 jobs:
   build-and-push-image:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - target: core
+            image: ghcr.io/menooker/kunquant
+            dockerfile: Dockerfile
+          - target: mlir
+            image: ghcr.io/menooker/kunquant-mlir
+            dockerfile: Dockerfile.mlir
     # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
     permissions:
       contents: read
@@ -23,9 +41,11 @@ jobs:
       #
     steps:
       - name: Checkout repository
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         uses: actions/checkout@v4
       # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
       - name: Log in to the Container registry
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
         with:
           registry: ${{ env.REGISTRY }}
@@ -33,27 +53,31 @@ jobs:
           password: ${{ secrets.GITHUB_TOKEN }}
       # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
       - name: Extract metadata (tags, labels) for Docker
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         id: meta
         uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
         with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          images: ${{ matrix.image }}
       # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
       # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see [Usage](https://github.com/docker/build-push-action#usage) in the README of the `docker/build-push-action` repository.
       # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
       - name: Build and push Docker image
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         id: push
         uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
         with:
           context: .github/workflows
+          file: .github/workflows/${{ matrix.dockerfile }}
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
       
       # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [Using artifact attestations to establish provenance for builds](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds).
       - name: Generate artifact attestation
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         uses: actions/attest-build-provenance@v2
         with:
-          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
+          subject-name: ${{ matrix.image }}
           subject-digest: ${{ steps.push.outputs.digest }}
           push-to-registry: true
       
diff --git a/.github/workflows/publish-kunquant-mlir.yml b/.github/workflows/publish-kunquant-mlir.yml
new file mode 100644
index 0000000..e8c365f
--- /dev/null
+++ b/.github/workflows/publish-kunquant-mlir.yml
@@ -0,0 +1,125 @@
+name: Publish KunQuant-MLIR to PyPI and TestPyPI
+
+on:
+  workflow_dispatch:
+    inputs:
+      target:
+        description: Publish target
+        required: true
+        default: testpypi
+        type: choice
+        options:
+        - testpypi
+        - pypi
+        - both
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    name: Build KunQuant-MLIR wheel
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+        cache: pip
+    - name: Install build dependencies
+      run: |
+        set -eux
+        python -m pip install --upgrade pip
+        python -m pip install cibuildwheel
+    - name: Read LLVM tag
+      id: llvm_tag
+      run: |
+        tag="$(sed -e 's/#.*//' -e '/^[[:space:]]*$/d' mlir/llvm_commit.txt | head -n1 | tr -d '[:space:]')"
+        test -n "$tag"
+        echo "tag=$tag" >> "$GITHUB_OUTPUT"
+        echo "LLVM tag: $tag"
+    - name: Download prebuilt LLVM/MLIR
+      env:
+        LLVM_TAG: ${{ steps.llvm_tag.outputs.tag }}
+      run: |
+        set -eux
+        mkdir -p "$RUNNER_TEMP/llvm-mlir"
+        curl -fL --retry 3 \
+          "https://github.com/Menooker/KunQuant/releases/download/llvm-mlir-${LLVM_TAG}/llvm-mlir-install-static-${LLVM_TAG}.tar.gz" \
+          -o "$RUNNER_TEMP/llvm-mlir.tar.gz"
+        tar -xzf "$RUNNER_TEMP/llvm-mlir.tar.gz" -C "$RUNNER_TEMP/llvm-mlir" --strip-components=1
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/mlir/MLIRConfig.cmake"
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/llvm/LLVMConfig.cmake"
+        echo "LLVM_PREFIX=$RUNNER_TEMP/llvm-mlir" >> "$GITHUB_ENV"
+    - name: Build wheel
+      run: |
+        set -eux
+        export CIBW_BUILD="cp312-manylinux_x86_64"
+        export CIBW_BUILD_FRONTEND="build"
+        export CIBW_MANYLINUX_X86_64_IMAGE="ghcr.io/menooker/kunquant-mlir:main"
+        export CIBW_REPAIR_WHEEL_COMMAND_LINUX="auditwheel repair --exclude libcuda.so --exclude libcuda.so.1 -w {dest_dir} {wheel}"
+        export CIBW_ENVIRONMENT="KUN_USE_GIT_VERSION=0 CUDA_PATH=/usr/local/cuda-13.2 CUDA_HOME=/usr/local/cuda-13.2 CUDAToolkit_ROOT=/usr/local/cuda-13.2 CMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc LLVM_DIR=/host${LLVM_PREFIX}/lib/cmake/llvm MLIR_DIR=/host${LLVM_PREFIX}/lib/cmake/mlir LLVM_EXTERNAL_LIT=lit LD_LIBRARY_PATH=/usr/local/cuda-13.2/lib64:/usr/local/cuda-13.2/lib64/stubs"
+        python -m cibuildwheel python/kunquant_mlir --platform linux --output-dir wheelhouse
+    - name: Check wheel contents
+      run: |
+        python - <<'PY'
+        import glob
+        import zipfile
+
+        wheel, = glob.glob("wheelhouse/*.whl")
+        with zipfile.ZipFile(wheel) as zf:
+            names = set(zf.namelist())
+        assert "KunQuantMLIR/KunMLIR.abi3.so" in names
+        assert "KunQuantMLIR/libKunCudaRuntime.so" in names
+        print(wheel)
+        PY
+    - name: Store wheel
+      uses: actions/upload-artifact@v4
+      with:
+        name: kunquant-mlir-wheel
+        path: wheelhouse/*.whl
+        if-no-files-found: error
+
+  publish-to-testpypi:
+    name: Publish KunQuant-MLIR to TestPyPI
+    if: ${{ inputs.target == 'testpypi' || inputs.target == 'both' }}
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: testpypi
+      url: https://test.pypi.org/p/KunQuant-MLIR
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+    - name: Download wheel
+      uses: actions/download-artifact@v4
+      with:
+        name: kunquant-mlir-wheel
+        path: dist/
+    - name: Publish wheel to TestPyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        repository-url: https://test.pypi.org/legacy/
+
+  publish-to-pypi:
+    name: Publish KunQuant-MLIR to PyPI
+    if: ${{ inputs.target == 'pypi' || inputs.target == 'both' }}
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/KunQuant-MLIR
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+    - name: Download wheel
+      uses: actions/download-artifact@v4
+      with:
+        name: kunquant-mlir-wheel
+        path: dist/
+    - name: Publish wheel to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/python/kunquant_mlir/setup.py b/python/kunquant_mlir/setup.py
index d8e55ab..46d63c9 100644
--- a/python/kunquant_mlir/setup.py
+++ b/python/kunquant_mlir/setup.py
@@ -137,7 +137,6 @@ def finalize_options(self):
     python_requires=">=3.9",
     install_requires=[
         f"KunQuant=={version}",
-        "numpy",
     ],
     zip_safe=False,
 )

From ef1eab889e14a9578c8407f90682363232ceffb6 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Mon, 25 May 2026 20:34:11 -0700
Subject: [PATCH 57/59] copy buffer in bench

---
 CMakeLists.txt         |  4 +++-
 tests/test_alpha101.py | 23 ++++++++++++++++-------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca22587..ed21190 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,7 +130,9 @@ if(KUN_BUILD_MLIR)
         "nvcc used by CMake's CUDA-language support.")
   endif()
   get_filename_component(KUN_CUDA_TOOLKIT_ROOT
-                         "${CUDAToolkit_BIN_DIR}" DIRECTORY)
+                         "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY)
+  get_filename_component(KUN_CUDA_TOOLKIT_ROOT
+                         "${KUN_CUDA_TOOLKIT_ROOT}" DIRECTORY)
   message(STATUS
       "KunQuant MLIR CUDA toolkit = ${KUN_CUDA_TOOLKIT_ROOT} "
       "(version ${CUDAToolkit_VERSION})")
diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index fd69256..41f6135 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -171,6 +171,7 @@ def create_multi_thread_executor(n):
     return _kr_mlir.Executor() if GPU_MODE else kr.createMultiThreadExecutor(n)
 
 gpu_inputs = None
+gpu_outputs = None
 def run_graph(executor, benchmode, modu, inputs, cur_time, length, outputs=None, **kwargs):
     if not GPU_MODE:
         return kr.runGraph(executor, modu, inputs, cur_time, length,
@@ -178,17 +179,25 @@ def run_graph(executor, benchmode, modu, inputs, cur_time, length, outputs=None,
     if cur_time != 0:
         raise RuntimeError("GPU alpha101 test only supports cur_time=0")
     global gpu_inputs
-    if not benchmode:
+    global gpu_outputs
+    if not benchmode or gpu_inputs is None:
         gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
-    ret = executor.runGraph(modu, gpu_inputs, cur_time=cur_time,
+    else:
+        for k, v in inputs.items():
+            gpu_inputs[k].set(v)
+    ret = executor.runGraph(modu, gpu_inputs, outputs=gpu_outputs, cur_time=cur_time,
                             length=length,
                             use_cuda_graph=USE_CUDA_GRAPH)
     if benchmode:
-        if USE_CUDA_GRAPH:
-            executor.synchronize()
-        return ret
-    executor.synchronize()
-
+        gpu_outputs = ret
+        out_np = {}
+        for k, v in ret.items():
+            arr = v if isinstance(v, cp.ndarray) else cp.from_dlpack(v)
+            ret[k] = arr
+            host = cp.asnumpy(arr, blocking=False)
+            out_np[k] = host
+        executor.synchronize()
+        return out_np
     out_np = {}
     for k, v in ret.items():
         arr = v if isinstance(v, cp.ndarray) else cp.from_dlpack(v)

From 2ad576264a4156bb8b5f2a30cc0004a6554774ec Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 26 May 2026 01:45:16 -0700
Subject: [PATCH 58/59] overlap runner

---
 KunQuantMLIR/OverlapRunner.py           | 280 ++++++++++++++++++++++++
 mlir/test/python/test_overlap_runner.py | 144 ++++++++++++
 tests/test_alpha101.py                  |  97 +++++---
 3 files changed, 489 insertions(+), 32 deletions(-)
 create mode 100644 KunQuantMLIR/OverlapRunner.py
 create mode 100644 mlir/test/python/test_overlap_runner.py

diff --git a/KunQuantMLIR/OverlapRunner.py b/KunQuantMLIR/OverlapRunner.py
new file mode 100644
index 0000000..4deef06
--- /dev/null
+++ b/KunQuantMLIR/OverlapRunner.py
@@ -0,0 +1,280 @@
+"""Pipelined CUDA runner for overlapping copies with KunMLIR launches."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Protocol, Tuple, Union
+
+import numpy as np
+
+try:
+    import cupy as cp
+except ImportError as exc:
+    raise ImportError(
+        "KunQuantMLIR.OverlapRunner requires CuPy for asynchronous CUDA "
+        "H2D/D2H copies. Install a CuPy build matching your CUDA runtime, "
+        "for example cupy-cuda12x."
+    ) from exc
+
+from KunQuantMLIR import KunMLIR
+
+
+CudaStream = Union[cp.cuda.Stream, cp.cuda.ExternalStream]
+
+
+class DLPackProvider(Protocol):
+    def __dlpack__(self) -> object:
+        ...
+
+
+def _stream_from_executor(executor: KunMLIR.Executor) -> CudaStream:
+    if executor.stream == 0:
+        return cp.cuda.Stream.null
+    return cp.cuda.ExternalStream(executor.stream)
+
+
+@dataclass
+class PendingResult:
+    """Host outputs from an asynchronous D2H copy.
+
+    This object holds the host output block so it stays alive while the
+    copy is in flight. Call ``wait()`` before reading the arrays.
+    """
+
+    _output_names: List[str]
+    _host_output_block: np.ndarray
+    _done_event: cp.cuda.Event
+    _outputs: Optional[Dict[str, np.ndarray]] = None
+
+    def wait(self) -> Dict[str, np.ndarray]:
+        self._done_event.synchronize()
+        if self._outputs is None:
+            self._outputs = {
+                name: self._host_output_block[i]
+                for i, name in enumerate(self._output_names)
+            }
+        return self._outputs
+
+
+@dataclass
+class _Slot:
+    index: int
+    dev_inputs: Dict[str, cp.ndarray]
+    dev_outputs: Optional[Dict[str, cp.ndarray]]
+    output_length: Optional[int]
+    output_num_stocks: Optional[int]
+    graph_executable: Optional[KunMLIR.Executable]
+    h2d_event: cp.cuda.Event
+    compute_event: cp.cuda.Event
+    free_event: Optional[cp.cuda.Event]
+    h2d_sources: Optional[Dict[str, np.ndarray]]
+    host_output_block: Optional[np.ndarray]
+
+
+class OverlapRunner:
+    """Submit KunMLIR ``runGraph`` calls through a copy/compute pipeline.
+
+    The runner uses three streams shared by all slots: the executor's
+    stream for compute plus two non-blocking streams it creates for H2D
+    and D2H copies. Slots only own reusable device input/output buffers
+    and references that must stay alive while async copies are in flight.
+    """
+
+    def __init__(self, executable: KunMLIR.Executable,
+                 executor: KunMLIR.Executor,
+                 num_slots: int = 3) -> None:
+        if num_slots < 2:
+            raise ValueError("OverlapRunner requires at least two slots")
+
+        self.executable = executable
+        self.executor = executor
+        self.compute_stream = _stream_from_executor(executor)
+        self.output_names = list(executable.output_names)
+
+        self.h2d_stream = cp.cuda.Stream(non_blocking=True)
+        self.d2h_stream = cp.cuda.Stream(non_blocking=True)
+
+        self._slots: List[_Slot] = [
+            _Slot(
+                index=i,
+                dev_inputs={},
+                dev_outputs=None,
+                output_length=None,
+                output_num_stocks=None,
+                graph_executable=None,
+                h2d_event=cp.cuda.Event(),
+                compute_event=cp.cuda.Event(),
+                free_event=None,
+                h2d_sources=None,
+                host_output_block=None,
+            )
+            for i in range(num_slots)
+        ]
+        self._next_slot = 0
+
+    @property
+    def num_slots(self) -> int:
+        return len(self._slots)
+
+    def submit(self, inputs: Dict[str, np.ndarray], cur_time: int = 0,
+               length: int = 0, mask: int = 0,
+               min_chunk_warmup_factor: int = 4,
+               sm_fill_factor: float = 1.5,
+               use_cuda_graph: bool = False) -> PendingResult:
+        slot = self._slots[self._next_slot]
+        self._next_slot = (self._next_slot + 1) % len(self._slots)
+
+        if slot.free_event is not None:
+            slot.free_event.synchronize()
+
+        host_inputs = self._prepare_host_inputs(inputs)
+        slot.h2d_sources = host_inputs
+        run_inputs, inputs_resized = self._copy_inputs_to_device(
+            slot, host_inputs)
+        output_length, num_stocks, output_dtype = self._output_spec(
+            host_inputs, length)
+        dev_outputs = self._cached_outputs_for_run(
+            slot, output_length, num_stocks, output_dtype, inputs_resized)
+
+        h2d_done = slot.h2d_event
+        h2d_done.record(self.h2d_stream)
+        slot.free_event = h2d_done
+
+        self.compute_stream.wait_event(h2d_done)
+        executable = self._executable_for_slot(slot, use_cuda_graph)
+        try:
+            ret = self.executor.runGraph(
+                executable,
+                run_inputs,
+                cur_time=cur_time,
+                length=length,
+                outputs=dev_outputs,
+                mask=mask,
+                min_chunk_warmup_factor=min_chunk_warmup_factor,
+                sm_fill_factor=sm_fill_factor,
+                use_cuda_graph=use_cuda_graph,
+            )
+        except Exception:
+            compute_done = slot.compute_event
+            compute_done.record(self.compute_stream)
+            slot.free_event = compute_done
+            raise
+
+        compute_done = slot.compute_event
+        compute_done.record(self.compute_stream)
+        slot.free_event = compute_done
+        slot.dev_outputs = self._to_cupy_outputs(ret)
+
+        self.d2h_stream.wait_event(compute_done)
+        host_output_block: Optional[np.ndarray] = None
+        try:
+            host_output_block = self._allocate_pinned_output_block(
+                slot.dev_outputs)
+            for i, name in enumerate(self.output_names):
+                dev_output = slot.dev_outputs[name]
+                cp.asnumpy(dev_output, stream=self.d2h_stream,
+                           out=host_output_block[i], blocking=False)
+        finally:
+            d2h_done = cp.cuda.Event()
+            d2h_done.record(self.d2h_stream)
+            slot.free_event = d2h_done
+            slot.host_output_block = host_output_block
+
+        result = PendingResult(self.output_names, host_output_block, d2h_done)
+        return result
+
+    def synchronize(self) -> None:
+        for slot in self._slots:
+            if slot.free_event is not None:
+                slot.free_event.synchronize()
+            slot.h2d_sources = None
+            slot.host_output_block = None
+
+    def _prepare_host_inputs(self, inputs: Dict[str, np.ndarray]
+                             ) -> Dict[str, np.ndarray]:
+        host_inputs: Dict[str, np.ndarray] = {}
+        for name, value in inputs.items():
+            arr = np.asarray(value)
+            if not arr.flags.c_contiguous:
+                arr = np.ascontiguousarray(arr)
+            host_inputs[name] = arr
+        return host_inputs
+
+    def _copy_inputs_to_device(self, slot: _Slot,
+                               host_inputs: Dict[str, np.ndarray]
+                               ) -> Tuple[Dict[str, cp.ndarray], bool]:
+        run_inputs: Dict[str, cp.ndarray] = {}
+        resized = False
+        for name, host in host_inputs.items():
+            dev = slot.dev_inputs.get(name)
+            if dev is None or dev.shape != host.shape or dev.dtype != host.dtype:
+                dev = cp.empty(host.shape, dtype=host.dtype)
+                slot.dev_inputs[name] = dev
+                resized = True
+            dev.set(host, stream=self.h2d_stream)
+            run_inputs[name] = dev
+        return run_inputs, resized
+
+    def _output_spec(self, host_inputs: Dict[str, np.ndarray],
+                     length: int) -> Tuple[int, int, np.dtype]:
+        if not host_inputs:
+            raise ValueError("OverlapRunner requires at least one input")
+        first = next(iter(host_inputs.values()))
+        if first.ndim != 2:
+            raise ValueError("OverlapRunner expects 2-D TS inputs")
+        output_length = first.shape[0] if length == 0 else length
+        return output_length, first.shape[1], first.dtype
+
+    def _cached_outputs_for_run(self, slot: _Slot,
+                                length: int,
+                                num_stocks: int,
+                                dtype: np.dtype,
+                                inputs_resized: bool
+                                ) -> Dict[str, cp.ndarray]:
+        needs_alloc = (
+            slot.dev_outputs is None or
+            slot.output_length != length or
+            slot.output_num_stocks != num_stocks or
+            inputs_resized
+        )
+        if needs_alloc:
+            slot.dev_outputs = {
+                name: cp.empty((length, num_stocks), dtype=dtype)
+                for name in self.output_names
+            }
+            slot.output_length = length
+            slot.output_num_stocks = num_stocks
+        return slot.dev_outputs
+
+    def _allocate_pinned_output_block(
+        self, dev_outputs: Dict[str, cp.ndarray]
+    ) -> np.ndarray:
+        if not self.output_names:
+            raise ValueError("OverlapRunner requires at least one output")
+        first = dev_outputs[self.output_names[0]]
+        shape = first.shape
+        dtype = np.dtype(first.dtype)
+        pinned = cp.cuda.alloc_pinned_memory(len(self.output_names) *
+                                            first.nbytes)
+        return np.frombuffer(
+            pinned, dtype=dtype, count=len(self.output_names) * first.size
+        ).reshape((len(self.output_names),) + shape)
+
+    def _to_cupy_outputs(self, outputs: Dict[str, Union[cp.ndarray,
+                                                        DLPackProvider]]
+                         ) -> Dict[str, cp.ndarray]:
+        ret: Dict[str, cp.ndarray] = {}
+        for name, value in outputs.items():
+            if isinstance(value, cp.ndarray):
+                ret[name] = value
+            else:
+                ret[name] = cp.from_dlpack(value)
+        return ret
+
+    def _executable_for_slot(self, slot: _Slot,
+                             use_cuda_graph: bool) -> KunMLIR.Executable:
+        if not use_cuda_graph:
+            return self.executable
+        if slot.graph_executable is None:
+            slot.graph_executable = self.executable.clone()
+        return slot.graph_executable
diff --git a/mlir/test/python/test_overlap_runner.py b/mlir/test/python/test_overlap_runner.py
new file mode 100644
index 0000000..3615bdc
--- /dev/null
+++ b/mlir/test/python/test_overlap_runner.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+# RUN: %python %s --use-cuda-graph
+# REQUIRES: cuda-device
+"""Regression test for KunQuantMLIR.OverlapRunner.
+
+The test submits more runs than there are runner slots, while changing both
+the time length and the stock count. This exercises slot reuse, cached device
+output reallocation, CUDA graph state rebuild/update, and the host output block
+that is returned as per-output NumPy slices.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import textwrap
+from dataclasses import dataclass
+
+import numpy as np
+
+
+SAMPLE_KUNIR = textwrap.dedent("""
+gpu.module @kungpu_kernels {
+  kunir.func @overlap_runner_kernel(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"sum", "diff"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+      -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+    %sum = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    %diff = kunir.sub %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %sum, %diff : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  }
+}
+""").strip()
+
+
+@dataclass
+class Case:  # one scenario submitted to the runner by main()
+    label: str  # name shown in the "submitted"/"ok" progress lines
+    time_length: int  # first dimension of the generated input arrays
+    num_stocks: int  # second dimension of the generated input arrays
+    length_arg: int  # forwarded as length= to runner.submit()
+
+
+def make_inputs(case: Case, seed: int) -> dict[str, np.ndarray]:
+    """Draw seeded float32 normal arrays "a" and "b" for one case."""
+    shape = (case.time_length, case.num_stocks)
+    rng = np.random.default_rng(seed)
+    # Draw order is part of the contract: "a" first, then "b".
+    a = rng.standard_normal(shape, dtype=np.float32)
+    b = rng.standard_normal(shape, dtype=np.float32)
+    return {"a": a, "b": b}
+
+
+def expected_outputs(inputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
+    """Compute the NumPy reference for the "sum" and "diff" outputs."""
+    lhs = inputs["a"]
+    rhs = inputs["b"]
+    return {"sum": lhs + rhs, "diff": lhs - rhs}
+
+
+def check_outputs(case: Case, actual: dict[str, np.ndarray],
+                  expected: dict[str, np.ndarray]) -> None:
+    """Assert each output's shape, dtype, layout, view-ness and values."""
+    want_shape = (case.time_length, case.num_stocks)
+    assert set(actual) == {"sum", "diff"}, actual.keys()
+    for key, got in ((k, actual[k]) for k in ("sum", "diff")):
+        assert got.shape == want_shape, (case, key, got.shape)
+        assert got.dtype == np.float32, (case, key, got.dtype)
+        assert got.flags.c_contiguous, (case, key, got.strides)
+        assert not got.flags.owndata, (case, key)
+        np.testing.assert_allclose(got, expected[key], rtol=1e-6, atol=1e-6)
+
+
+def build_cases(base_time: int, base_stocks: int) -> list[Case]:
+    """Build the submit cases; the "infer-*" cases pass 0 as length_arg."""
+    longer_t, fewer_t = base_time + 7, max(8, base_time - 5)
+    half_t, fewer_s = max(8, base_time // 2), max(8, base_stocks - 11)
+    return [
+        Case("infer-initial", base_time, base_stocks, 0),
+        Case("explicit-length-change", longer_t, base_stocks, longer_t),
+        Case("stock-count-change", fewer_t, base_stocks + 37, fewer_t),
+        Case("infer-both-change", base_time + 3, base_stocks + 79, 0),
+        Case("explicit-shorter-shape", half_t, fewer_s, half_t),
+    ]
+
+
+def main() -> int:
+    """Run every case through a two-slot OverlapRunner and check results."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default=None)
+    ap.add_argument("-T", "--time-length", type=int, default=32)
+    ap.add_argument("-S", "--num-stocks", type=int, default=257)
+    ap.add_argument("--use-cuda-graph", action="store_true")
+    args = ap.parse_args()
+    # GPU/project modules are imported only once the arguments are parsed.
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import find_cuda_toolkit
+    from KunQuant.jit.env import get_cuda_compute_capability
+    from KunQuantMLIR.OverlapRunner import OverlapRunner
+
+    import cupy as cp
+
+    args.target = args.target or get_cuda_compute_capability()
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+    # Build the executable and check the output names it reports.
+    mod = KunMLIR.parse(SAMPLE_KUNIR)
+    exe = KunMLIR.compile(mod,
+                          graph_inputs=["a", "b"],
+                          graph_outputs=["sum", "diff"],
+                          gpu_arch=args.target, opt_level=3,
+                          toolkit_path=find_cuda_toolkit())
+    assert exe.output_names == ["sum", "diff"], exe.output_names
+    # Five cases go through two slots, so slots are reused.
+    compute_stream = cp.cuda.Stream(non_blocking=True)
+    executor = KunMLIR.Executor(stream=compute_stream)
+    runner = OverlapRunner(exe, executor, num_slots=2)
+
+    print("=== overlap runner ===")
+    print(f"  target={args.target}  use_cuda_graph={args.use_cuda_graph}")
+    print(f"  executor.stream={hex(executor.stream)}")
+    # Submit every case first; results are only waited on afterwards.
+    pending = []
+    for i, case in enumerate(build_cases(args.time_length, args.num_stocks)):
+        inputs = make_inputs(case, seed=100 + i)
+        result = runner.submit(inputs,
+                               length=case.length_arg,
+                               use_cuda_graph=args.use_cuda_graph)
+        pending.append((case, inputs, result))
+        print(f"  submitted {case.label}: T={case.time_length}, "
+              f"S={case.num_stocks}, length={case.length_arg}")
+    # Wait in submission order and compare with the NumPy reference.
+    for case, inputs, result in pending:
+        actual = result.wait()
+        check_outputs(case, actual, expected_outputs(inputs))
+        print(f"  ok {case.label}")
+    runner.synchronize()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index 41f6135..f931c52 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -38,7 +38,11 @@
     import cupy as cp
     from KunQuant.jit import KunMLIR as _kr_mlir
     from KunQuant.jit import cuda as _cuda_jit
-
+    from KunQuantMLIR.OverlapRunner import OverlapRunner
+    if BENCHMODE:
+        cuda_stream = cp.cuda.Stream(non_blocking=True)
+    else:
+        cuda_stream = None
     cp.cuda.Device(0).use()
     cp.zeros((1,), dtype=cp.float32)
 
@@ -164,40 +168,47 @@ def get_output_layout(modu):
 
 
 def create_single_thread_executor():
-    return _kr_mlir.Executor() if GPU_MODE else kr.createSingleThreadExecutor()
+    return _kr_mlir.Executor(cuda_stream) if GPU_MODE else kr.createSingleThreadExecutor()
 
 
 def create_multi_thread_executor(n):
-    return _kr_mlir.Executor() if GPU_MODE else kr.createMultiThreadExecutor(n)
+    return _kr_mlir.Executor(cuda_stream) if GPU_MODE else kr.createMultiThreadExecutor(n)
+
+GPU_OVERLAP_SLOTS = 3
+
+def warmup_gpu_overlap_runner(overlap_runner, modu, inputs,
+                              cur_time, length, **kwargs):
+    last = None  # result of the most recent benchmark-mode run_graph call
+    for _ in range(GPU_OVERLAP_SLOTS):  # GPU_OVERLAP_SLOTS submissions
+        last = run_graph(overlap_runner.executor, True, modu, inputs,
+                         cur_time, length, None,
+                         overlap_runner=overlap_runner, **kwargs)
+    return last.wait() if last is not None else None  # waits on last only
 
-gpu_inputs = None
-gpu_outputs = None
-def run_graph(executor, benchmode, modu, inputs, cur_time, length, outputs=None, **kwargs):
+def run_graph(executor, benchmode, modu, inputs, cur_time, length, outputs=None,
+              overlap_runner=None, **kwargs):
     if not GPU_MODE:
         return kr.runGraph(executor, modu, inputs, cur_time, length,
                            outputs if outputs is not None else {}, **kwargs)
     if cur_time != 0:
         raise RuntimeError("GPU alpha101 test only supports cur_time=0")
-    global gpu_inputs
-    global gpu_outputs
-    if not benchmode or gpu_inputs is None:
-        gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
-    else:
-        for k, v in inputs.items():
-            gpu_inputs[k].set(v)
-    ret = executor.runGraph(modu, gpu_inputs, outputs=gpu_outputs, cur_time=cur_time,
-                            length=length,
-                            use_cuda_graph=USE_CUDA_GRAPH)
+    kwargs.pop("skip_check", None)
+    kwargs.pop("num_stocks", None)
     if benchmode:
-        gpu_outputs = ret
-        out_np = {}
-        for k, v in ret.items():
-            arr = v if isinstance(v, cp.ndarray) else cp.from_dlpack(v)
-            ret[k] = arr
-            host = cp.asnumpy(arr, blocking=False)
-            out_np[k] = host
-        executor.synchronize()
-        return out_np
+        if overlap_runner is None:
+            raise RuntimeError("GPU benchmark mode requires overlap_runner")
+        return overlap_runner.submit(
+            inputs,
+            cur_time=cur_time,
+            length=length,
+            use_cuda_graph=USE_CUDA_GRAPH,
+            **kwargs,
+        )
+    gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
+    ret = executor.runGraph(modu, gpu_inputs, cur_time=cur_time,
+                            length=length,
+                            use_cuda_graph=USE_CUDA_GRAPH,
+                            **kwargs)
     out_np = {}
     for k, v in ret.items():
         arr = v if isinstance(v, cp.ndarray) else cp.from_dlpack(v)
@@ -396,18 +407,29 @@ def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ische
     
     if not ischeck:
         out = run_graph(executor, False, modu, my_input, start_time,
-                        num_time-start_time, outbuffers)
+                        num_time-start_time, outbuffers,
+                        overlap_runner=None)
+        overlap_runner = None
+        if GPU_MODE:
+            overlap_runner = OverlapRunner(
+                modu, executor, num_slots=GPU_OVERLAP_SLOTS)
+            warmup_gpu_overlap_runner(overlap_runner, modu, my_input, start_time,
+                                      num_time-start_time)
         start = time.time()
         for _ in range(20):
             out = run_graph(executor, True, modu, my_input, start_time,
-                            num_time-start_time, outbuffers)
+                            num_time-start_time, outbuffers,
+                            overlap_runner=overlap_runner)
+        if GPU_MODE:
+            overlap_runner.synchronize()
+            out = out.wait()
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
         out = run_graph(executor, False, modu, my_input, start_time,
                         num_time-start_time, outbuffers,
-                        num_stocks=num_stock)
+                        overlap_runner=None, num_stocks=num_stock)
         end = time.time()
         tdiff = end-start
     print(f"Exec takes: {tdiff:.6f} seconds")
@@ -479,19 +501,29 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
     # blocked = TS_STs(inp)
     if not ischeck:
         out = run_graph(executor, False, modu, my_input, start_time,
-                        num_time-start_time, outbuffers)
+                        num_time-start_time, outbuffers,
+                        overlap_runner=None)
+        overlap_runner = None
+        if GPU_MODE:
+            overlap_runner = OverlapRunner(
+                modu, executor, num_slots=GPU_OVERLAP_SLOTS)
+            warmup_gpu_overlap_runner(overlap_runner, modu, my_input, start_time,
+                                      num_time-start_time)
         start = time.time()
         for _ in range(20):
             out = run_graph(executor, True, modu, my_input, start_time,
-                            num_time-start_time, outbuffers)
+                            num_time-start_time, outbuffers,
+                            overlap_runner=overlap_runner)
         if GPU_MODE:
-            executor.synchronize()
+            overlap_runner.synchronize()
+            out = out.wait()
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
         out = run_graph(executor, False, modu, my_input, start_time,
-                        num_time-start_time, outbuffers)
+                        num_time-start_time, outbuffers,
+                        overlap_runner=None)
         end = time.time()
         tdiff = end-start
     print(f"Exec takes: {tdiff:.6f} seconds")
@@ -550,6 +582,7 @@ def do_compile(avx, keep, tempdir):
                 continue
             kcfg = dataclasses.replace(kcfg, input_layout="TS",
                                        output_layout="TS",
+                                       partition_factor=2,
                                        blocking_len=1)
             gpu_funclist.append((name, f, kcfg))
         ccfg = _cuda_jit.CudaCompilerConfig(gpu_arch=GPU_ARCH)

From 00718c9d9d406306e587722e8d5cc5ae56dd65b7 Mon Sep 17 00:00:00 2001
From: Yijie Mei <yijiem@nvidia.com>
Date: Tue, 26 May 2026 01:56:11 -0700
Subject: [PATCH 59/59] bundle tests

---
 tests/test_alpha101.py | 7 +++++--
 tests/test_alpha158.py | 4 +++-
 tests/test_runtime.py  | 9 ++++++---
 tests/tests_gpu.sh     | 7 +++++++
 4 files changed, 21 insertions(+), 6 deletions(-)
 create mode 100644 tests/tests_gpu.sh

diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index f931c52..0f5c8ff 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -12,7 +12,7 @@
 from KunQuant.Stage import Function
 from KunQuant.predefined.Alpha101 import AllData, all_alpha
 from KunQuant.runner import KunRunner as kr
-from KunQuant.jit.env import cpu_arch
+from KunQuant.jit.env import cpu_arch, get_cuda_compute_capability
 
 isx86 = cpu_arch != "aarch64"
 
@@ -27,7 +27,10 @@
 
 _args, _ = _argp.parse_known_args()
 action = _args.action or ("run_gpu" if _args.gpu_arch else "avx2")
-GPU_ARCH = _args.gpu_arch or ("sm_80" if action == "run_gpu" else "")
+# "auto" resolves via get_cuda_compute_capability(); plain run_gpu -> sm_80.
+GPU_ARCH = _args.gpu_arch or ("sm_80" if action == "run_gpu" else "")
+if GPU_ARCH == "auto":
+    GPU_ARCH = get_cuda_compute_capability()
 GPU_MODE = bool(GPU_ARCH)
 BENCHMODE = _args.benchmode
 USE_CUDA_GRAPH = _args.use_cuda_graph
diff --git a/tests/test_alpha158.py b/tests/test_alpha158.py
index 0a69470..5b6b11f 100644
--- a/tests/test_alpha158.py
+++ b/tests/test_alpha158.py
@@ -10,7 +10,7 @@
 from KunQuant.Op import Builder, Input, Output
 from KunQuant.Stage import Function
 from KunQuant.predefined.Alpha158 import AllData
-from KunQuant.jit.env import cpu_arch
+from KunQuant.jit.env import cpu_arch, get_cuda_compute_capability
 
 isx86 = cpu_arch != "aarch64"
 
@@ -223,6 +223,8 @@ def test(backend, inputs: Dict[str, np.ndarray],
         import cupy as cp
         cp.cuda.Device(0).use()
         cp.zeros((1,), dtype=cp.float64)
+        if args.gpu_arch == "auto":
+            args.gpu_arch = get_cuda_compute_capability()
         lib = check_alpha158(False, False, None, gpu_arch=args.gpu_arch)
         inp, ref = load(args.inputs, args.ref)
         test(_GpuBackend(lib, "alpha158"), inp, ref)
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index f4194d7..f1f5f99 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -14,7 +14,7 @@
 from KunQuant.predefined.Alpha101 import *
 from KunQuant.runner import KunRunner as kr
 import sys
-from KunQuant.jit.env import cpu_arch
+from KunQuant.jit.env import cpu_arch, get_cuda_compute_capability
 
 
 # ── Backend dispatch (CPU vs GPU) ────────────────────────────────────
@@ -29,8 +29,11 @@
 _argp.add_argument("--gpu-arch", default="",
                     help="GPU compute capability (e.g. sm_80).  Empty = CPU.")
 _args, _ = _argp.parse_known_args()
-GPU_MODE = bool(_args.gpu_arch)
-GPU_ARCH = _args.gpu_arch
+if _args.gpu_arch == "auto":
+    GPU_ARCH = get_cuda_compute_capability()
+else:
+    GPU_ARCH = _args.gpu_arch
+GPU_MODE = bool(GPU_ARCH)
 
 if GPU_MODE:
     import cupy as cp
diff --git a/tests/tests_gpu.sh b/tests/tests_gpu.sh
new file mode 100644
index 0000000..4420b4f
--- /dev/null
+++ b/tests/tests_gpu.sh
@@ -0,0 +1,7 @@
+set -e  # stop at the first failing test command
+echo "KunQuant runtime tests"
+python ./tests/test_runtime.py --gpu-arch auto
+echo "KunQuant alpha158 tests"
+python ./tests/test_alpha158.py --inputs ./build/input.npz --ref ./build/alpha158.npz --action run_gpu --gpu-arch auto
+echo "KunQuant alpha101 tests"
+python ./tests/test_alpha101.py --gpu-arch auto