From 18caf5fd579de19af037c6635bed9a0bdaf91c8a Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Thu, 2 Apr 2026 17:01:26 +0800
Subject: [PATCH 01/16] test: add qwen3 scope2 pto kernels

---
 .github/workflows/ci.yml                      |  11 +-
 test/samples/Qwen3Scope2/README.md            |  11 ++
 .../Qwen3Scope2/decode_attention_incore_0.pto |  25 ++++
 .../Qwen3Scope2/decode_attention_incore_1.pto |  58 +++++++++
 .../decode_attention_incore_10.pto            |  30 +++++
 .../decode_attention_incore_11.pto            | 111 ++++++++++++++++++
 .../decode_attention_incore_12.pto            |  28 +++++
 .../Qwen3Scope2/decode_attention_incore_2.pto |  39 ++++++
 .../Qwen3Scope2/decode_attention_incore_3.pto |  26 ++++
 .../Qwen3Scope2/decode_attention_incore_4.pto |  64 ++++++++++
 .../Qwen3Scope2/decode_attention_incore_5.pto |  30 +++++
 .../Qwen3Scope2/decode_attention_incore_6.pto |  18 +++
 .../Qwen3Scope2/decode_attention_incore_7.pto |  30 +++++
 .../Qwen3Scope2/decode_attention_incore_8.pto |  49 ++++++++
 .../Qwen3Scope2/decode_attention_incore_9.pto |  18 +++
 test/samples/runop.sh                         |  11 +-
 16 files changed, 555 insertions(+), 4 deletions(-)
 create mode 100644 test/samples/Qwen3Scope2/README.md
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_0.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_1.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_10.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_11.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_12.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_2.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_3.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_4.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_5.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_6.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_7.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_8.pto
 create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_9.pto

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fb4749ec0..01ab24d35 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,7 +33,7 @@ on:
       skip_cases:
         description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)"
         type: string
-        default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp"
+        default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,decode_attention_incore_0,decode_attention_incore_1,decode_attention_incore_2,decode_attention_incore_3,decode_attention_incore_4,decode_attention_incore_5,decode_attention_incore_6,decode_attention_incore_7,decode_attention_incore_8,decode_attention_incore_9,decode_attention_incore_10,decode_attention_incore_11,decode_attention_incore_12"
       run_only_cases:
         description: "Comma/space separated testcase names to run (empty = run all)"
         type: string
@@ -261,7 +261,14 @@ jobs:
       # Temporary CI gate: skip cases that still error/flap on the remote NPU.
       # Update this list as we fix the underlying issues.
       DEFAULT_SKIP_CASES: >-
-        mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp
+        mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,
+        decode_attention_incore_0,decode_attention_incore_1,
+        decode_attention_incore_2,decode_attention_incore_3,
+        decode_attention_incore_4,decode_attention_incore_5,
+        decode_attention_incore_6,decode_attention_incore_7,
+        decode_attention_incore_8,decode_attention_incore_9,
+        decode_attention_incore_10,decode_attention_incore_11,
+        decode_attention_incore_12
     steps:
       - name: Resolve validation parameters
         shell: bash
diff --git a/test/samples/Qwen3Scope2/README.md b/test/samples/Qwen3Scope2/README.md
new file mode 100644
index 000000000..978e54ad0
--- /dev/null
+++ b/test/samples/Qwen3Scope2/README.md
@@ -0,0 +1,11 @@
+Qwen3 scope2 PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_scope2.py`.
+
+Scope:
+- compile-regression inputs for `ptoas`
+- A5-only kernels; `runop.sh` injects `--pto-arch a5` for this directory unless the caller already overrides `PTOAS_FLAGS`
+
+Notes:
+- The source PyPTO program lowers to 13 kernel-level `.pto` files plus an orchestration C++ file.
+- This sample directory vendors only the kernel `.pto` inputs.
+- No custom `golden.py` or `compare.py` is included in this draft because those are tied to the full orchestration flow, not to individual kernel-only `.pto` files.
+- The existing `test/npu_validation/scripts/generate_testcase.py` flow can still auto-generate generic validation assets for these kernels when needed.
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_0.pto b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto
new file mode 100644
index 000000000..d9df6b9eb
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto
@@ -0,0 +1,25 @@
+module attributes {pto.target_arch = "a5"} {  // Vector kernel: converts eight 1x128 bf16 slices of one row of %arg1 to f32 and writes them to rows 0..7 of %arg0.
+  func.func @decode_attention_incore_0(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64  // addr operand of the bf16 staging tile
+  %c256 = arith.constant 256 : i64  // addr operand of the f32 result tile (presumably a byte offset: 128 bf16 = 256 B -- TODO confirm)
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %k_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // output: %arg0 viewed as 8x128 f32, row stride 128
+  %k_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // input: %arg1 viewed as 16x1024 bf16, row stride 1024
+  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {  // ki = 0..7, one 128-column slice per iteration
+    %1 = arith.muli %ki__idx_v0, %c128 : index  // column offset of slice ki in the input row
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %k_proj__ssa_v0_pview = pto.partition_view %k_proj__ssa_v0_view, offsets = [%arg2, %1], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>  // 1x128 window at row %arg2, column ki*128
+    pto.tload ins(%k_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // bf16 -> f32 with round mode ROUND
+    %k_group__iter_v1_pview = pto.partition_view %k_group__ssa_v0_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>  // row ki of the output view
+    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_1.pto b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto
new file mode 100644
index 000000000..d79076b4f
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto
@@ -0,0 +1,58 @@
+module attributes {pto.target_arch = "a5"} {  // Vector kernel: combines the two 8x64 column halves of %arg2 with four 1x64 cos/sin tiles and writes both result halves to %arg3.
+  func.func @decode_attention_incore_1(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64  // the i64 constants below are used only as alloc_tile addr operands
+  %c256 = arith.constant 256 : i64
+  %c512 = arith.constant 512 : i64
+  %c768 = arith.constant 768 : i64
+  %c1024 = arith.constant 1024 : i64
+  %c3072 = arith.constant 3072 : i64
+  %c5120 = arith.constant 5120 : i64
+  %c7168 = arith.constant 7168 : i64
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg0: 1x64 f32
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg1: 1x64 f32
+  %k_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2: 8x128 f32 input
+  %k_rot_tensor__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg3: 8x128 f32 output
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg4: 1x64 f32
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg5: 1x64 f32
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %k_group__rv_v2_pview = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // columns 0..63 of the input
+  pto.tload ins(%k_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %k_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %3 = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // columns 64..127 of the input
+  pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // t = k_lo * cos_lo (presumably the 1x64 tile is broadcast over the 8 rows -- TODO confirm)
+  %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %0 = k_hi * sin_lo
+  %rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // rot_lo = t - %0; output tile has the same addr (%c5120) as %t__tile
+  %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %1 = k_hi * cos_hi; output tile has the same addr (%c3072) as %k_hi__tile
+  %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %2 = k_lo * sin_hi; output tile has the same addr (%c1024) as %k_lo__tile
+  %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // rot_hi = %1 + %2 (NOTE(review): the lo/hi pair looks like a rotary-embedding rotation -- confirm against the generator)
+  %k_rot_tensor__ssa_v0_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // columns 0..63 of the output
+  pto.tstore ins(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__ssa_v0_pview : !pto.partition_tensor_view<8x64xf32>)
+  %k_rot_tensor__tile_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // columns 64..127 of the output
+  pto.tstore ins(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__tile_pview : !pto.partition_tensor_view<8x64xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_10.pto b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto
new file mode 100644
index 000000000..142c570b8
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} {  // Cube kernel: multiplies a 16x64 bf16 tile from %arg0 by a 64x128 bf16 tile from %arg1 (rows starting at %arg3) and stores the 16x128 f32 result to %arg2.
+  func.func @decode_attention_incore_10(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  %c0i = arith.constant 0 : i64  // addr operand shared by tiles with different loc= values (presumably separate address spaces -- TODO confirm)
+  %c16384 = arith.constant 16384 : i64  // addr operand of the 16x64 mat tile
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c524288 = arith.constant 524288 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg0: 16x64 bf16 left operand
+  %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg1: 524288x128 bf16
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2: 16x128 f32 output
+  %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>  // 64 rows starting at row %arg3
+  pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+  pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // mat -> left tile
+  %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%v_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // mat -> right tile; blayout/slayout differ between source and destination types
+  %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // (16x64 bf16) x (64x128 bf16) -> 16x128 f32 acc tile
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+  pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>)  // acc tile stored to the full 16x128 output view
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_11.pto b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto
new file mode 100644
index 000000000..17eae5c2b
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto
@@ -0,0 +1,111 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_11(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c32 = arith.constant 32 : i64
+  %c64 = arith.constant 64 : i64
+  %c96 = arith.constant 96 : i64
+  %c128 = arith.constant 128 : i64
+  %c4224 = arith.constant 4224 : i64
+  %c8320 = arith.constant 8320 : i64
+  %c8352 = arith.constant 8352 : i64
+  %c8384 = arith.constant 8384 : i64
+  %c8416 = arith.constant 8416 : i64
+  %c8448 = arith.constant 8448 : i64
+  %c8 = arith.constant 8 : index
+  %c1 = arith.constant 1 : index
+  %7 = arith.constant 128 : index
+  %c16 = arith.constant 16 : index
+  %c0 = arith.constant 0 : index
+  %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %8 = arith.cmpi eq, %arg6, %c0 : index
+  %li__phi_v5, %mi__phi_v5, %oi__phi_v5 = scf.if %8 -> (!pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>) {
+    %oi__ssa_v3 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %mi__ssa_v3 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%9 : !pto.partition_tensor_view<8x1xf32>)
+    %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%10 : !pto.partition_tensor_view<8x1xf32>)
+    %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+    pto.tstore ins(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%11 : !pto.partition_tensor_view<8x128xf32>)
+    scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>
+  } else {
+    %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi_new__tile = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v5 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %alpha__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v10 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v10 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%beta__row_major_tmp_v12 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %beta__tile = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v15 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v15 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v18 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v18 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__row_major_tmp_v21 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%li__row_major_tmp_v21 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %3 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%4, %5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi__ssa_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %13 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%13 : !pto.partition_tensor_view<8x1xf32>)
+    %15 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%mi_new__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%15 : !pto.partition_tensor_view<8x1xf32>)
+    %17 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%17 : !pto.partition_tensor_view<8x128xf32>)
+    scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_12.pto b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto
new file mode 100644
index 000000000..124078522
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto
@@ -0,0 +1,28 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_12(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // Loads li (8x1) and oi (8x128), applies pto.trowexpanddiv to them, and stores the 1024 result values into row 0 of the %arg0 view starting at column %arg3 * 128.
+  %c0i = arith.constant 0 : i64 // i64 constants are used only as alloc_tile addresses
+  %c32 = arith.constant 32 : i64
+  %c1 = arith.constant 1 : index // index constants are used for shapes, strides, offsets and sizes
+  %c8192 = arith.constant 8192 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c1024 = arith.constant 1024 : index
+  %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg0 viewed as 1 x 8192 f32 (output)
+  %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32> // %arg1 viewed as 8 x 1 f32 with strides [1, 8]
+  %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg2 viewed as 8 x 128 f32
+  %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0> // 8 x 1 f32 tile at address 0; the next tile starts at 32, which matches 8 * 4 bytes if addresses are byte offsets (TODO confirm)
+  %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 8 x 128 f32 tile at address 32
+  %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address and type as %oi__tile, so the op below reads and writes the same buffer
+  pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // NOTE(review): presumably divides each row of oi by that row's li value - confirm against the pto.trowexpanddiv definition
+  %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0> // address 32 again, retyped as 1 x 1024 (8 * 128 elements) for the store
+  %0 = arith.muli %arg3, %c128 : index // column offset into the 1 x 8192 output row
+  %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x1024xf32> // row 0, columns [%0, %0 + 1024)
+  pto.tstore ins(%ctx_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_2.pto b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto
new file mode 100644
index 000000000..5419f419a
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto
@@ -0,0 +1,39 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_2(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // For each ki in [0, 8): converts row ki of the %arg1 view from f32 to bf16 and stores it to row %6 of the %arg0 view, then copies a 1 x 128 bf16 slice of the %arg3 view to row %6 of the %arg2 view.
+  %c0i = arith.constant 0 : i64 // i64 constants are used only as alloc_tile addresses
+  %c512 = arith.constant 512 : i64
+  %c524288 = arith.constant 524288 : index // index constants are used for shapes, strides, offsets, sizes and loop bounds
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c16 = arith.constant 16 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c4096 = arith.constant 4096 : index
+  %k_cache__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg0 viewed as 524288 x 128 bf16 (written)
+  %k_rot_tensor__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg1 viewed as 8 x 128 f32 (read)
+  %v_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg2 viewed as 524288 x 128 bf16 (written)
+  %v_proj__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg3 viewed as 16 x 1024 bf16 (read)
+  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {
+    %2 = arith.muli %arg4, %c8 : index
+    %3 = arith.muli %2, %c4096 : index
+    %4 = arith.muli %ki__idx_v0, %c4096 : index
+    %5 = arith.addi %3, %4 : index
+    %6 = arith.addi %5, %arg5 : index // %6 = (%arg4 * 8 + ki) * 4096 + %arg5: destination row in both cache views
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 1 x 128 f32 tile at address 0
+    %k_rot_tensor__ssa_v2_pview = pto.partition_view %k_rot_tensor__ssa_v2_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32> // row ki of the %arg1 view
+    pto.tload ins(%k_rot_tensor__ssa_v2_pview : !pto.partition_tensor_view<1x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 1 x 128 bf16 tile at address 512
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 -> bf16 with round mode ROUND
+    %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
+    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)
+    %1 = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address and type as %0: the buffer is reused after the store above
+    %7 = arith.muli %ki__idx_v0, %c128 : index // column offset ki * 128 into the %arg3 view
+    %v_proj__ssa_v0_pview = pto.partition_view %v_proj__ssa_v0_view, offsets = [%arg4, %7], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16> // row %arg4, columns [%7, %7 + 128)
+    pto.tload ins(%v_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16> // same row %6 as the k_cache store
+    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_3.pto b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto
new file mode 100644
index 000000000..143c98a4b
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto
@@ -0,0 +1,26 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_3(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // For each qi in [0, 8): loads a 1 x 128 bf16 slice of the %arg1 view, converts it to f32, and stores it as row qi of the %arg0 view.
+  %c0i = arith.constant 0 : i64 // i64 constants are used only as alloc_tile addresses
+  %c256 = arith.constant 256 : i64
+  %c8 = arith.constant 8 : index // index constants are used for shapes, strides, offsets, sizes and loop bounds
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c0 = arith.constant 0 : index
+  %q_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg0 viewed as 8 x 128 f32 (written)
+  %q_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg1 viewed as 16 x 8192 bf16 (read)
+  scf.for %qi__idx_v0 = %c0 to %c8 step %c1 {
+    %1 = arith.addi %arg3, %qi__idx_v0 : index
+    %2 = arith.muli %1, %c128 : index // %2 = (%arg3 + qi) * 128: source column offset
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 1 x 128 bf16 tile at address 0
+    %q_proj__ssa_v0_pview = pto.partition_view %q_proj__ssa_v0_view, offsets = [%arg2, %2], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16> // row %arg2, columns [%2, %2 + 128)
+    pto.tload ins(%q_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 1 x 128 f32 tile at address 256
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // bf16 -> f32 with round mode ROUND
+    %q_group__iter_v1_pview = pto.partition_view %q_group__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32> // row qi of the %arg0 view
+    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_4.pto b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto
new file mode 100644
index 000000000..9de52a73c
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto
@@ -0,0 +1,64 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_4(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // Combines the low and high 64-column halves of the 8 x 128 %arg2 view with four 1 x 64 cos/sin rows (lo' = q_lo*cos_lo - q_hi*sin_lo, hi' = q_hi*cos_hi + q_lo*sin_hi, assuming tcolexpandmul multiplies every row by the 1 x 64 operand - TODO confirm), converts both halves to bf16 and stores them into the 8 x 128 %arg3 view.
+  %c0i = arith.constant 0 : i64 // i64 constants are used only as alloc_tile addresses
+  %c256 = arith.constant 256 : i64
+  %c512 = arith.constant 512 : i64
+  %c768 = arith.constant 768 : i64
+  %c1024 = arith.constant 1024 : i64
+  %c3072 = arith.constant 3072 : i64
+  %c5120 = arith.constant 5120 : i64
+  %c7168 = arith.constant 7168 : i64
+  %c9216 = arith.constant 9216 : i64
+  %c10240 = arith.constant 10240 : i64
+  %c1 = arith.constant 1 : index // index constants are used for shapes, strides, offsets and sizes
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg0 viewed as 1 x 64 f32
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg1 viewed as 1 x 64 f32
+  %q_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg2 viewed as 8 x 128 f32 (input)
+  %q_rot_bf16__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg3 viewed as 8 x 128 bf16 (output)
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg4 viewed as 1 x 64 f32
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg5 viewed as 1 x 64 f32
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // the four 1 x 64 f32 coefficient tiles sit at addresses 0, 256, 512 and 768
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 8 x 64 f32 tile at address 1024
+  %q_group__rv_v2_pview = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32> // columns [0, 64) of the input
+  pto.tload ins(%q_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%q_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %q_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 8 x 64 f32 tile at address 3072
+  %3 = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32> // columns [64, 128) of the input
+  pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%q_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // scratch tile at address 5120
+  pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %t__tile <- q_lo (x) cos_lo
+  %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // scratch tile at address 7168
+  pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %0 <- q_hi (x) sin_lo
+  %q_rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %t__tile: the subtraction below overwrites its first operand
+  pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %q_hi__tile; no later op reads %q_hi__tile after the multiply below
+  pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %1 <- q_hi (x) cos_hi
+  %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %q_lo__tile; no later op reads %q_lo__tile after the multiply below
+  pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %2 <- q_lo (x) sin_hi
+  %q_rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %2: the addition below overwrites its second operand
+  pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %q_rot_lo_bf16__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 8 x 64 bf16 tile at address 9216
+  pto.tcvt ins(%q_rot_lo__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 -> bf16 with round mode ROUND
+  %q_rot_hi_bf16__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 8 x 64 bf16 tile at address 10240
+  pto.tcvt ins(%q_rot_hi__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 -> bf16 with round mode ROUND
+  %q_rot_bf16__ssa_v0_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16> // columns [0, 64) of the output
+  pto.tstore ins(%q_rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
+  %q_rot_bf16__tile_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16> // columns [64, 128) of the output
+  pto.tstore ins(%q_rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_bf16__tile_pview : !pto.partition_tensor_view<8x64xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_5.pto b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto
new file mode 100644
index 000000000..28ad1932e
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_5(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c4096 = arith.constant 4096 : i64
+  %c4128 = arith.constant 4128 : i64
+  %c8 = arith.constant 8 : index
+  %c1 = arith.constant 1 : index
+  %c128 = arith.constant 128 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %ret0__out_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret1__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret2__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %oi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %li_flat__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %li__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %mi_flat__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mi__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tstore ins(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_6.pto b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto
new file mode 100644
index 000000000..26e9555c4
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto
@@ -0,0 +1,18 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_6(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c16 = arith.constant 16 : index
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %q_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %q_rot_bf16__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %q_bf16_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %q_rot_bf16__ssa_v2_pview = pto.partition_view %q_rot_bf16__ssa_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x128xbf16>
+  pto.tload ins(%q_rot_bf16__ssa_v2_pview : !pto.partition_tensor_view<8x128xbf16>) outs(%q_bf16_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %q_padded__ssa_v0_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x128xbf16>
+  pto.tstore ins(%q_bf16_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__ssa_v0_pview : !pto.partition_tensor_view<8x128xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_7.pto b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto
new file mode 100644
index 000000000..282f797e5
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_7(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  %c0i = arith.constant 0 : i64
+  %c16384 = arith.constant 16384 : i64
+  %c524288 = arith.constant 524288 : index
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xbf16>
+  %q_padded__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+  pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %q_padded__ssa_v1_pview = pto.partition_view %q_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+  pto.tload ins(%q_padded__ssa_v1_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+  %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_8.pto b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto
new file mode 100644
index 000000000..f968b1627
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto
@@ -0,0 +1,49 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_8(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c2048 = arith.constant 2048 : i64
+  %c4096 = arith.constant 4096 : i64
+  %c8192 = arith.constant 8192 : i64
+  %c8224 = arith.constant 8224 : i64
+  %c9248 = arith.constant 9248 : i64
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 8.838835e-02 : f32
+  %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret1__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret2__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+  pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%scores_padded__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, f32) outs(%scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tcvt ins(%exp_scores__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_fp32__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
+  pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x64xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_9.pto b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto
new file mode 100644
index 000000000..0c16cfc61
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto
@@ -0,0 +1,18 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_9(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %exp_scores_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %exp_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %exp_scores_bf16__ssa_v0_pview = pto.partition_view %exp_scores_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
+  pto.tload ins(%exp_scores_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) outs(%exp_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
+  pto.tstore ins(%exp_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
+  return
+  }
+}
diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index a57efc8a6..a3839b029 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -19,7 +19,7 @@ PYTHON_BIN="${PYTHON_BIN:-}"
 PTOAS_OUT_DIR="${PTOAS_OUT_DIR:-}"
 PTOAS_ENABLE_INSERT_SYNC="${PTOAS_ENABLE_INSERT_SYNC:-1}"
 PTOAS_FLAGS="${PTOAS_FLAGS:-}"
-PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync}"
+PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3Scope2}"
 ENABLE_BC=0
 
 usage() {
@@ -36,7 +36,7 @@ Env:
   PTOAS_OUT_DIR  # where generated *.mlir/*.cpp go (optional; defaults to a temp dir)
   PTOAS_FLAGS  # extra flags passed to ptoas (e.g. --enable-insert-sync)
   PTOAS_ENABLE_INSERT_SYNC  # 1 to append --enable-insert-sync to PTOAS_FLAGS (default: 1)
-  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync)
+  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync Qwen3Scope2)
 
 Flags:
   --enablebc  # enable: python -> .pto -> ptobc -> .pto -> ptoas
@@ -172,15 +172,22 @@ process_one_dir() {
   fi
 
   local target_arch="a3"
+  local has_pto_arch_override=0
   if ((${#ptoas_flags[@]})); then
     for ((idx=0; idx<${#ptoas_flags[@]}; ++idx)); do
       if [[ "${ptoas_flags[idx]}" == "--pto-arch" && $((idx + 1)) -lt ${#ptoas_flags[@]} ]]; then
         target_arch="${ptoas_flags[idx + 1]}"
+        has_pto_arch_override=1
       elif [[ "${ptoas_flags[idx]}" == --pto-arch=* ]]; then
         target_arch="${ptoas_flags[idx]#--pto-arch=}"
+        has_pto_arch_override=1
       fi
     done
   fi
+  if [[ "$A" == "Qwen3Scope2" && $has_pto_arch_override -eq 0 ]]; then
+    ptoas_flags+=(--pto-arch a5 --pto-level=level3)
+    target_arch="a5"
+  fi
   local expected_vec_barrier="pipe_barrier(PIPE_V)"
   local skip_vec_barrier=0
   if [[ "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" == "a5" ]]; then

From 45dbf6790cf89059981cfd5330eee588addaf539 Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Thu, 2 Apr 2026 19:27:17 +0800
Subject: [PATCH 02/16] fix(test): skip ptobc roundtrip for qwen scope2

---
 test/samples/runop.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index a3839b029..b8c02ff00 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -153,6 +153,12 @@ process_one_dir() {
   if [[ "${ENABLE_BC}" == "1" ]]; then
     use_ptobc_roundtrip=1
   fi
+  # Qwen3 scope2 kernels currently serve as direct ptoas compile-regression
+  # coverage. They require A5/level3 lowering, but are not expected to
+  # roundtrip through ptobc yet.
+  if [[ "$A" == "Qwen3Scope2" ]]; then
+    use_ptobc_roundtrip=0
+  fi
   local -a ptoas_flags=()
   if [[ -n "${PTOAS_FLAGS}" ]]; then
     # shellcheck disable=SC2206
@@ -910,7 +916,6 @@ PY
       if [[ "$base" == "test_if_else_tile_result" ]]; then
         sample_use_ptobc_roundtrip=0
       fi
-
       if [[ $sample_use_ptobc_roundtrip -eq 1 ]]; then
         # Allow generic escape for ops that are not yet in the compact v0 opcode table.
         if ! PTOBC_ALLOW_GENERIC=1 "$ptobc" encode "$f" -o "$ptobc_file" >/dev/null 2>&1; then

From 14b4c236f3ff74e824d4ec7a5b1c06df909b8533 Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 09:57:11 +0800
Subject: [PATCH 03/16] test: switch qwen PTO samples to tilelet

---
 test/samples/Qwen3Scope2/README.md            |  11 --
 .../Qwen3Scope2/decode_attention_incore_0.pto |  25 ---
 .../Qwen3Scope2/decode_attention_incore_1.pto |  58 -------
 .../decode_attention_incore_10.pto            |  30 ----
 .../decode_attention_incore_11.pto            | 111 -------------
 .../decode_attention_incore_12.pto            |  28 ----
 .../Qwen3Scope2/decode_attention_incore_2.pto |  39 -----
 .../Qwen3Scope2/decode_attention_incore_3.pto |  26 ---
 .../Qwen3Scope2/decode_attention_incore_4.pto |  64 --------
 .../Qwen3Scope2/decode_attention_incore_5.pto |  30 ----
 .../Qwen3Scope2/decode_attention_incore_6.pto |  18 ---
 .../Qwen3Scope2/decode_attention_incore_7.pto |  30 ----
 .../Qwen3Scope2/decode_attention_incore_8.pto |  49 ------
 .../Qwen3Scope2/decode_attention_incore_9.pto |  18 ---
 test/samples/Qwen3Tilelet/README.md           |  13 ++
 .../qwen3_decode_layer_incore_1.pto           | 116 ++++++++++++++
 .../qwen3_decode_layer_incore_10.pto          | 108 +++++++++++++
 .../qwen3_decode_layer_incore_13.pto          | 116 ++++++++++++++
 .../qwen3_decode_layer_incore_14.pto          |  73 +++++++++
 .../qwen3_decode_layer_incore_2.pto           | 148 ++++++++++++++++++
 test/samples/runop.sh                         |  17 +-
 21 files changed, 586 insertions(+), 542 deletions(-)
 delete mode 100644 test/samples/Qwen3Scope2/README.md
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_0.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_1.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_10.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_11.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_12.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_2.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_3.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_4.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_5.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_6.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_7.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_8.pto
 delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_9.pto
 create mode 100644 test/samples/Qwen3Tilelet/README.md
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto

diff --git a/test/samples/Qwen3Scope2/README.md b/test/samples/Qwen3Scope2/README.md
deleted file mode 100644
index 978e54ad0..000000000
--- a/test/samples/Qwen3Scope2/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-Qwen3 scope2 PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_scope2.py`.
-
-Scope:
-- compile-regression inputs for `ptoas`
-- A5-only kernels; `runop.sh` injects `--pto-arch a5` for this directory unless the caller already overrides `PTOAS_FLAGS`
-
-Notes:
-- The source PyPTO program lowers to 13 kernel-level `.pto` files plus an orchestration C++ file.
-- This sample directory vendors only the kernel `.pto` inputs.
-- No custom `golden.py` or `compare.py` is included in this draft because those are tied to the full orchestration flow, not to individual kernel-only `.pto` files.
-- The existing `test/npu_validation/scripts/generate_testcase.py` flow can still auto-generate generic validation assets for these kernels when needed.
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_0.pto b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto
deleted file mode 100644
index d9df6b9eb..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_0.pto
+++ /dev/null
@@ -1,25 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_0(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c256 = arith.constant 256 : i64
-  %c8 = arith.constant 8 : index
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c16 = arith.constant 16 : index
-  %c1024 = arith.constant 1024 : index
-  %c0 = arith.constant 0 : index
-  %k_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %k_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {
-    %1 = arith.muli %ki__idx_v0, %c128 : index
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %k_proj__ssa_v0_pview = pto.partition_view %k_proj__ssa_v0_view, offsets = [%arg2, %1], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
-    pto.tload ins(%k_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %k_group__iter_v1_pview = pto.partition_view %k_group__ssa_v0_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_1.pto b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto
deleted file mode 100644
index d79076b4f..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_1.pto
+++ /dev/null
@@ -1,58 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_1(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c256 = arith.constant 256 : i64
-  %c512 = arith.constant 512 : i64
-  %c768 = arith.constant 768 : i64
-  %c1024 = arith.constant 1024 : i64
-  %c3072 = arith.constant 3072 : i64
-  %c5120 = arith.constant 5120 : i64
-  %c7168 = arith.constant 7168 : i64
-  %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c8 = arith.constant 8 : index
-  %c128 = arith.constant 128 : index
-  %c0 = arith.constant 0 : index
-  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %k_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %k_rot_tensor__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %k_group__rv_v2_pview = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tload ins(%k_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %k_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %3 = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %k_rot_tensor__ssa_v0_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tstore ins(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__ssa_v0_pview : !pto.partition_tensor_view<8x64xf32>)
-  %k_rot_tensor__tile_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tstore ins(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__tile_pview : !pto.partition_tensor_view<8x64xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_10.pto b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto
deleted file mode 100644
index 142c570b8..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_10.pto
+++ /dev/null
@@ -1,30 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_10(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c16384 = arith.constant 16384 : i64
-  %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
-  %c1 = arith.constant 1 : index
-  %c524288 = arith.constant 524288 : index
-  %c128 = arith.constant 128 : index
-  %c0 = arith.constant 0 : index
-  %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
-  pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
-  pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  pto.tmov ins(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%v_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-  pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-  pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_11.pto b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto
deleted file mode 100644
index 17eae5c2b..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_11.pto
+++ /dev/null
@@ -1,111 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_11(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c32 = arith.constant 32 : i64
-  %c64 = arith.constant 64 : i64
-  %c96 = arith.constant 96 : i64
-  %c128 = arith.constant 128 : i64
-  %c4224 = arith.constant 4224 : i64
-  %c8320 = arith.constant 8320 : i64
-  %c8352 = arith.constant 8352 : i64
-  %c8384 = arith.constant 8384 : i64
-  %c8416 = arith.constant 8416 : i64
-  %c8448 = arith.constant 8448 : i64
-  %c8 = arith.constant 8 : index
-  %c1 = arith.constant 1 : index
-  %7 = arith.constant 128 : index
-  %c16 = arith.constant 16 : index
-  %c0 = arith.constant 0 : index
-  %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %8 = arith.cmpi eq, %arg6, %c0 : index
-  %li__phi_v5, %mi__phi_v5, %oi__phi_v5 = scf.if %8 -> (!pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>) {
-    %oi__ssa_v3 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %mi__ssa_v3 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-    pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%9 : !pto.partition_tensor_view<8x1xf32>)
-    %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-    pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%10 : !pto.partition_tensor_view<8x1xf32>)
-    %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-    pto.tstore ins(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%11 : !pto.partition_tensor_view<8x128xf32>)
-    scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>
-  } else {
-    %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %mi_new__tile = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v5 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %alpha__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v10 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v10 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%beta__row_major_tmp_v12 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %beta__tile = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v15 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v15 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v18 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v18 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %li__row_major_tmp_v21 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%li__row_major_tmp_v21 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %3 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%4, %5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %mi__ssa_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %13 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%13 : !pto.partition_tensor_view<8x1xf32>)
-    %15 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-    pto.tstore ins(%mi_new__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%15 : !pto.partition_tensor_view<8x1xf32>)
-    %17 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%17 : !pto.partition_tensor_view<8x128xf32>)
-    scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_12.pto b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto
deleted file mode 100644
index 124078522..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_12.pto
+++ /dev/null
@@ -1,28 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_12(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c32 = arith.constant 32 : i64
-  %c1 = arith.constant 1 : index
-  %c8192 = arith.constant 8192 : index
-  %c8 = arith.constant 8 : index
-  %c128 = arith.constant 128 : index
-  %c0 = arith.constant 0 : index
-  %c1024 = arith.constant 1024 : index
-  %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %0 = arith.muli %arg3, %c128 : index
-  %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x1024xf32>
-  pto.tstore ins(%ctx_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_2.pto b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto
deleted file mode 100644
index 5419f419a..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_2.pto
+++ /dev/null
@@ -1,39 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_2(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c512 = arith.constant 512 : i64
-  %c524288 = arith.constant 524288 : index
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c8 = arith.constant 8 : index
-  %c16 = arith.constant 16 : index
-  %c1024 = arith.constant 1024 : index
-  %c0 = arith.constant 0 : index
-  %c4096 = arith.constant 4096 : index
-  %k_cache__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %k_rot_tensor__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %v_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %v_proj__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {
-    %2 = arith.muli %arg4, %c8 : index
-    %3 = arith.muli %2, %c4096 : index
-    %4 = arith.muli %ki__idx_v0, %c4096 : index
-    %5 = arith.addi %3, %4 : index
-    %6 = arith.addi %5, %arg5 : index
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %k_rot_tensor__ssa_v2_pview = pto.partition_view %k_rot_tensor__ssa_v2_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-    pto.tload ins(%k_rot_tensor__ssa_v2_pview : !pto.partition_tensor_view<1x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
-    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)
-    %1 = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %7 = arith.muli %ki__idx_v0, %c128 : index
-    %v_proj__ssa_v0_pview = pto.partition_view %v_proj__ssa_v0_view, offsets = [%arg4, %7], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
-    pto.tload ins(%v_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
-    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_3.pto b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto
deleted file mode 100644
index 143c98a4b..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_3.pto
+++ /dev/null
@@ -1,26 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_3(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c256 = arith.constant 256 : i64
-  %c8 = arith.constant 8 : index
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c0 = arith.constant 0 : index
-  %q_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %q_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  scf.for %qi__idx_v0 = %c0 to %c8 step %c1 {
-    %1 = arith.addi %arg3, %qi__idx_v0 : index
-    %2 = arith.muli %1, %c128 : index
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %q_proj__ssa_v0_pview = pto.partition_view %q_proj__ssa_v0_view, offsets = [%arg2, %2], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
-    pto.tload ins(%q_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %q_group__iter_v1_pview = pto.partition_view %q_group__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_4.pto b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto
deleted file mode 100644
index 9de52a73c..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_4.pto
+++ /dev/null
@@ -1,64 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_4(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c256 = arith.constant 256 : i64
-  %c512 = arith.constant 512 : i64
-  %c768 = arith.constant 768 : i64
-  %c1024 = arith.constant 1024 : i64
-  %c3072 = arith.constant 3072 : i64
-  %c5120 = arith.constant 5120 : i64
-  %c7168 = arith.constant 7168 : i64
-  %c9216 = arith.constant 9216 : i64
-  %c10240 = arith.constant 10240 : i64
-  %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c8 = arith.constant 8 : index
-  %c128 = arith.constant 128 : index
-  %c0 = arith.constant 0 : index
-  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %q_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %q_rot_bf16__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %q_group__rv_v2_pview = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tload ins(%q_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%q_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %3 = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%q_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_rot_lo_bf16__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcvt ins(%q_rot_lo__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_rot_hi_bf16__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcvt ins(%q_rot_hi__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_rot_bf16__ssa_v0_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
-  pto.tstore ins(%q_rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
-  %q_rot_bf16__tile_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
-  pto.tstore ins(%q_rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_bf16__tile_pview : !pto.partition_tensor_view<8x64xbf16>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_5.pto b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto
deleted file mode 100644
index 28ad1932e..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_5.pto
+++ /dev/null
@@ -1,30 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_5(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c4128 = arith.constant 4128 : i64
-  %c8 = arith.constant 8 : index
-  %c1 = arith.constant 1 : index
-  %c128 = arith.constant 128 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %ret0__out_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret1__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret2__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %oi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %li_flat__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %li__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %mi_flat__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %mi__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tstore ins(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_6.pto b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto
deleted file mode 100644
index 26e9555c4..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_6.pto
+++ /dev/null
@@ -1,18 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_6(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c16 = arith.constant 16 : index
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c8 = arith.constant 8 : index
-  %c0 = arith.constant 0 : index
-  %q_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %q_rot_bf16__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %q_bf16_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %q_rot_bf16__ssa_v2_pview = pto.partition_view %q_rot_bf16__ssa_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x128xbf16>
-  pto.tload ins(%q_rot_bf16__ssa_v2_pview : !pto.partition_tensor_view<8x128xbf16>) outs(%q_bf16_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_padded__ssa_v0_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x128xbf16>
-  pto.tstore ins(%q_bf16_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__ssa_v0_pview : !pto.partition_tensor_view<8x128xbf16>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_7.pto b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto
deleted file mode 100644
index 282f797e5..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_7.pto
+++ /dev/null
@@ -1,30 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_7(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c16384 = arith.constant 16384 : i64
-  %c524288 = arith.constant 524288 : index
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
-  %c0 = arith.constant 0 : index
-  %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xbf16>
-  %q_padded__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-  pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %q_padded__ssa_v1_pview = pto.partition_view %q_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-  pto.tload ins(%q_padded__ssa_v1_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  pto.tmov ins(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-  pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_8.pto b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto
deleted file mode 100644
index f968b1627..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_8.pto
+++ /dev/null
@@ -1,49 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_8(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c2048 = arith.constant 2048 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c8192 = arith.constant 8192 : i64
-  %c8224 = arith.constant 8224 : i64
-  %c9248 = arith.constant 9248 : i64
-  %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
-  %c1 = arith.constant 1 : index
-  %c8 = arith.constant 8 : index
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 8.838835e-02 : f32
-  %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret1__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret2__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%scores_padded__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, f32) outs(%scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tcvt ins(%exp_scores__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_fp32__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
-  pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x64xbf16>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_9.pto b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto
deleted file mode 100644
index 0c16cfc61..000000000
--- a/test/samples/Qwen3Scope2/decode_attention_incore_9.pto
+++ /dev/null
@@ -1,18 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @decode_attention_incore_9(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
-  %c1 = arith.constant 1 : index
-  %c8 = arith.constant 8 : index
-  %c0 = arith.constant 0 : index
-  %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %exp_scores_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %exp_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %exp_scores_bf16__ssa_v0_pview = pto.partition_view %exp_scores_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
-  pto.tload ins(%exp_scores_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) outs(%exp_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
-  pto.tstore ins(%exp_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md
new file mode 100644
index 000000000..34e3a51bf
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/README.md
@@ -0,0 +1,13 @@
+Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_tilelet.py`.
+
+Scope:
+- compile-regression inputs for `ptoas`
+- A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS`
+
+Notes:
+- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs:
+  `qwen3_decode_layer_incore_1`, `qwen3_decode_layer_incore_2`,
+  `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`,
+  `qwen3_decode_layer_incore_14`.
+- This sample directory vendors only those direct `ptoas` regression inputs.
+- No custom `golden.py` or `compare.py` is included here: these grouped mixed kernels depend on orchestration-managed peer buffers and loop-carried context, so per-kernel numerical validation is not a drop-in replacement for the full PyPTO runtime flow.
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
new file mode 100644
index 000000000..77b0b5c33
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
@@ -0,0 +1,116 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_1_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // Cube-kind half of incore_1: each inner step loads a 128x64 bf16 chunk of the %arg4 view, pops a 4x128 bf16 tile from the vector peer, runs tmatmul, and pushes the 4x64 f32 result back to the peer.
+  %c4096 = arith.constant 4096 : i64  // addr of the mat-space weight-chunk tile; same value as the size of the v2c slot buffer reserved at base 0 below
+  %c0i = arith.constant 0 : i64  // addr 0, shared by the left, right and acc tiles (different `loc` values)
+  %c16 = arith.constant 16 : index
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0 = arith.constant 0 : index
+    %c64 = arith.constant 64 : index  // column-block width used for the %arg4 partition
+    %c40 = arith.constant 40 : index  // trip count of the inner loop
+      %c128 = arith.constant 128 : index  // rows per weight chunk
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // the only tensor view this function reads from; the four views above are not referenced again here
+  %qwen3_decode_layer_incore_1_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", size = 4096, location = #pto.address_space<mat>, auto = false, base = 0} -> i32
+  %qwen3_decode_layer_incore_1_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aiv} -> i32
+  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer : i32)
+  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {  // 4 column blocks per call
+    %0 = arith.muli %arg6, %c4 : index  // %arg6 selects a group of 4 column blocks
+    %1 = arith.addi %0, %ob__ci_idx_v0 : index  // global column-block index
+    %2 = arith.muli %1, %c1 : index  // multiply by 1: value unchanged
+    %3 = arith.addi %c0, %2 : index  // add 0: value unchanged
+    %4 = arith.muli %3, %c64 : index  // column offset into the %arg4 view = (%arg6 * 4 + ob) * 64
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // 40 chunks * 128 rows = the 5120 rows of the %arg4 view
+      %5 = arith.muli %kb__idx_v0, %c128 : index  // row offset of this chunk
+      %wq_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+      pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wq_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %t__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %t__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%t__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%t__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tfree_from_aiv {split = 0}  // issued only after the tmov above has copied the popped tile into the left buffer
+      %wq_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%wq_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wq_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%t__tile_Left, %wq_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}  // per-chunk product goes to the vector peer; no accumulation across kb happens in this function
+    }
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_1_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // Vector-kind half of incore_1: scales 4x128 chunks of the %arg0 view by the %arg2 and %arg1 tiles, pushes them as bf16 to the cube peer, sums the returned 4x64 f32 tiles, and stores the bf16 result into the %arg3 view.
+  %c4096 = arith.constant 4096 : i64  // addr of the inv_rms tile; same value as the size of the c2v slot buffer reserved at base 0 below
+  %c4128 = arith.constant 4128 : i64  // addr of the accumulator; %q_acc__tile, %0 and %q_acc__tile_mv are all allocated here
+  %c5152 = arith.constant 5152 : i64  // addr of the bf16 4x128 tile (load target and later tcvt output)
+  %c6176 = arith.constant 6176 : i64  // addr of the f32 4x128 working tile (%x_chunk__tile, %1, %normed__tile)
+  %c8224 = arith.constant 8224 : i64  // addr of the 1x128 gamma tile
+  %c8736 = arith.constant 8736 : i64  // addr of the tile pushed to the cube peer
+  %c10784 = arith.constant 10784 : i64  // addr of the tadd result tile
+  %c11808 = arith.constant 11808 : i64  // addr of the bf16 4x64 output tile
+  %c16 = arith.constant 16 : index
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0 = arith.constant 0 : index
+    %c64 = arith.constant 64 : index  // column-block width of the output partition
+    %cst = arith.constant 0.000000e+00 : f32  // scalar multiplier used to zero the accumulator
+    %c40 = arith.constant 40 : index  // trip count of the inner loop
+      %c128 = arith.constant 128 : index  // columns per input chunk
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %qwen3_decode_layer_incore_1_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aic} -> i32
+  %qwen3_decode_layer_incore_1_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", size = 4096, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
+  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer_import : i32)
+  %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x1xf32>
+  pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {  // 4 output column blocks per call
+    %5 = arith.muli %arg6, %c4 : index  // %arg6 selects a group of 4 column blocks
+    %6 = arith.addi %5, %ob__ci_idx_v0 : index  // global column-block index
+    %7 = arith.muli %6, %c1 : index  // multiply by 1: value unchanged
+    %8 = arith.addi %c0, %7 : index  // add 0: value unchanged
+    %9 = arith.muli %8, %c64 : index  // column offset into the %arg3 view = (%arg6 * 4 + ob) * 64
+    %q_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmuls ins(%q_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // multiplies the accumulator tile by 0.0 in place. NOTE(review): nothing in this function writes %q_acc__tile before this op; under IEEE rules x * 0.0 is NaN for NaN/Inf inputs -- confirm this zero-init is safe on the target.
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // 40 chunks * 128 columns = the 5120 columns of the %arg0 view
+      %10 = arith.muli %kb__idx_v0, %c128 : index  // column offset of this chunk
+      %t__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg5, %10], sizes = [%c4, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x128xbf16>
+      pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %x_chunk__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %gamma__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %10], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+      pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %1 = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %normed__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcolexpandmul ins(%1, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %2 = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %t__tile_nz = pto.alloc_tile addr = %c8736 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tpush_to_aic(%t__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
+      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %3 = pto.alloc_tile addr = %c10784 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tfree_from_aic {split = 0}  // issued only after the tadd above has consumed the popped tile
+      %q_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // copies the sum back to addr %c4128, the address %0 was allocated at, so the next tadd reads the running total through %0
+    }
+    %4 = pto.alloc_tile addr = %c11808 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%0{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %q_proj__co_l1_iter_v3_pview = pto.partition_view %q_proj__co_l0_iter_v3_view, offsets = [%arg5, %9], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
+    pto.tstore ins(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>)  // writes the 4x64 block at row %arg5, column %9 of the %arg3 view
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
new file mode 100644
index 000000000..636b81393
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
@@ -0,0 +1,108 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_10_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // Cube-kind half of incore_10: each inner step pops a 4x128 bf16 tile from the vector side, multiplies it by a 128x64 block of the %arg3 view and pushes the 4x64 f32 product back.
+  %c4096 = arith.constant 4096 : i64  // i64 constants are used as alloc_tile addresses
+  %c0i = arith.constant 0 : i64
+  %c16 = arith.constant 16 : index  // index constants: view shapes/strides, loop bounds and offsets
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // not referenced again in this function
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // not referenced again in this function
+  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // the only view this function loads from
+  %qwen3_decode_layer_incore_10_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", size = 4096, location = #pto.address_space<mat>, auto = false, base = 0} -> i32  // reserved in the mat space at base 0; the mat tile below starts at 4096 (presumably just past this buffer - TODO confirm size units)
+  %qwen3_decode_layer_incore_10_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aiv} -> i32  // buffer reserved by the peer function @qwen3_decode_layer_incore_10_aiv
+  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer : i32)  // both consumer buffers are supplied; dir_mask = 3 presumably enables both directions - TODO confirm
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {  // 8 iterations; %4 below is the column offset into the %arg3 view
+    %0 = arith.muli %arg5, %c8 : index
+    %1 = arith.addi %0, %ob__ci_idx_v0 : index  // %arg5 * 8 + induction variable
+    %2 = arith.muli %1, %c1 : index  // multiply by 1: value unchanged
+    %3 = arith.addi %c0, %2 : index  // add 0: value unchanged
+    %4 = arith.muli %3, %c64 : index  // (%arg5 * 8 + iv) * 64
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // 40 iterations of 128 rows each (40 * 128 = 5120)
+      %5 = arith.muli %kb__idx_v0, %c128 : index  // row offset into the %arg3 view
+      %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+      pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %a_chunk__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%a_chunk__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%a_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tfree_from_aiv {split = 0}  // issued only after the popped tile has been moved into the left tile
+      %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}  // one push per inner iteration; nothing is accumulated in this function
+    }
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_10_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // Vector-kind half of incore_10: converts 4x128 chunks of the %arg0 view to bf16 and pushes them to the cube side, sums the 4x64 f32 tiles popped back, adds a tile of the %arg1 view and stores the f32 result to the %arg2 view.
+  %c4096 = arith.constant 4096 : i64  // i64 constants are used as alloc_tile addresses
+  %c5120 = arith.constant 5120 : i64  // i64 address; the index-typed 5120 is %4 below
+  %c7168 = arith.constant 7168 : i64
+  %c8192 = arith.constant 8192 : i64
+  %c10240 = arith.constant 10240 : i64
+  %c11264 = arith.constant 11264 : i64
+  %c9216 = arith.constant 9216 : i64
+  %c16 = arith.constant 16 : index
+  %4 = arith.constant 5120 : index  // index-typed 5120 for view shapes/strides; the name %c5120 is already the i64 address above
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %cst = arith.constant 0.000000e+00 : f32  // scalar operand of the tmuls below
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c4, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%4, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %qwen3_decode_layer_incore_10_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aic} -> i32  // buffer reserved by the peer function @qwen3_decode_layer_incore_10_aic
+  %qwen3_decode_layer_incore_10_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", size = 4096, location = #pto.address_space<vec>, auto = false, base = 0} -> i32  // reserved in the vec space at base 0; the vec tiles below start at 4096 (presumably just past this buffer - TODO confirm size units)
+  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer_import : i32)  // both consumer buffers are supplied; dir_mask = 3 presumably enables both directions - TODO confirm
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {  // 8 iterations; %9 below is the column offset used by the load and store after the inner loop
+    %5 = arith.muli %arg5, %c8 : index
+    %6 = arith.addi %5, %ob__ci_idx_v0 : index  // %arg5 * 8 + induction variable
+    %7 = arith.muli %6, %c1 : index  // multiply by 1: value unchanged
+    %8 = arith.addi %c0, %7 : index  // add 0: value unchanged
+    %9 = arith.muli %8, %c64 : index  // (%arg5 * 8 + iv) * 64
+    %o_acc__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmuls ins(%o_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // 40 iterations; the tmuls above scales the tile at %c4096 by %cst = 0.0 in place, presumably to zero the accumulator first
+      %10 = arith.muli %kb__idx_v0, %c128 : index  // column offset into the %arg0 view
+      %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %attn_out__rv_v2_pview = pto.partition_view %attn_out__rv_v2_view, offsets = [%arg4, %10], sizes = [%c4, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x128xf32>
+      pto.tload ins(%attn_out__rv_v2_pview : !pto.partition_tensor_view<4x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %a_chunk__tile = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%a_chunk__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %a_chunk__tile_nz = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%a_chunk__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%a_chunk__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tpush_to_aic(%a_chunk__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
+      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %1 = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tfree_from_aic {split = 0}  // issued only after the popped tile has been consumed by the tadd above
+      %o_acc__tile_mv = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tmov ins(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    }  // the tmov above writes the sum to %c4096, the address %0 was allocated at, so later reads of %0 see the running sum
+    %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %9], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
+    pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %resid__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%2{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %3 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%0, %resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %resid1_tile__co_l1_iter_v1_pview = pto.partition_view %resid1_tile__co_l0_iter_v1_view, offsets = [%c0, %9], sizes = [%c4, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x64xf32>
+    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid1_tile__co_l1_iter_v1_pview : !pto.partition_tensor_view<4x64xf32>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
new file mode 100644
index 000000000..6eedee90d
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
@@ -0,0 +1,116 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_13_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // Cube-kind half of incore_13: for each of 40 steps, multiplies a 4x128 tile of the %arg1 view by 128x64 blocks of the %arg3 and %arg4 views and pushes both 4x64 f32 products to the vector side.
+  %c0i = arith.constant 0 : i64  // i64 constants are used as alloc_tile addresses
+  %c1024 = arith.constant 1024 : i64
+  %c17408 = arith.constant 17408 : i64
+  %c4 = arith.constant 4 : index  // index constants: view shapes/strides, loop bounds and offsets
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c5120 = arith.constant 5120 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32  // passed as v2c_consumer_buf to aic_initialize_pipe below
+  %c0 = arith.constant 0 : index
+  %c40 = arith.constant 40 : index
+    %c128 = arith.constant 128 : index
+  %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // not referenced again in this function
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // not referenced again in this function
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %qwen3_decode_layer_incore_13_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_13_aiv} -> i32  // buffer reserved by the peer function @qwen3_decode_layer_incore_13_aiv
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)  // only the c2v buffer is real (v2c is the constant 0); dir_mask = 1 presumably means cube-to-vector only - TODO confirm
+  scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // 40 iterations of 128 each (40 * 128 = 5120)
+    %1 = arith.muli %kb__idx_v0, %c128 : index  // column offset into the %arg1 view and row offset into both weight views
+    %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c4, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x128xbf16>
+    pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wu__tile = pto.alloc_tile addr = %c17408 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%post_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wg__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}  // first push of the iteration: product with the %arg3 block
+    %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wu__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}  // second push of the iteration: product with the %arg4 block; the right and acc tiles reuse address 0 from the first matmul
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_13_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // Vector-kind half of incore_13: sums the two streams of 4x64 f32 tiles popped from the cube side into the tiles at %c8192 and %c9216, then computes %0 * recip(1 + exp(-%0)) * %1 and stores it as bf16 to the %arg5 view.
+  %c8192 = arith.constant 8192 : i64  // i64 constants are used as alloc_tile addresses
+  %c9216 = arith.constant 9216 : i64
+  %c11264 = arith.constant 11264 : i64
+  %c12288 = arith.constant 12288 : i64
+  %c10240 = arith.constant 10240 : i64
+  %c13312 = arith.constant 13312 : i64
+  %c4 = arith.constant 4 : index  // index constants: view shapes/strides, loop bounds and offsets
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c5120 = arith.constant 5120 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32  // passed as v2c_consumer_buf to aiv_initialize_pipe below
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32  // scalar operand of the two tmuls below
+  %c40 = arith.constant 40 : index
+  %cst_1 = arith.constant 1.000000e+00 : f32  // scalar operand of the tadds below
+  %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_13_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", size = 8192, location = #pto.address_space<vec>, auto = false, base = 0} -> i32  // reserved in the vec space at base 0; the vec tiles below start at 8192 (presumably just past this buffer - TODO confirm size units)
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)  // only the c2v buffer is real (v2c is the constant 0); dir_mask = 1 presumably means cube-to-vector only - TODO confirm
+  %gate_acc__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %gate_acc__ssa_v0_pview = pto.partition_view %gate_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x64xf32>
+  pto.tload ins(%gate_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %up_acc__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %up_acc__ssa_v0_pview = pto.partition_view %up_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x64xf32>
+  pto.tload ins(%up_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmuls ins(%gate_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %1 = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmuls ins(%up_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // 40 iterations; the induction variable is not used in the body. The two tmuls above scale the loaded tiles by %cst = 0.0 in place, presumably to zero both accumulators
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}  // first pop of the iteration released after it was added to %0
+    %3 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %4 = pto.alloc_tile addr = %c12288 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%1, %3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}  // second pop of the iteration released after it was added to %1
+    %gate_acc__tile_mv = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gate_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %up_acc__tile_mv = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%up_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  }  // the two tmovs write the sums back to %c8192 and %c9216, the addresses %0 and %1 were allocated at, so later reads of %0/%1 see the running sums
+  %t__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tneg ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %5 = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %6 = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadds ins(%5, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sigmoid__tile = pto.alloc_tile addr = %c11264 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.trecip ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %7 = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmul ins(%0, %sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmul ins(%7, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c13312 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcvt ins(%mlp_chunk__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
+  pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<4x64xbf16>)
+  return  // the chain after the loop (tneg, texp, tadds 1.0, trecip, tmul, tmul, tcvt, tstore) runs once, outside the loop
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
new file mode 100644
index 000000000..725a529d2
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
@@ -0,0 +1,73 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // cube-kind half of incore_14: per iteration, one 4x64 by 64x128 bf16 matmul whose f32 result is pushed to the peer @qwen3_decode_layer_incore_14_aiv
+  %c0i = arith.constant 0 : i64  // i64 constants are only used as alloc_tile addresses
+  %c16384 = arith.constant 16384 : i64  // = 64*128*2 bytes, i.e. the size of the bf16 64x128 tile placed at address 0
+  %c4 = arith.constant 4 : index  // index constants: view shapes/strides, loop bounds and offsets
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32  // passed as v2c_consumer_buf in the pipe initialization below
+  %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 4x5120 f32 view of %arg0; not referenced again in this function
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 4x64 bf16 view of %arg1 (matmul left operand source)
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 25600x5120 bf16 view of %arg2 (matmul right operand source)
+  %qwen3_decode_layer_incore_14_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_14_aiv} -> i32  // same buffer name that the _aiv function reserves
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)  // slot_size 2048 = 4*128*4 bytes, the size of the f32 4x128 tile pushed below; presumably dir_mask = 1 enables only the c2v direction - TODO confirm
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {  // 4 iterations, induction variable 0..3
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index
+    %2 = arith.muli %1, %c1 : index
+    %3 = arith.addi %c0, %2 : index
+    %4 = arith.muli %3, %c128 : index  // %4 = (%arg3 * 4 + iv) * 128: column offset of this iteration's 128-wide chunk
+    %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>  // 64x128 window starting at row %arg4, column %4
+    pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // placed directly after the 16384-byte weight tile in loc=mat
+    %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
+    pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // offsets are loop-invariant: the same 4x64 window is loaded every iteration
+    %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // address 0 again, but in loc=left rather than loc=mat
+    pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>  // address 0 in loc=right; note blayout/slayout differ from the mat tile it is moved from
+    pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_down_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // address 0 in loc=acc; f32 4x128 result tile
+    pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // (4x64 bf16) x (64x128 bf16) -> 4x128 f32
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}  // one push per iteration; the _aiv function has one matching tpop_from_aic per iteration
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector-kind half of incore_14: adds each 4x128 f32 tile popped from the _aic peer onto the matching window of %arg0, in place
+  %c16384 = arith.constant 16384 : i64  // tile address; equals the size of the slot buffer reserved at base 0 below, so tiles start right after it
+  %c4 = arith.constant 4 : index  // index constants: view shapes/strides, loop bounds and offsets
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32  // passed as v2c_consumer_buf in the pipe initialization below
+  %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 4x5120 f32 view of %arg0; both read and written by the loop
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 16384, location = #pto.address_space<vec>, auto = false, base = 0} -> i32  // 16384 bytes at base 0 of the vec space = 8 * slot_size; the _aic function imports this buffer by name
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)  // same dir_mask and slot_size as the aic_initialize_pipe in the _aic function
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {  // 4 iterations, induction variable 0..3
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index
+    %2 = arith.muli %1, %c1 : index
+    %3 = arith.addi %c0, %2 : index
+    %4 = arith.muli %3, %c128 : index  // %4 = (%arg3 * 4 + iv) * 128: same column-offset formula as in the _aic function
+    %down_prev__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x128xf32>  // 4x128 window at row 0, column %4
+    pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<4x128xf32>) outs(%down_prev__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // pairs with the tpush_to_aiv in the _aic loop
+    %down_next__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address and type as %down_prev__tile, so the add below writes over its own first input
+    pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}  // issued only after the last use of %t__tile_Vec; presumably releases the popped slot - TODO confirm
+    %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x128xf32>  // same offsets and sizes as the window loaded above
+    pto.tstore ins(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.partition_tensor_view<4x128xf32>)  // completes the load / add / store on the %arg0 window
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
new file mode 100644
index 000000000..dc6456847
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
@@ -0,0 +1,148 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_2_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: !pto.ptr<bf16>, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // cube-kind half of incore_2: per inner iteration, pops one 4x128 bf16 tile from the _aiv peer and pushes back two 4x64 f32 products (against wk, then wv)
+  %c4096 = arith.constant 4096 : i64  // tile address; equals the size of the v2c slot buffer reserved at base 0 of loc=mat below
+  %c20480 = arith.constant 20480 : i64  // = 4096 + 128*64*2, i.e. directly after the bf16 128x64 tile at 4096
+  %c0i = arith.constant 0 : i64  // address 0, used for the left/right/acc tiles
+  %c16 = arith.constant 16 : index  // index constants: view shapes/strides, loop bounds and offsets
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // of the seven views built here, only %wk__ssa_v0_view and %wv__ssa_v0_view are referenced later in this function
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 5120x1024 bf16 view of %arg5
+  %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 5120x1024 bf16 view of %arg6
+  %qwen3_decode_layer_incore_2_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", size = 4096, location = #pto.address_space<mat>, auto = false, base = 0} -> i32  // 4096 bytes at base 0 of the mat space = 4 * slot_size; imported by name in the _aiv function
+  %qwen3_decode_layer_incore_2_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aiv} -> i32  // same buffer name that the _aiv function reserves
+  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer : i32)  // slot_size 1024 = 4*128*2 bytes (bf16 4x128 popped) = 4*64*4 bytes (f32 4x64 pushed); presumably dir_mask = 3 enables both directions - TODO confirm
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {  // outer loop: 8 iterations, one 64-wide output column chunk each
+    %1 = arith.muli %arg8, %c8 : index
+    %2 = arith.addi %1, %ob__ci_idx_v0 : index
+    %3 = arith.muli %2, %c1 : index
+    %4 = arith.addi %c0, %3 : index
+    %5 = arith.muli %4, %c64 : index  // %5 = (%arg8 * 8 + outer iv) * 64: column offset into the weight views
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // inner loop: 40 iterations of 128 rows each (40 * 128 = 5120, the row count of both weight views)
+      %6 = arith.muli %kb__idx_v0, %c128 : index  // row offset of this 128-row chunk
+      %wk_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+      pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wk_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %wv_chunk__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>  // same offsets as the wk window, taken from the wv view
+      pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wv_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %normed_bf16__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // pairs with the single tpush_to_aic per inner iteration in the _aiv function
+      %normed_bf16__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%normed_bf16__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%normed_bf16__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tfree_from_aiv {split = 0}  // issued once the popped tile has been moved into loc=left; the left tile is then used by both matmuls
+      %wk_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%wk_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wk_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%normed_bf16__tile_Left, %wk_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // (4x128 bf16) x (128x64 bf16 from wk) -> 4x64 f32
+      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}  // first push of the iteration: the wk product
+      %wv_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>  // same loc=right address 0 as %wk_chunk__tile_Right, reused for the wv chunk
+      pto.tmov ins(%wv_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wv_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // same loc=acc address 0 as %t__tile, reused for the wv product
+      pto.tmatmul ins(%normed_bf16__tile_Left, %wv_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // (4x128 bf16) x (128x64 bf16 from wv) -> 4x64 f32
+      pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}  // second push of the iteration: the wv product; the _aiv side pops in the same k-then-v order
+    }
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_2_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: !pto.ptr<bf16>, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector-kind half of incore_2: scales 4x128 chunks of %arg0 by %arg2 (per row) and %arg1 (per column), pushes them to the _aic peer, accumulates the two returned 4x64 products and stores them as bf16 into %arg3 / %arg4
+  %c4096 = arith.constant 4096 : i64  // i64 constants are vec tile addresses; 4096 is the first byte after the c2v slot buffer reserved at base 0 below
+  %c4128 = arith.constant 4128 : i64  // first accumulator (f32 4x64 = 1024 bytes)
+  %c5152 = arith.constant 5152 : i64  // = 4128 + 1024: second accumulator
+  %c6176 = arith.constant 6176 : i64  // = 5152 + 1024: bf16 4x128 tile (1024 bytes), used for both the loaded chunk and its normalized bf16 result
+  %c7200 = arith.constant 7200 : i64  // = 6176 + 1024: f32 4x128 working tile (2048 bytes)
+  %c9248 = arith.constant 9248 : i64  // = 7200 + 2048: f32 1x128 tile (512 bytes)
+  %c9760 = arith.constant 9760 : i64  // = 9248 + 512: bf16 4x128 tile in the col_major/row_major layout that is pushed to the _aic side
+  %c11808 = arith.constant 11808 : i64  // temporary for the first accumulator's sum
+  %c12832 = arith.constant 12832 : i64  // = 11808 + 1024: temporary for the second accumulator's sum
+  %c13856 = arith.constant 13856 : i64  // = 12832 + 1024: bf16 4x64 staging tile for both final stores
+  %c16 = arith.constant 16 : index  // index constants: view shapes/strides, loop bounds and offsets
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %cst = arith.constant 0.000000e+00 : f32  // scalar operand of the two tmuls ops at the top of the outer loop
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16x5120 bf16 view of %arg0 (input)
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 1x5120 f32 view of %arg1
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>  // 4x1 f32 view of %arg2; the only view here with layout<dn> and strides [1, 4]
+  %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16x1024 bf16 view of %arg3 (output)
+  %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16x1024 bf16 view of %arg4 (output)
+  %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // not referenced again in this function
+  %qwen3_decode_layer_incore_2_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aic} -> i32  // same buffer name that the _aic function reserves
+  %qwen3_decode_layer_incore_2_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", size = 4096, location = #pto.address_space<vec>, auto = false, base = 0} -> i32  // 4096 bytes at base 0 of the vec space = 4 * slot_size
+  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer_import : i32)  // same dir_mask and slot_size as the aic_initialize_pipe in the _aic function
+  %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x1xf32>
+  pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)  // loaded once, before both loops, and reused by every trowexpandmul
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {  // outer loop: 8 iterations, one 64-wide output column chunk each
+    %8 = arith.muli %arg8, %c8 : index
+    %9 = arith.addi %8, %ob__ci_idx_v0 : index
+    %10 = arith.muli %9, %c1 : index
+    %11 = arith.addi %c0, %10 : index
+    %12 = arith.muli %11, %c64 : index  // %12 = (%arg8 * 8 + outer iv) * 64: column offset of the output chunk, same formula as in the _aic function
+    %k_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %v_acc__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address and type as %k_acc__tile
+    pto.tmuls ins(%k_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // multiplies the accumulator by 0.0 in place, presumably to zero it; NOTE(review): nothing visible writes address 4128 before the first outer iteration, and 0.0 * NaN/Inf is not 0 - confirm this is safe
+    %1 = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address and type as %v_acc__tile
+    pto.tmuls ins(%v_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // same in-place multiply by 0.0 for the second accumulator; the same review note applies to address 5152
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {  // inner loop: 40 iterations of 128 columns each (40 * 128 = 5120, the column count of the %arg0 view)
+      %13 = arith.muli %kb__idx_v0, %c128 : index  // column offset of this 128-wide chunk
+      %t__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg7, %13], sizes = [%c4, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x128xbf16>  // 4x128 window starting at row %arg7, column %13
+      pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %x_chunk__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // bf16 -> f32
+      %gamma__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>  // 1x128 window at the same column offset as the input chunk
+      pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %2 = pto.alloc_tile addr = %c7200 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address as %x_chunk__tile: the next op works in place
+      pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // 4x128 tile combined with the 4x1 tile; presumably each row is multiplied by its own scalar - TODO confirm
+      %normed__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // address 7200 a third time: still in place
+      pto.tcolexpandmul ins(%2, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // 4x128 tile combined with the 1x128 tile; presumably each column is multiplied by its own scalar - TODO confirm
+      %normed_bf16__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // reuses the address of %t__tile, whose contents were already converted into the f32 tile
+      pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16
+      %normed_bf16__tile_nz = pto.alloc_tile addr = %c9760 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%normed_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_bf16__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // source and destination differ only in blayout/slayout: a layout change within loc=vec
+      pto.tpush_to_aic(%normed_bf16__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}  // pairs with the tpop_from_aiv in the _aic inner loop
+      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // first pop: the _aic side pushes the wk product first
+      %3 = pto.alloc_tile addr = %c11808 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // sum goes to the temporary at 11808, not back into %0
+      pto.tfree_from_aic {split = 0}
+      %4 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // second pop: the wv product
+      %5 = pto.alloc_tile addr = %c12832 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%1, %4 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // sum goes to the temporary at 12832, not back into %1
+      pto.tfree_from_aic {split = 0}
+      %k_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address as %0, so the move below updates what %0 reads on the next iteration and after the loop
+      pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %v_acc__tile_mv = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address as %1; carries the second accumulator the same way
+      pto.tmov ins(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    }
+    %6 = pto.alloc_tile addr = %c13856 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%0{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // reads %0 (address 4128), which holds the accumulated sum only via the address shared with %k_acc__tile_mv
+    %k_proj__co_l1_iter_v3_pview = pto.partition_view %k_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>  // 4x64 window at row %arg7, column %12 of the %arg3 view
+    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>)
+    %7 = pto.alloc_tile addr = %c13856 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same staging address as %6, reused after the first store has been issued
+    pto.tcvt ins(%1{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // reads %1 (address 5152), the second accumulator
+    %v_proj__co_l1_iter_v3_pview = pto.partition_view %v_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>  // same offsets, taken from the %arg4 view
+    pto.tstore ins(%7 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>)
+  }
+  return
+  }
+}
diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index b8c02ff00..6be43d10e 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -19,7 +19,7 @@ PYTHON_BIN="${PYTHON_BIN:-}"
 PTOAS_OUT_DIR="${PTOAS_OUT_DIR:-}"
 PTOAS_ENABLE_INSERT_SYNC="${PTOAS_ENABLE_INSERT_SYNC:-1}"
 PTOAS_FLAGS="${PTOAS_FLAGS:-}"
-PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3Scope2}"
+PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3Tilelet}"
 ENABLE_BC=0
 
 usage() {
@@ -36,7 +36,7 @@ Env:
   PTOAS_OUT_DIR  # where generated *.mlir/*.cpp go (optional; defaults to a temp dir)
   PTOAS_FLAGS  # extra flags passed to ptoas (e.g. --enable-insert-sync)
   PTOAS_ENABLE_INSERT_SYNC  # 1 to append --enable-insert-sync to PTOAS_FLAGS (default: 1)
-  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync Qwen3Scope2)
+  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync Qwen3Tilelet)
 
 Flags:
   --enablebc  # enable: python -> .pto -> ptobc -> .pto -> ptoas
@@ -153,10 +153,10 @@ process_one_dir() {
   if [[ "${ENABLE_BC}" == "1" ]]; then
     use_ptobc_roundtrip=1
   fi
-  # Qwen3 scope2 kernels currently serve as direct ptoas compile-regression
+  # Qwen3 tilelet kernels currently serve as direct ptoas compile-regression
   # coverage. They require A5/level3 lowering, but are not expected to
   # roundtrip through ptobc yet.
-  if [[ "$A" == "Qwen3Scope2" ]]; then
+  if [[ "$A" == "Qwen3Tilelet" ]]; then
     use_ptobc_roundtrip=0
   fi
   local -a ptoas_flags=()
@@ -190,7 +190,7 @@ process_one_dir() {
       fi
     done
   fi
-  if [[ "$A" == "Qwen3Scope2" && $has_pto_arch_override -eq 0 ]]; then
+  if [[ "$A" == "Qwen3Tilelet" && $has_pto_arch_override -eq 0 ]]; then
     ptoas_flags+=(--pto-arch a5 --pto-level=level3)
     target_arch="a5"
   fi
@@ -907,6 +907,13 @@ PY
       ptobc_file="${out_subdir}/${base}.ptobc"
       decoded_pto="${out_subdir}/${base}-roundtrip.pto"
       cpp="${out_subdir}/${base}.cpp"
+      if [[ "$A" == "Qwen3Tilelet" ]]; then
+        cpp="${out_subdir}/${base}-pto.cpp"  # Qwen3Tilelet outputs are named <base>-pto.cpp
+        if [[ "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then
+          echo -e "${A}(${base}.pto)\tSKIP\trequires --pto-arch=a5"  # arch is compared case-insensitively
+          continue
+        fi
+      fi
       local sample_use_ptobc_roundtrip="$use_ptobc_roundtrip"
 
       # TODO(ptobc): decode of this regression currently fails with

From bf764cfc5e5eb8fd9320b3ca53d0a795ddd0ae31 Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 10:47:06 +0800
Subject: [PATCH 04/16] test: add Qwen3Tilelet board-validation goldens

---
 .github/workflows/ci.yml                      |  13 +-
 .gitignore                                    |   4 +
 .../scripts/generate_testcase.py              | 449 +++++++++++++-----
 test/samples/Qwen3Tilelet/README.md           |   3 +-
 .../qwen3_decode_layer_incore_1/golden.py     |  71 +++
 .../qwen3_decode_layer_incore_10/golden.py    |  68 +++
 .../qwen3_decode_layer_incore_13/golden.py    |  70 +++
 .../qwen3_decode_layer_incore_14/golden.py    |  60 +++
 .../qwen3_decode_layer_incore_2/golden.py     |  81 ++++
 test/samples/validation_runtime.py            |  37 ++
 10 files changed, 736 insertions(+), 120 deletions(-)
 create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py
 create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py
 create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py
 create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py
 create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 01ab24d35..25bf71e2c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,7 +33,7 @@ on:
       skip_cases:
         description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)"
         type: string
-        default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,decode_attention_incore_0,decode_attention_incore_1,decode_attention_incore_2,decode_attention_incore_3,decode_attention_incore_4,decode_attention_incore_5,decode_attention_incore_6,decode_attention_incore_7,decode_attention_incore_8,decode_attention_incore_9,decode_attention_incore_10,decode_attention_incore_11,decode_attention_incore_12"
+        default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp"
       run_only_cases:
         description: "Comma/space separated testcase names to run (empty = run all)"
         type: string
@@ -261,14 +261,7 @@ jobs:
       # Temporary CI gate: skip cases that still error/flap on the remote NPU.
       # Update this list as we fix the underlying issues.
       DEFAULT_SKIP_CASES: >-
-        mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,
-        decode_attention_incore_0,decode_attention_incore_1,
-        decode_attention_incore_2,decode_attention_incore_3,
-        decode_attention_incore_4,decode_attention_incore_5,
-        decode_attention_incore_6,decode_attention_incore_7,
-        decode_attention_incore_8,decode_attention_incore_9,
-        decode_attention_incore_10,decode_attention_incore_11,
-        decode_attention_incore_12
+        mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp
     steps:
       - name: Resolve validation parameters
         shell: bash
@@ -300,7 +293,7 @@ jobs:
           # suite (RUN_ONLY_CASES is empty), skip the non-matching variant based
           # on SOC_VERSION to keep the remote validation portable.
           A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync"
-          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5"
+          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,qwen3_decode_layer_incore_1,qwen3_decode_layer_incore_2,qwen3_decode_layer_incore_10,qwen3_decode_layer_incore_13,qwen3_decode_layer_incore_14"
 
           sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')"
           is_a5=0
diff --git a/.gitignore b/.gitignore
index 44c61b02a..093b87116 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,6 +64,10 @@ dist/
 /remote_npu_validation_results*.tsv
 /npu_validation/
 test/samples/**/npu_validation/
+!test/samples/Qwen3Tilelet/npu_validation/
+!test/samples/Qwen3Tilelet/npu_validation/**/
+!test/samples/Qwen3Tilelet/npu_validation/**/golden.py
+!test/samples/Qwen3Tilelet/npu_validation/**/compare.py
 /tmp_gen*
 
 # IDE/editor
diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py
index ca802b567..f1e3aef77 100644
--- a/test/npu_validation/scripts/generate_testcase.py
+++ b/test/npu_validation/scripts/generate_testcase.py
@@ -85,6 +85,28 @@
     "xor",
 })
 
+CASE_INT_SCALAR_DEFAULTS = {
+    "qwen3_decode_layer_incore_13": {
+        "v7": 64,
+    },
+    "qwen3_decode_layer_incore_14": {
+        "v4": 1,
+        "v5": 64,
+    },
+}
+
+CASE_POINTER_COUNT_MINIMUMS = {
+    "qwen3_decode_layer_incore_13": {
+        "v2": 20480,
+        "v4": 131046528,
+        "v5": 131046528,
+    },
+    "qwen3_decode_layer_incore_14": {
+        "v1": 16384,
+        "v3": 651264,
+    },
+}
+
 
 def _parse_shape(text: str):
     match = re.search(r"Shape<(\d+)\s*,\s*(\d+)>", text)
@@ -96,6 +118,114 @@ def _parse_shape(text: str):
     return 32, 32
 
 
+def _split_params_blob(params_blob: str):
+    """Split a C++ parameter list on the commas that sit outside any "<...>" pair."""
+    blob = params_blob.strip()
+    if not blob:
+        return []
+    pieces = []
+    angle_depth = piece_start = 0
+    for pos, char in enumerate(blob):
+        if char == "," and angle_depth == 0:
+            pieces.append(blob[piece_start:pos].strip())
+            piece_start = pos + 1
+        elif char == "<":
+            angle_depth += 1
+        elif char == ">":
+            angle_depth = max(angle_depth - 1, 0)  # a stray ">" never drives the depth negative
+    tail = blob[piece_start:].strip()
+    if tail:
+        pieces.append(tail)
+    return pieces
+
+
+def _find_matching_brace(text: str, open_brace_index: int) -> Optional[int]:
+    """Scan from open_brace_index; return the index of the "}" that brings the depth back to 0, else None."""
+    balance = 0  # counts every brace character, including any inside strings or comments
+    for pos in range(open_brace_index, len(text)):
+        if text[pos] == "{":
+            balance += 1
+        elif text[pos] == "}":
+            balance -= 1
+            if balance == 0:
+                return pos
+    return None
+
+
+def _extract_aicore_functions(text: str):  # collect every `[__global__] AICORE void name(params) {...}` definition
+    pattern = re.compile(
+        r"(?P<global>__global__\s+)?AICORE\s+void\s+(?P<name>\w+)\s*\((?P<params>[^)]*)\)\s*\{",
+        re.S,
+    )
+    functions = []
+    for match in pattern.finditer(text):
+        brace_index = text.find("{", match.end("params"))  # the "{" that opens the function body
+        if brace_index < 0:
+            continue
+        end_index = _find_matching_brace(text, brace_index)
+        if end_index is None:
+            continue  # unbalanced body: skip this definition
+        params_blob = match.group("params").strip()
+        functions.append(
+            {
+                "name": match.group("name"),
+                "params_blob": params_blob,
+                "raw_params": _split_params_blob(params_blob),
+                "is_global": bool(match.group("global")),  # True when the definition is prefixed with __global__
+                "text": text[match.start():end_index + 1],  # from the start of the signature through the closing brace
+            }
+        )
+    return functions
+
+
+def _describe_kernel_source(text: str):  # classify the source as a "global", "mixed" or "fallback" kernel
+    functions = _extract_aicore_functions(text)
+    for func in functions:
+        if func["is_global"]:  # the first __global__ entry point wins
+            return {
+                "kind": "global",
+                "kernel_name": func["name"],
+                "raw_params": func["raw_params"],
+                "analysis_texts": [func["text"]],
+                "writer_texts": [func["text"]],
+                "call_text": func["text"],
+            }
+
+    mixed_groups = {}  # base name -> {"aic": func, "aiv": func}
+    for func in functions:
+        name = func["name"]
+        for suffix in ("_aic", "_aiv"):
+            if not name.endswith(suffix):
+                continue
+            base = name[: -len(suffix)]
+            group = mixed_groups.setdefault(base, {})
+            group[suffix[1:]] = func  # key is the suffix without its leading underscore
+            break
+
+    for base, group in mixed_groups.items():
+        if "aic" in group and "aiv" in group:  # only a complete _aic/_aiv pair counts as mixed
+            params = group["aiv"]["raw_params"] or group["aic"]["raw_params"]  # prefer the _aiv signature
+            return {
+                "kind": "mixed",
+                "kernel_name": base,
+                "raw_params": params,
+                "analysis_texts": [group["aic"]["text"], group["aiv"]["text"]],
+                "writer_texts": [group["aiv"]["text"]],  # only the _aiv body is listed as a writer
+                "aic_name": group["aic"]["name"],
+                "aiv_name": group["aiv"]["name"],
+                "call_text": group["aiv"]["text"],
+            }
+
+    return {  # no entry point recognised: describe the whole file as one kernel named "kernel"
+        "kind": "fallback",
+        "kernel_name": "kernel",
+        "raw_params": [],
+        "analysis_texts": [text],
+        "writer_texts": [text],
+        "call_text": text,
+    }
+
+
 def _is_gm_pointer_param(param: str) -> bool:
     return "__gm__" in param and "*" in param
 
@@ -136,6 +266,44 @@ def _strip_param_name(raw: str, name: str) -> str:
     return stripped.strip()
 
 
+def _strip_enclosing_parens(expr: str) -> str:  # peel "(...)" pairs that wrap the entire expression
+    expr = expr.strip()
+    while expr.startswith("(") and expr.endswith(")"):
+        depth = 0
+        ok = True
+        for idx, ch in enumerate(expr):
+            if ch == "(":
+                depth += 1
+            elif ch == ")":
+                depth -= 1
+                if depth == 0 and idx != len(expr) - 1:
+                    ok = False  # the first "(" closes early, e.g. "(a) + (b)": not one enclosing pair
+                    break
+        if ok and depth == 0:
+            expr = expr[1:-1].strip()
+        else:
+            break
+    return expr
+
+
+def _strip_simple_casts(expr: str) -> str:  # drop leading C++ named casts and C-style "(type)" casts
+    cur = expr.strip()
+    for _ in range(8):  # at most 8 peeling passes
+        prev = cur
+        cur = _strip_enclosing_parens(cur)
+        match = re.match(r"^(?:reinterpret_cast|static_cast|const_cast|dynamic_cast)\s*<[^>]+>\s*\((.*)\)$", cur, re.S)
+        if match:
+            cur = match.group(1).strip()
+            continue
+        match = re.match(r"^\(\s*[^()]+\s*\)\s*(.+)$", cur, re.S)  # NOTE(review): also matches "(a) + b", leaving "+ b" - confirm inputs never look like that
+        if match:
+            cur = match.group(1).strip()
+            continue
+        if cur == prev:
+            break
+    return cur
+
+
 def _infer_void_gm_pointee_type(text: str, param_name: str) -> Optional[str]:
     # Common patterns in PTOAS-generated kernels:
     #   __gm__ int16_t* v16 = (__gm__ int16_t*) v1;
@@ -158,56 +326,88 @@ def _infer_void_gm_pointee_type(text: str, param_name: str) -> Optional[str]:
     return None
 
 
-def _detect_output_pointer_param(text: str, pointer_param_names):
-    if not pointer_param_names:
+def _ordered_unique(items):
+    """Return the distinct elements of ``items`` in first-seen order.
+
+    Elements must be hashable. Later repeats of an element are dropped.
+    """
+    # dict keys keep insertion order, and re-inserting an existing key does
+    # not move it, so the keys are the first occurrence of each element.
+    # (Relies on ordered dicts: Python 3.7+.)
+    return list(dict.fromkeys(items))
+
+
+def _resolve_pointer_param_from_expr(expr: str, pointer_param_names, ptr_to_param, ptr_to_base) -> Optional[str]:
+    """Trace a pointer expression back to the kernel pointer parameter it aliases, or return None."""
+    if not expr:
+        return None
+    name = _strip_simple_casts(expr)
+    offset_form = re.match(r"^(\w+)\s*\+", name)
+    if offset_form:
+        name = offset_form.group(1)  # "base + offset": only the base identifier matters
+    elif not re.fullmatch(r"[A-Za-z_]\w*", name):
+        # Neither "base + ..." nor a bare identifier: too complex to resolve.
         return None
 
+    known_params = set(pointer_param_names)
+    visited = set()
+    # Follow alias links one hop at a time: a cast alias (ptr_to_param) takes
+    # precedence over an offset base (ptr_to_base). The walk is capped at 12
+    # hops and also stops as soon as a name repeats, so cyclic maps terminate.
+    for _ in range(12):
+        if name in visited:
+            break
+        visited.add(name)
+        if name in known_params:
+            return name
+        # A missing entry and a falsy entry are treated alike in both maps.
+        alias = ptr_to_param.get(name) or ptr_to_base.get(name)
+        if not alias:
+            break
+        name = alias
+    # No kernel parameter was reached.
+    return None
+
+
+def _detect_output_pointer_params(text: str, pointer_param_names):
+    if not pointer_param_names:
+        return []
+
     tstore_gts = re.findall(r"\bTSTORE\s*\(\s*(\w+)\s*,", text)
     if not tstore_gts:
-        return None
+        return []
 
-    gt_to_ptr = {}
-    for m in re.finditer(r"\b(\w+)\s*=\s*[\w:<>]+\s*\(\s*(\w+)\s*[,)]", text):
-        gt_to_ptr[m.group(1)] = m.group(2)
+    gt_to_expr = {}
+    for match in re.finditer(
+        r"\bGlobalTensor<[^;\n]*>\s+(\w+)\s*=\s*GlobalTensor<[^;\n]*>\(([^,]+?)\s*,",
+        text,
+    ):
+        gt_to_expr.setdefault(match.group(1), match.group(2).strip())
+    for match in re.finditer(r"\b(\w+)\s+(\w+)\s*=\s*\1\s*\(([^,]+?)\s*,", text):
+        gt_to_expr.setdefault(match.group(2), match.group(3).strip())
 
     ptr_to_base = {}
-    for m in re.finditer(r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*=\s*(\w+)\s*\+", text):
-        ptr_to_base[m.group(1)] = m.group(2)
+    for match in re.finditer(r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*=\s*(\w+)\s*\+", text):
+        ptr_to_base[match.group(1)] = match.group(2)
+    for match in re.finditer(r"\b(\w+)\s*=\s*(\w+)\s*\+\s*[^;]+;", text):
+        ptr_to_base.setdefault(match.group(1), match.group(2))
 
     ptr_to_param = {}
-    for m in re.finditer(
+    for match in re.finditer(
         r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*=\s*\(__gm__\s+[\w:<>]+\s*\*\)\s*(\w+)\b",
         text,
     ):
-        ptr_to_param[m.group(1)] = m.group(2)
-
-    def resolve_param(ptr: Optional[str]) -> Optional[str]:
-        if not ptr:
-            return None
-        cur = ptr
-        seen = set()
-        for _ in range(8):
-            if cur in seen:
-                break
-            seen.add(cur)
-            if cur in pointer_param_names:
-                return cur
-            mapped = ptr_to_param.get(cur)
-            if mapped in pointer_param_names:
-                return mapped
-            cur = ptr_to_base.get(cur)
-            if cur is None:
-                break
-        return None
+        ptr_to_param[match.group(1)] = match.group(2)
+    for match in re.finditer(r"\b(\w+)\s*=\s*\(__gm__\s+[\w:<>]+\s*\*\)\s*(\w+)\b", text):
+        ptr_to_param.setdefault(match.group(1), match.group(2))
 
+    outputs = []
     for gt in tstore_gts:
-        ptr = gt_to_ptr.get(gt)
-        if not ptr:
-            continue
-        resolved = resolve_param(ptr)
-        if resolved:
-            return resolved
-    return None
+        expr = gt_to_expr.get(gt)
+        param = _resolve_pointer_param_from_expr(expr, pointer_param_names, ptr_to_param, ptr_to_base)
+        if param:
+            outputs.append(param)
+    return _ordered_unique(outputs)
 
 
 def _detect_set_ffts_pointer_params(text: str, pointer_param_names):
@@ -300,24 +500,7 @@ def _parse_kernel_params(text: str):
     match = re.search(r"__global__\s+(?:\w+\s+)*void\s+\w+\s*\(([^)]*)\)", text, re.S)
     if not match:
         return []
-    params_blob = match.group(1).strip()
-    if not params_blob:
-        return []
-    params = []
-    depth = 0
-    start = 0
-    for idx, ch in enumerate(params_blob):
-        if ch == "<":
-            depth += 1
-        elif ch == ">":
-            depth = max(depth - 1, 0)
-        elif ch == "," and depth == 0:
-            params.append(params_blob[start:idx].strip())
-            start = idx + 1
-    last = params_blob[start:].strip()
-    if last:
-        params.append(last)
-    return params
+    return _split_params_blob(match.group(1))
 
 
 def _parse_kernel_name(text: str) -> str:
@@ -367,6 +550,15 @@ def _default_eps_for_cpp_type(cpp_type: str) -> float:
     return 0.0
 
 
+def _integer_scalar_default_value(testcase: str, name: str, host_type: str) -> Optional[int]:
+    override = CASE_INT_SCALAR_DEFAULTS.get(testcase, {}).get(name)  # per-testcase, per-parameter override
+    if override is not None:
+        return int(override)  # an override applies regardless of host_type
+    if re.match(r"^(u?int)(8|16|32|64)_t$", host_type) or host_type in {"int", "unsigned", "size_t"}:
+        return 1  # generic default for integer scalars
+    return None  # not an integer type and no override
+
+
 def _derive_testcase_name(input_cpp: Path) -> str:
     name = input_cpp.stem
     if name.endswith("-pto"):
@@ -670,7 +862,7 @@ def ev(node):
     return ev(parsed)
 
 
-def _infer_int_var_maxima(kernel_text: str) -> dict:
+def _infer_int_var_maxima(kernel_text: str, seed_env: Optional[dict] = None) -> dict:
     """
     Infer max values for simple integer temporaries (e.g. v23) used in pointer
     arithmetic, by evaluating constant-ish assignments and simple for-loop ranges.
@@ -715,7 +907,10 @@ def _infer_int_var_maxima(kernel_text: str) -> dict:
         step = m.group(4).strip()
         loops.append((ind, start, end, step))
 
-    maxima: dict[str, Optional[int]] = {}
+    maxima: dict[str, Optional[int]] = {
+        k: (None if v is None else int(v))
+        for k, v in (seed_env or {}).items()
+    }
 
     def set_max(name: str, value: int) -> bool:
         cur = maxima.get(name)
@@ -760,7 +955,7 @@ def set_max(name: str, value: int) -> bool:
     return {k: (0 if v is None else int(v)) for k, v in maxima.items()}
 
 
-def _infer_gm_pointer_elem_counts(kernel_text: str, pointer_param_names):
+def _infer_gm_pointer_elem_counts(kernel_text: str, pointer_param_names, seed_int_env: Optional[dict] = None):
     """
     Infer minimum element counts for each __gm__ pointer param from GlobalTensor
     shape/stride metadata found in PTOAS-generated kernels.
@@ -774,7 +969,7 @@ def _infer_gm_pointer_elem_counts(kernel_text: str, pointer_param_names):
 
     pointer_params = set(pointer_param_names)
 
-    int_max = _infer_int_var_maxima(kernel_text)
+    int_max = _infer_int_var_maxima(kernel_text, seed_env=seed_int_env)
 
     pointer_like = set(pointer_param_names)
     for m in re.finditer(r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*(?:=[^;]+)?;", kernel_text):
@@ -840,25 +1035,6 @@ def resolve_param_and_offset(ptr: str):
             break
         return None, None
 
-    def strip_enclosing_parens(expr: str) -> str:
-        expr = expr.strip()
-        while expr.startswith("(") and expr.endswith(")"):
-            depth = 0
-            ok = True
-            for i, ch in enumerate(expr):
-                if ch == "(":
-                    depth += 1
-                elif ch == ")":
-                    depth -= 1
-                    if depth == 0 and i != len(expr) - 1:
-                        ok = False
-                        break
-            if ok and depth == 0:
-                expr = expr[1:-1].strip()
-            else:
-                break
-        return expr
-
     def resolve_param_and_offset_expr(ptr_expr: str):
         """
         Resolve a pointer expression passed to GlobalTensor(...) back to a GM
@@ -870,22 +1046,22 @@ def resolve_param_and_offset_expr(ptr_expr: str):
           reinterpret_cast<__gm__ float*>(v1 + expr)
           (__gm__ float*)(v1 + expr)
         """
-        expr = strip_enclosing_parens(ptr_expr.strip())
+        expr = _strip_enclosing_parens(ptr_expr.strip())
         if not expr:
             return None, None
 
         m = re.match(r"^(?:reinterpret_cast|static_cast)<[^>]+>\((.*)\)$", expr)
         if m:
-            expr = strip_enclosing_parens(m.group(1).strip())
+            expr = _strip_enclosing_parens(m.group(1).strip())
 
         # C-style cast prefix: (__gm__ float*)expr / (float*)expr
         m = re.match(r"^\(\s*__gm__[^)]*\)\s*(.+)$", expr)
         if m:
-            expr = strip_enclosing_parens(m.group(1).strip())
+            expr = _strip_enclosing_parens(m.group(1).strip())
         else:
             m = re.match(r"^\(\s*[\w:<> ]+\*\s*\)\s*(.+)$", expr)
             if m:
-                expr = strip_enclosing_parens(m.group(1).strip())
+                expr = _strip_enclosing_parens(m.group(1).strip())
 
         m = re.match(r"^(\w+)\s*\+\s*(.+)$", expr)
         if m:
@@ -990,6 +1166,7 @@ def generate_testcase(
 
     raw_kernel = input_cpp.read_text(encoding="utf-8")
     raw_kernel_for_analysis = raw_kernel
+    kernel_info = _describe_kernel_source(raw_kernel_for_analysis)
     # pto.tcmp / pto.tcmps produce packed predicate masks and leave parts of the
     # logical u8 tile undefined. This can make byte-wise compares flaky.
     has_packed_pred_mask = re.search(r"\bTCMPS?\s*\(", raw_kernel_for_analysis) is not None
@@ -997,12 +1174,20 @@ def generate_testcase(
     has_dav_vec = "__DAV_VEC__" in raw_kernel
     has_intra_block_sync = "set_intra_block(" in raw_kernel or "wait_intra_block(" in raw_kernel
 
+    is_mixed_kernel = kernel_info["kind"] == "mixed"
+
     if aicore_arch is None:
+        if is_mixed_kernel:
+            sv = (soc_version or "").lower()
+            if "950" in sv or "a5" in sv or "910b" in sv:
+                aicore_arch = "dav-c310"
+            else:
+                aicore_arch = "dav-c220"
         # Sectioned kernels contain `#if defined(__DAV_CUBE__)` / `__DAV_VEC__`
         # blocks. For inter-core-style mixed kernels (with intra-block sync),
         # align to PTO-ISA mix-kernel compile mode (`dav-c310`) so the
         # toolchain owns DAV macro definition.
-        if has_dav_cube and has_dav_vec and has_intra_block_sync:
+        elif has_dav_cube and has_dav_vec and has_intra_block_sync:
             sv = (soc_version or "").lower()
             if "950" in sv or "a5" in sv:
                 aicore_arch = "dav-c310"
@@ -1028,16 +1213,16 @@ def generate_testcase(
     # For mix-kernel arch (dav-c310/dav-c220), do not force-define macros.
     dav_defines = ""
     is_mix_arch = aicore_arch in {"dav-c310", "dav-c220"}
-    if not (is_mix_arch and has_dav_cube and has_dav_vec and has_intra_block_sync):
+    if not is_mix_arch:
         if has_dav_cube:
             dav_defines += " -D__DAV_CUBE__"
         if has_dav_vec:
             dav_defines += " -D__DAV_VEC__"
 
-    rows, cols = _parse_shape(raw_kernel_for_analysis)
+    rows, cols = _parse_shape(kernel_info["call_text"])
     logical_elem_count = rows * cols
-    kernel_name = _parse_kernel_name(raw_kernel_for_analysis)
-    raw_params = _parse_kernel_params(raw_kernel_for_analysis)
+    kernel_name = kernel_info["kernel_name"]
+    raw_params = kernel_info["raw_params"]
     mrgsort_block_len = _infer_mrgsort_block_len(raw_kernel_for_analysis) if "TMRGSORT" in raw_kernel_for_analysis else None
 
     pointer_param_names = [_extract_cpp_name(p) for p in raw_params if _is_gm_pointer_param(p)]
@@ -1055,13 +1240,17 @@ def generate_testcase(
     ffts_param_names = _detect_set_ffts_pointer_params(raw_kernel_for_analysis, pointer_param_names)
     non_ffts_pointer_param_names = [n for n in pointer_param_names if n not in ffts_param_names]
 
-    output_ptr = _detect_output_pointer_param(raw_kernel_for_analysis, non_ffts_pointer_param_names)
-    if output_ptr is None and non_ffts_pointer_param_names:
-        output_ptr = (
+    output_param_names = []
+    for writer_text in kernel_info["writer_texts"]:
+        output_param_names.extend(_detect_output_pointer_params(writer_text, non_ffts_pointer_param_names))
+    output_param_names = _ordered_unique(output_param_names)
+    if not output_param_names and non_ffts_pointer_param_names:
+        output_param_names = [
             non_ffts_pointer_param_names[0]
             if len(non_ffts_pointer_param_names) == 1
             else non_ffts_pointer_param_names[-1]
-        )
+        ]
+    output_param_name_set = set(output_param_names)
 
     params = []
     for raw in raw_params:
@@ -1080,7 +1269,7 @@ def generate_testcase(
                     "role": (
                         "ffts"
                         if name in ffts_param_names
-                        else ("output" if name == output_ptr else "input")
+                        else ("output" if name in output_param_name_set else "input")
                     ),
                 }
             )
@@ -1106,7 +1295,20 @@ def generate_testcase(
     init_ptrs = list(data_ptrs)
     output_ptrs = [p for p in data_ptrs if p["role"] == "output"]
 
-    inferred_counts = _infer_gm_pointer_elem_counts(raw_kernel_for_analysis, pointer_param_names)
+    scalar_int_defaults = {
+        p["name"]: default_value
+        for p in params
+        if p["kind"] == "scalar"
+        for default_value in [_integer_scalar_default_value(testcase, p["name"], p["host_type"])]
+        if default_value is not None
+    }
+    inferred_counts = {}
+    for analysis_text in kernel_info["analysis_texts"]:
+        partial_counts = _infer_gm_pointer_elem_counts(analysis_text, pointer_param_names, seed_int_env=scalar_int_defaults)
+        for name, count in partial_counts.items():
+            inferred_counts[name] = max(inferred_counts.get(name, 0), count)
+    for name, count in CASE_POINTER_COUNT_MINIMUMS.get(testcase, {}).items():
+        inferred_counts[name] = max(inferred_counts.get(name, 0), int(count))
     ptr_elem_counts = {}
     for p in data_ptrs:
         inferred = inferred_counts.get(p["name"])
@@ -1153,7 +1355,7 @@ def generate_testcase(
         if t == "bool":
             value = "true"
         elif re.match(r"^(u?int)(8|16|32|64)_t$", t) or t in {"int", "unsigned", "size_t"}:
-            value = "1"
+            value = str(_integer_scalar_default_value(testcase, p["name"], t))  # never None for an integer type; an explicit 0 override is kept as 0
         elif t in {"float"}:
             value = "1.0f"
         elif t in {"double"}:
@@ -1429,22 +1631,51 @@ def generate_testcase(
     kernel_call_args_host = ", ".join(kernel_call_args_host)
     raw_params_host = [_rewrite_host_unsupported_types(p) for p in raw_params]
     launch_block_count = _infer_launch_block_count(raw_kernel_for_analysis, testcase)
-    launch_cpp = (
-        INCLUDE_REPLACEMENT
-        + "\n"
-        "#if defined(__CCE_AICORE__)\n"
-        f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n"
-        "#else\n"
-        f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n"
-        "#endif\n\n"
-        f"void {launch_name}({launch_fn_params}) {{\n"
-        "#if defined(__CCE_AICORE__)\n"
-        f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n"
-        "#else\n"
-        f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n"
-        "#endif\n"
-        f"}}\n"
-    )
+    if is_mixed_kernel:
+        wrapper_call_args = ", ".join([p["name"] for p in params])
+        launch_cpp = (
+            INCLUDE_REPLACEMENT
+            + "\n"
+            "#if defined(__CCE_AICORE__)\n"
+            f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params)});\n"
+            f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params)});\n"
+            f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n"
+            f"    {kernel_info['aic_name']}({wrapper_call_args});\n"
+            f"    {kernel_info['aiv_name']}({wrapper_call_args});\n"
+            "}\n"
+            "#else\n"
+            f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params_host)});\n"
+            f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params_host)});\n"
+            f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)}) {{\n"
+            f"    {kernel_info['aic_name']}({wrapper_call_args});\n"
+            f"    {kernel_info['aiv_name']}({wrapper_call_args});\n"
+            "}\n"
+            "#endif\n\n"
+            f"void {launch_name}({launch_fn_params}) {{\n"
+            "#if defined(__CCE_AICORE__)\n"
+            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n"
+            "#else\n"
+            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n"
+            "#endif\n"
+            f"}}\n"
+        )
+    else:
+        launch_cpp = (
+            INCLUDE_REPLACEMENT
+            + "\n"
+            "#if defined(__CCE_AICORE__)\n"
+            f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n"
+            "#else\n"
+            f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n"
+            "#endif\n\n"
+            f"void {launch_name}({launch_fn_params}) {{\n"
+            "#if defined(__CCE_AICORE__)\n"
+            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n"
+            "#else\n"
+            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n"
+            "#endif\n"
+            f"}}\n"
+        )
     (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8")
 
     # pto-isa selects instruction implementations based on MEMORY_BASE vs
diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md
index 34e3a51bf..9132ac959 100644
--- a/test/samples/Qwen3Tilelet/README.md
+++ b/test/samples/Qwen3Tilelet/README.md
@@ -10,4 +10,5 @@ Notes:
   `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`,
   `qwen3_decode_layer_incore_14`.
 - This sample directory vendors only those direct `ptoas` regression inputs.
-- No custom `golden.py` or `compare.py` is included here: these grouped mixed kernels depend on orchestration-managed peer buffers and loop-carried context, so per-kernel numerical validation is not a drop-in replacement for the full PyPTO runtime flow.
+- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation.
+- Custom `golden.py` assets cover these 5 cases as standalone mixed-kernel regression tests on A5.
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py
new file mode 100644
index 000000000..5e8eeadbe
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray:
+    if positive:
+        return generator.uniform(0.5, 1.5, size=count).astype(np.float32)  # fixed 0.5..1.5 range; `scale` is ignored on this path
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)  # symmetric range around zero
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))
+
+
+def main():
+    meta = load_case_meta()
+    generator = rng()
+    b0, ob = load_int32_assignments()[:2]  # first two int32 assignments; unpacking needs at least two
+
+    buffers = {
+        "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05),
+        "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True),
+        "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True),
+        "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]),  # output buffer, zero-filled on input
+        "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.05),
+    }
+
+    inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1)  # reshape requires v3 to hold exactly 4 elements
+    output = np.zeros_like(buffers["v4"])
+
+    for ob_ci in range(4):  # 4 output column blocks, 64 columns each
+        q0 = (ob * 4 + ob_ci) * 64
+        acc = np.zeros((4, 64), dtype=np.float32)
+        for kb in range(40):  # 40 chunks of 128 along the reduction axis (40 * 128 = 5120)
+            k0 = kb * 128
+            x_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
+            )
+            gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32)
+            w_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v5"], offset=k0 * 5120 + q0, rows=128, cols=64, row_stride=5120)
+            )
+            acc += (x_chunk * inv_rms * gamma) @ w_chunk  # scale rows by inv_rms and columns by gamma, then multiply by the weight chunk
+        output = store_strided_2d(output, float32_to_bf16(acc), offset=b0 * 5120 + q0, row_stride=5120)
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v4": output})  # v4 is the only buffer with a golden reference
+
+if __name__ == "__main__":
+    main()
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py
new file mode 100644
index 000000000..19bcac622
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))
+
+
+def main():
+    meta = load_case_meta()
+    generator = rng()
+    b0, ob = load_int32_assignments()[:2]
+
+    buffers = {
+        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.05),
+        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.05),
+        "v3": np.zeros(meta.elem_counts["v3"], dtype=meta.np_types["v3"]),
+        "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.05),
+    }
+
+    output = np.zeros_like(buffers["v3"])
+
+    for ob_ci in range(8):
+        o0 = (ob * 8 + ob_ci) * 64
+        acc = np.zeros((4, 64), dtype=np.float32)
+        for kb in range(40):
+            k0 = kb * 128
+            attn_chunk = load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
+            attn_chunk = bf16_to_float32(float32_to_bf16(attn_chunk))
+            w_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v4"], offset=k0 * 5120 + o0, rows=128, cols=64, row_stride=5120)
+            )
+            acc += attn_chunk @ w_chunk
+        resid = bf16_to_float32(
+            load_strided_2d(buffers["v2"], offset=b0 * 5120 + o0, rows=4, cols=64, row_stride=5120)
+        )
+        output = store_strided_2d(output, acc + resid, offset=o0, row_stride=5120)
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v3": output})
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py
new file mode 100644
index 000000000..2fe6818ae
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))
+
+
+def main():
+    meta = load_case_meta()
+    generator = rng()
+    o0 = load_int32_assignments()[0]
+
+    buffers = {
+        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01),
+        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01),
+        "v3": make_fp32(generator, meta.elem_counts["v3"], scale=0.01),
+        "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.01),
+        "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.01),
+        "v6": np.zeros(meta.elem_counts["v6"], dtype=meta.np_types["v6"]),
+    }
+
+    gate_acc = np.zeros((4, 64), dtype=np.float32)
+    up_acc = np.zeros((4, 64), dtype=np.float32)
+
+    for kb in range(40):
+        k0 = kb * 128
+        post_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=k0, rows=4, cols=128, row_stride=5120))
+        w_gate = bf16_to_float32(
+            load_strided_2d(buffers["v4"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600)
+        )
+        w_up = bf16_to_float32(
+            load_strided_2d(buffers["v5"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600)
+        )
+        gate_acc += post_chunk @ w_gate
+        up_acc += post_chunk @ w_up
+
+    sigmoid = np.reciprocal(1.0 + np.exp(-gate_acc))
+    mlp_chunk = gate_acc * sigmoid * up_acc
+    output = float32_to_bf16(mlp_chunk)
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v6": output})
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py
new file mode 100644
index 000000000..d913d746e
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))
+
+
+def main():
+    meta = load_case_meta()
+    generator = rng()
+    dob, o0 = load_int32_assignments()[:2]
+
+    buffers = {
+        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01),
+        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01),
+        "v3": make_bf16(generator, meta.elem_counts["v3"], scale=0.01),
+    }
+
+    output = np.array(buffers["v1"], copy=True)
+    mlp_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=0, rows=4, cols=64, row_stride=64))
+
+    for dob_ci in range(4):
+        d0 = (dob * 4 + dob_ci) * 128
+        down_prev = load_strided_2d(output, offset=d0, rows=4, cols=128, row_stride=5120).astype(np.float32)
+        w_down = bf16_to_float32(
+            load_strided_2d(buffers["v3"], offset=o0 * 5120 + d0, rows=64, cols=128, row_stride=5120)
+        )
+        output = store_strided_2d(output, down_prev + mlp_chunk @ w_down, offset=d0, row_stride=5120)
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v1": output})
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py
new file mode 100644
index 000000000..59a46b188
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray:
+    if positive:
+        return generator.uniform(0.5, 1.5, size=count).astype(np.float32)
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))
+
+
+def main():
+    meta = load_case_meta()
+    generator = rng()
+    b0, ob = load_int32_assignments()[:2]
+
+    buffers = {
+        "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05),
+        "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True),
+        "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True),
+        "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]),
+        "v5": np.zeros(meta.elem_counts["v5"], dtype=meta.np_types["v5"]),
+        "v6": make_bf16(generator, meta.elem_counts["v6"], scale=0.05),
+        "v7": make_bf16(generator, meta.elem_counts["v7"], scale=0.05),
+    }
+
+    inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1)
+    k_proj = np.zeros_like(buffers["v4"])
+    v_proj = np.zeros_like(buffers["v5"])
+
+    for ob_ci in range(8):
+        kv0 = (ob * 8 + ob_ci) * 64
+        k_acc = np.zeros((4, 64), dtype=np.float32)
+        v_acc = np.zeros((4, 64), dtype=np.float32)
+        for kb in range(40):
+            k0 = kb * 128
+            x_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
+            )
+            gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32)
+            normed = x_chunk * inv_rms * gamma
+            wk_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024)
+            )
+            wv_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v7"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024)
+            )
+            k_acc += normed @ wk_chunk
+            v_acc += normed @ wv_chunk
+        k_proj = store_strided_2d(k_proj, float32_to_bf16(k_acc), offset=b0 * 1024 + kv0, row_stride=1024)
+        v_proj = store_strided_2d(v_proj, float32_to_bf16(v_acc), offset=b0 * 1024 + kv0, row_stride=1024)
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v4": k_proj, "v5": v_proj})
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/samples/validation_runtime.py b/test/samples/validation_runtime.py
index fdeedfd05..247020a8d 100644
--- a/test/samples/validation_runtime.py
+++ b/test/samples/validation_runtime.py
@@ -89,6 +89,43 @@ def is_a5_soc() -> bool:
     return '950' in sv or 'a5' in sv or '910_95' in sv
 
 
+def bf16_to_float32(values: np.ndarray) -> np.ndarray:
+    values_u16 = np.asarray(values, dtype=np.uint16)
+    return (values_u16.astype(np.uint32) << 16).view(np.float32)
+
+
+def float32_to_bf16(values: np.ndarray) -> np.ndarray:
+    values_f32 = np.asarray(values, dtype=np.float32)
+    bits = values_f32.view(np.uint32)
+    round_bias = np.uint32(0x7FFF) + ((bits >> 16) & np.uint32(1))
+    return ((bits + round_bias) >> 16).astype(np.uint16)
+
+
+def load_strided_2d(buffer, *, offset: int, rows: int, cols: int, row_stride: int) -> np.ndarray:
+    flat = np.asarray(buffer).reshape(-1)
+    tile = np.empty((rows, cols), dtype=flat.dtype)
+    for row in range(rows):
+        start = offset + row * row_stride
+        stop = start + cols
+        if stop > flat.size:
+            raise ValueError(f'strided load out of bounds: [{start}:{stop}] > {flat.size}')
+        tile[row, :] = flat[start:stop]
+    return tile
+
+
+def store_strided_2d(buffer, tile, *, offset: int, row_stride: int):
+    flat = np.asarray(buffer).reshape(-1)
+    tile_arr = np.asarray(tile)
+    rows, cols = tile_arr.shape
+    for row in range(rows):
+        start = offset + row * row_stride
+        stop = start + cols
+        if stop > flat.size:
+            raise ValueError(f'strided store out of bounds: [{start}:{stop}] > {flat.size}')
+        flat[start:stop] = tile_arr[row]
+    return flat
+
+
 def float_values(generator, count: int, *, style: str) -> np.ndarray:
     if style == 'signed':
         values = generator.uniform(-3.0, 3.0, size=count).astype(np.float32)

From 5fad1616b399f86ed0264a1deb51d8aa5e01307a Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 11:04:16 +0800
Subject: [PATCH 05/16] test: align Qwen3Tilelet golden layout

---
 .gitignore                                                    | 4 ----
 test/samples/Qwen3Tilelet/README.md                           | 2 +-
 .../golden.py => qwen3_decode_layer_incore_10_golden.py}      | 0
 .../golden.py => qwen3_decode_layer_incore_13_golden.py}      | 0
 .../golden.py => qwen3_decode_layer_incore_14_golden.py}      | 0
 .../golden.py => qwen3_decode_layer_incore_1_golden.py}       | 0
 .../golden.py => qwen3_decode_layer_incore_2_golden.py}       | 0
 7 files changed, 1 insertion(+), 5 deletions(-)
 rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_10/golden.py => qwen3_decode_layer_incore_10_golden.py} (100%)
 rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_13/golden.py => qwen3_decode_layer_incore_13_golden.py} (100%)
 rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_14/golden.py => qwen3_decode_layer_incore_14_golden.py} (100%)
 rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_1/golden.py => qwen3_decode_layer_incore_1_golden.py} (100%)
 rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_2/golden.py => qwen3_decode_layer_incore_2_golden.py} (100%)

diff --git a/.gitignore b/.gitignore
index 093b87116..44c61b02a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,10 +64,6 @@ dist/
 /remote_npu_validation_results*.tsv
 /npu_validation/
 test/samples/**/npu_validation/
-!test/samples/Qwen3Tilelet/npu_validation/
-!test/samples/Qwen3Tilelet/npu_validation/**/
-!test/samples/Qwen3Tilelet/npu_validation/**/golden.py
-!test/samples/Qwen3Tilelet/npu_validation/**/compare.py
 /tmp_gen*
 
 # IDE/editor
diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md
index 9132ac959..3298e0b97 100644
--- a/test/samples/Qwen3Tilelet/README.md
+++ b/test/samples/Qwen3Tilelet/README.md
@@ -11,4 +11,4 @@ Notes:
   `qwen3_decode_layer_incore_14`.
 - This sample directory vendors only those direct `ptoas` regression inputs.
 - `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation.
-- Custom `golden.py` assets cover these 5 cases as standalone mixed-kernel regression tests on A5.
+- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `<case>_golden.py`.
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
similarity index 100%
rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py
rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
similarity index 100%
rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py
rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
similarity index 100%
rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py
rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
similarity index 100%
rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py
rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
similarity index 100%
rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py
rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py

From ddd9941f83b6f4bb436715360c5dc1dd1e7be12c Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 11:09:38 +0800
Subject: [PATCH 06/16] chore: add missing PR386 license headers

---
 .github/scripts/compute_ptoas_version.py | 7 +++++++
 test/samples/validation_runtime.py       | 8 ++++++++
 2 files changed, 15 insertions(+)

diff --git a/.github/scripts/compute_ptoas_version.py b/.github/scripts/compute_ptoas_version.py
index 85205b6b8..92f4bd627 100644
--- a/.github/scripts/compute_ptoas_version.py
+++ b/.github/scripts/compute_ptoas_version.py
@@ -1,4 +1,11 @@
 #!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
 
 import argparse
 import pathlib
diff --git a/test/samples/validation_runtime.py b/test/samples/validation_runtime.py
index 247020a8d..b97f8a861 100644
--- a/test/samples/validation_runtime.py
+++ b/test/samples/validation_runtime.py
@@ -1,4 +1,12 @@
 #!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
 import os
 import re
 import sys

From 7fe8bcf543bbd2ac35d80e8b6cbbf2d152179c4a Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 15:11:28 +0800
Subject: [PATCH 07/16] test: model bf16 requantize in Qwen3 goldens

---
 .../Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py     | 7 ++++++-
 .../Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py     | 6 +++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
index 5e8eeadbe..3a7d64a75 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
@@ -32,6 +32,10 @@ def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
     return float32_to_bf16(make_fp32(generator, count, scale=scale))
 
 
+def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray:
+    return bf16_to_float32(float32_to_bf16(values))
+
+
 def main():
     meta = load_case_meta()
     generator = rng()
@@ -57,10 +61,11 @@ def main():
                 load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
             )
             gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32)
+            normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma)
             w_chunk = bf16_to_float32(
                 load_strided_2d(buffers["v5"], offset=k0 * 5120 + q0, rows=128, cols=64, row_stride=5120)
             )
-            acc += (x_chunk * inv_rms * gamma) @ w_chunk
+            acc += normed @ w_chunk
         output = store_strided_2d(output, float32_to_bf16(acc), offset=b0 * 5120 + q0, row_stride=5120)
 
     write_buffers(meta, buffers)
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
index 59a46b188..347ca7c0a 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
@@ -32,6 +32,10 @@ def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
     return float32_to_bf16(make_fp32(generator, count, scale=scale))
 
 
+def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray:
+    return bf16_to_float32(float32_to_bf16(values))
+
+
 def main():
     meta = load_case_meta()
     generator = rng()
@@ -61,7 +65,7 @@ def main():
                 load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
             )
             gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32)
-            normed = x_chunk * inv_rms * gamma
+            normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma)
             wk_chunk = bf16_to_float32(
                 load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024)
             )

From 1183a9298643801024de61f190482399238f398f Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 16:38:16 +0800
Subject: [PATCH 08/16] test: regenerate qwen3 tilelet PTO inputs with M16

---
 test/samples/Qwen3Tilelet/README.md           |  12 +-
 .../qwen3_decode_layer_incore_0.pto           |  23 ++
 .../qwen3_decode_layer_incore_1.pto           | 121 ++---------
 .../qwen3_decode_layer_incore_10.pto          | 122 ++---------
 .../qwen3_decode_layer_incore_10_golden.py    |  68 ------
 .../qwen3_decode_layer_incore_11.pto          | 118 +++++++++++
 .../qwen3_decode_layer_incore_12.pto          |  31 +++
 .../qwen3_decode_layer_incore_13.pto          | 121 ++---------
 .../qwen3_decode_layer_incore_13_golden.py    |  70 -------
 .../qwen3_decode_layer_incore_14.pto          | 128 +++++++-----
 .../qwen3_decode_layer_incore_14_golden.py    |  60 ------
 .../qwen3_decode_layer_incore_15.pto          |  47 +++++
 .../qwen3_decode_layer_incore_16.pto          |  49 +++++
 .../qwen3_decode_layer_incore_17.pto          | 104 ++++++++++
 .../qwen3_decode_layer_incore_18.pto          |  75 +++++++
 .../qwen3_decode_layer_incore_19.pto          |  36 ++++
 .../qwen3_decode_layer_incore_1_golden.py     |  76 -------
 .../qwen3_decode_layer_incore_2.pto           | 196 ++++++------------
 .../qwen3_decode_layer_incore_2_golden.py     |  85 --------
 .../qwen3_decode_layer_incore_3.pto           |  45 ++++
 .../qwen3_decode_layer_incore_4.pto           |  46 ++++
 .../qwen3_decode_layer_incore_5.pto           |  46 ++++
 .../qwen3_decode_layer_incore_6.pto           |  88 ++++++++
 .../qwen3_decode_layer_incore_7.pto           |  92 ++++++++
 .../qwen3_decode_layer_incore_8.pto           |  30 +++
 .../qwen3_decode_layer_incore_9.pto           |  49 +++++
 26 files changed, 1063 insertions(+), 875 deletions(-)
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto

diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md
index 3298e0b97..4f78ed37f 100644
--- a/test/samples/Qwen3Tilelet/README.md
+++ b/test/samples/Qwen3Tilelet/README.md
@@ -1,14 +1,10 @@
 Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_tilelet.py`.
 
 Scope:
-- compile-regression inputs for `ptoas`
+- direct `ptoas` compile-regression inputs
 - A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS`
 
 Notes:
-- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs:
-  `qwen3_decode_layer_incore_1`, `qwen3_decode_layer_incore_2`,
-  `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`,
-  `qwen3_decode_layer_incore_14`.
-- This sample directory vendors only those direct `ptoas` regression inputs.
-- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation.
-- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `<case>_golden.py`.
+- The current tilelet lowering emits 20 kernel fragments (`aiv`, `aic`, and mixed-kernel `.pto` files). This directory vendors those emitted `.pto` inputs directly, flattened into one sample directory for `runop.sh`.
+- These files are regenerated from the tilelet example with `BATCH_TILE=16` / M=16 lowering.
+- The directory is compile-regression focused; stale custom NPU-validation goldens for the old M=4 split are intentionally dropped here.
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
new file mode 100644
index 000000000..856f60659
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
@@ -0,0 +1,23 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_0(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c4096 = arith.constant 4096 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %attn_out__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %q_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %zero_q__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%zero_q__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %zero_attn__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcvt ins(%zero_q__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%zero_attn__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %q_proj__iter_v1_pview = pto.partition_view %q_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tstore ins(%zero_q__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
+  %attn_out__iter_v1_pview = pto.partition_view %attn_out__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+  pto.tstore ins(%zero_attn__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_out__iter_v1_pview : !pto.partition_tensor_view<16x64xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
index 77b0b5c33..2d0902b60 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
@@ -1,116 +1,23 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_1_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c4096 = arith.constant 4096 : i64
+  func.func @qwen3_decode_layer_incore_1(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
   %c0i = arith.constant 0 : i64
-  %c16 = arith.constant 16 : index
-  %c5120 = arith.constant 5120 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %c0 = arith.constant 0 : index
-    %c64 = arith.constant 64 : index
-    %c40 = arith.constant 40 : index
-      %c128 = arith.constant 128 : index
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_1_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", size = 4096, location = #pto.address_space<mat>, auto = false, base = 0} -> i32
-  %qwen3_decode_layer_incore_1_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer : i32)
-  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {
-    %0 = arith.muli %arg6, %c4 : index
-    %1 = arith.addi %0, %ob__ci_idx_v0 : index
-    %2 = arith.muli %1, %c1 : index
-    %3 = arith.addi %c0, %2 : index
-    %4 = arith.muli %3, %c64 : index
-    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-      %5 = arith.muli %kb__idx_v0, %c128 : index
-      %wq_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-      pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wq_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      %t__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %t__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      pto.tmov ins(%t__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%t__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      pto.tfree_from_aiv {split = 0}
-      %wq_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-      pto.tmov ins(%wq_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wq_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-      pto.tmatmul ins(%t__tile_Left, %wq_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-    }
-  }
-  return
-  }
-  func.func @qwen3_decode_layer_incore_1_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
   %c4096 = arith.constant 4096 : i64
-  %c4128 = arith.constant 4128 : i64
-  %c5152 = arith.constant 5152 : i64
-  %c6176 = arith.constant 6176 : i64
-  %c8224 = arith.constant 8224 : i64
-  %c8736 = arith.constant 8736 : i64
-  %c10784 = arith.constant 10784 : i64
-  %c11808 = arith.constant 11808 : i64
   %c16 = arith.constant 16 : index
-  %c5120 = arith.constant 5120 : index
+  %c1024 = arith.constant 1024 : index
   %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c64 = arith.constant 64 : index
   %c0 = arith.constant 0 : index
-    %c64 = arith.constant 64 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %c40 = arith.constant 40 : index
-      %c128 = arith.constant 128 : index
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_1_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aic} -> i32
-  %qwen3_decode_layer_incore_1_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", size = 4096, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer_import : i32)
-  %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x1xf32>
-  pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {
-    %5 = arith.muli %arg6, %c4 : index
-    %6 = arith.addi %5, %ob__ci_idx_v0 : index
-    %7 = arith.muli %6, %c1 : index
-    %8 = arith.addi %c0, %7 : index
-    %9 = arith.muli %8, %c64 : index
-    %q_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmuls ins(%q_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-      %10 = arith.muli %kb__idx_v0, %c128 : index
-      %t__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg5, %10], sizes = [%c4, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x128xbf16>
-      pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %x_chunk__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %gamma__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %10], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-      pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %1 = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %normed__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tcolexpandmul ins(%1, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %2 = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %t__tile_nz = pto.alloc_tile addr = %c8736 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      pto.tpush_to_aic(%t__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
-      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %3 = pto.alloc_tile addr = %c10784 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      pto.tfree_from_aic {split = 0}
-      %q_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    }
-    %4 = pto.alloc_tile addr = %c11808 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%0{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %q_proj__co_l1_iter_v3_pview = pto.partition_view %q_proj__co_l0_iter_v3_view, offsets = [%arg5, %9], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
-    pto.tstore ins(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>)
-  }
+  %k_proj__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %v_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %zero_k__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%zero_k__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %zero_v__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%zero_v__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %k_proj__iter_v1_pview = pto.partition_view %k_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tstore ins(%zero_k__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
+  %v_proj__iter_v1_pview = pto.partition_view %v_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tstore ins(%zero_v__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
   return
   }
 }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
index 636b81393..bc49f96e4 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
@@ -1,108 +1,30 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_10_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c4096 = arith.constant 4096 : i64
+  func.func @qwen3_decode_layer_incore_10(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
   %c0i = arith.constant 0 : i64
+  %c16384 = arith.constant 16384 : i64
   %c16 = arith.constant 16 : index
-  %c5120 = arith.constant 5120 : index
+  %c64 = arith.constant 64 : index
   %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
+  %c524288 = arith.constant 524288 : index
+  %c128 = arith.constant 128 : index
   %c0 = arith.constant 0 : index
-  %c8 = arith.constant 8 : index
-    %c64 = arith.constant 64 : index
-    %c40 = arith.constant 40 : index
-      %c128 = arith.constant 128 : index
-  %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_10_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", size = 4096, location = #pto.address_space<mat>, auto = false, base = 0} -> i32
-  %qwen3_decode_layer_incore_10_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer : i32)
-  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
-    %0 = arith.muli %arg5, %c8 : index
-    %1 = arith.addi %0, %ob__ci_idx_v0 : index
-    %2 = arith.muli %1, %c1 : index
-    %3 = arith.addi %c0, %2 : index
-    %4 = arith.muli %3, %c64 : index
-    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-      %5 = arith.muli %kb__idx_v0, %c128 : index
-      %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-      pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      %a_chunk__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      pto.tmov ins(%a_chunk__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%a_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      pto.tfree_from_aiv {split = 0}
-      %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-      pto.tmov ins(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-      pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-    }
-  }
-  return
-  }
-  func.func @qwen3_decode_layer_incore_10_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c4096 = arith.constant 4096 : i64
-  %c5120 = arith.constant 5120 : i64
-  %c7168 = arith.constant 7168 : i64
-  %c8192 = arith.constant 8192 : i64
-  %c10240 = arith.constant 10240 : i64
-  %c11264 = arith.constant 11264 : i64
-  %c9216 = arith.constant 9216 : i64
-  %c16 = arith.constant 16 : index
-  %4 = arith.constant 5120 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %c0 = arith.constant 0 : index
-  %c8 = arith.constant 8 : index
-    %c64 = arith.constant 64 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %c40 = arith.constant 40 : index
-      %c128 = arith.constant 128 : index
-  %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c4, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%4, %4], strides = [%4, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_10_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aic} -> i32
-  %qwen3_decode_layer_incore_10_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", size = 4096, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer_import : i32)
-  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
-    %5 = arith.muli %arg5, %c8 : index
-    %6 = arith.addi %5, %ob__ci_idx_v0 : index
-    %7 = arith.muli %6, %c1 : index
-    %8 = arith.addi %c0, %7 : index
-    %9 = arith.muli %8, %c64 : index
-    %o_acc__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmuls ins(%o_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-      %10 = arith.muli %kb__idx_v0, %c128 : index
-      %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %attn_out__rv_v2_pview = pto.partition_view %attn_out__rv_v2_view, offsets = [%arg4, %10], sizes = [%c4, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x128xf32>
-      pto.tload ins(%attn_out__rv_v2_pview : !pto.partition_tensor_view<4x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %a_chunk__tile = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%a_chunk__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %a_chunk__tile_nz = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      pto.tmov ins(%a_chunk__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%a_chunk__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      pto.tpush_to_aic(%a_chunk__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
-      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %1 = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      pto.tfree_from_aic {split = 0}
-      %o_acc__tile_mv = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tmov ins(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    }
-    %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %9], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
-    pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %resid__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%2{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %3 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%0, %resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %resid1_tile__co_l1_iter_v1_pview = pto.partition_view %resid1_tile__co_l0_iter_v1_view, offsets = [%c0, %9], sizes = [%c4, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x64xf32>
-    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid1_tile__co_l1_iter_v1_pview : !pto.partition_tensor_view<4x64xf32>)
-  }
+  %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
+  pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+  pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%v_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+  %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+  pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>)
   return
   }
 }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
deleted file mode 100644
index 19bcac622..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/python3
-# Copyright (c) 2026 Huawei Technologies Co., Ltd.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-
-import numpy as np
-
-from validation_runtime import (
-    bf16_to_float32,
-    float32_to_bf16,
-    load_case_meta,
-    load_int32_assignments,
-    load_strided_2d,
-    rng,
-    store_strided_2d,
-    write_buffers,
-    write_golden,
-)
-
-
-def make_fp32(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
-    return generator.uniform(-scale, scale, size=count).astype(np.float32)
-
-
-def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
-    return float32_to_bf16(make_fp32(generator, count, scale=scale))
-
-
-def main():
-    meta = load_case_meta()
-    generator = rng()
-    b0, ob = load_int32_assignments()[:2]
-
-    buffers = {
-        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.05),
-        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.05),
-        "v3": np.zeros(meta.elem_counts["v3"], dtype=meta.np_types["v3"]),
-        "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.05),
-    }
-
-    output = np.zeros_like(buffers["v3"])
-
-    for ob_ci in range(8):
-        o0 = (ob * 8 + ob_ci) * 64
-        acc = np.zeros((4, 64), dtype=np.float32)
-        for kb in range(40):
-            k0 = kb * 128
-            attn_chunk = load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
-            attn_chunk = bf16_to_float32(float32_to_bf16(attn_chunk))
-            w_chunk = bf16_to_float32(
-                load_strided_2d(buffers["v4"], offset=k0 * 5120 + o0, rows=128, cols=64, row_stride=5120)
-            )
-            acc += attn_chunk @ w_chunk
-        resid = bf16_to_float32(
-            load_strided_2d(buffers["v2"], offset=b0 * 5120 + o0, rows=4, cols=64, row_stride=5120)
-        )
-        output = store_strided_2d(output, acc + resid, offset=o0, row_stride=5120)
-
-    write_buffers(meta, buffers)
-    write_golden(meta, {"v3": output})
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
new file mode 100644
index 000000000..9a8a29a01
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
@@ -0,0 +1,118 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_11(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<f32>, %arg7: !pto.ptr<f32>, %arg8: !pto.ptr<f32>, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c32 = arith.constant 32 : i64
+  %c64 = arith.constant 64 : i64
+  %c96 = arith.constant 96 : i64
+  %c128 = arith.constant 128 : i64
+  %c4224 = arith.constant 4224 : i64
+  %c8320 = arith.constant 8320 : i64
+  %c12416 = arith.constant 12416 : i64
+  %c12448 = arith.constant 12448 : i64
+  %c12480 = arith.constant 12480 : i64
+  %c12512 = arith.constant 12512 : i64
+  %c12544 = arith.constant 12544 : i64
+  %c12576 = arith.constant 12576 : i64
+  %c12608 = arith.constant 12608 : i64
+  %c8 = arith.constant 8 : index
+  %c1 = arith.constant 1 : index
+  %7 = arith.constant 128 : index
+  %c16 = arith.constant 16 : index
+  %c0 = arith.constant 0 : index
+  %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %li__phi_v5 = pto.alloc_tile addr = %c12416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %mi__phi_v5 = pto.alloc_tile addr = %c12448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %oi__phi_v5 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %8 = arith.cmpi eq, %arg9, %c0 : index
+  scf.if %8 {
+    %oi__ssa_v3 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__ssa_v3 = pto.alloc_tile addr = %c12416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %mi__ssa_v3 = pto.alloc_tile addr = %c12448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%li__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  } else {
+    %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi_new__tile = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v5 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %alpha__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v10 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v10 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%beta__row_major_tmp_v12 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %beta__tile = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v15 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v15 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v18 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v18 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %2 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__row_major_tmp_v21 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%li__row_major_tmp_v21 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %3 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%4, %5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi__ssa_v4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  }
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tstore ins(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
new file mode 100644
index 000000000..a9c4f9bee
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
@@ -0,0 +1,31 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_12(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c32 = arith.constant 32 : i64
+  %c4128 = arith.constant 4128 : i64
+  %c1 = arith.constant 1 : index
+  %c8192 = arith.constant 8192 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c1024 = arith.constant 1024 : index
+  %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %ctx_flat_bf16__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcvt ins(%ctx_flat__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx_flat_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %0 = arith.muli %arg3, %c128 : index
+  %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x1024xbf16>
+  pto.tstore ins(%ctx_flat_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
index 6eedee90d..8b38aaf7e 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
@@ -1,116 +1,21 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_13_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  func.func @qwen3_decode_layer_incore_13(%arg0: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
   %c0i = arith.constant 0 : i64
-  %c1024 = arith.constant 1024 : i64
-  %c17408 = arith.constant 17408 : i64
-  %c4 = arith.constant 4 : index
-  %c64 = arith.constant 64 : index
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
   %c1 = arith.constant 1 : index
-  %c5120 = arith.constant 5120 : index
-  %c25600 = arith.constant 25600 : index
-  %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
-  %c40 = arith.constant 40 : index
-    %c128 = arith.constant 128 : index
-  %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_13_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_13_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-    %1 = arith.muli %kb__idx_v0, %c128 : index
-    %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c4, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x128xbf16>
-    pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %wg__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-    pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %wu__tile = pto.alloc_tile addr = %c17408 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-    pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%post_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wg__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-    %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wu__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-    pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+  %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %cst = arith.constant 0.000000e+00 : f32
+  %resid1_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  scf.for %ob__idx_v0 = %c0 to %c128 step %c1 {
+    %0 = arith.muli %ob__idx_v0, %c64 : index
+    %zero_resid1__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texpands ins(%cst : f32) outs(%zero_resid1__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %resid1_tile__iter_v1_pview = pto.partition_view %resid1_tile__ssa_v0_view, offsets = [%c0, %0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+    pto.tstore ins(%zero_resid1__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid1_tile__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
   }
   return
   }
-  func.func @qwen3_decode_layer_incore_13_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c8192 = arith.constant 8192 : i64
-  %c9216 = arith.constant 9216 : i64
-  %c11264 = arith.constant 11264 : i64
-  %c12288 = arith.constant 12288 : i64
-  %c10240 = arith.constant 10240 : i64
-  %c13312 = arith.constant 13312 : i64
-  %c4 = arith.constant 4 : index
-  %c64 = arith.constant 64 : index
-  %c1 = arith.constant 1 : index
-  %c5120 = arith.constant 5120 : index
-  %c25600 = arith.constant 25600 : index
-  %c0_i32 = arith.constant 0 : i32
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %c40 = arith.constant 40 : index
-  %cst_1 = arith.constant 1.000000e+00 : f32
-  %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_13_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", size = 8192, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
-  %gate_acc__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %gate_acc__ssa_v0_pview = pto.partition_view %gate_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x64xf32>
-  pto.tload ins(%gate_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %up_acc__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %up_acc__ssa_v0_pview = pto.partition_view %up_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x64xf32>
-  pto.tload ins(%up_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmuls ins(%gate_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %1 = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmuls ins(%up_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tfree_from_aic {split = 0}
-    %3 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %4 = pto.alloc_tile addr = %c12288 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%1, %3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tfree_from_aic {split = 0}
-    %gate_acc__tile_mv = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gate_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %up_acc__tile_mv = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%up_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  }
-  %t__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tneg ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %5 = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %6 = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tadds ins(%5, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sigmoid__tile = pto.alloc_tile addr = %c11264 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.trecip ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %7 = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmul ins(%0, %sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %mlp_chunk__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmul ins(%7, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c13312 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcvt ins(%mlp_chunk__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
-  pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<4x64xbf16>)
-  return
-  }
 }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
deleted file mode 100644
index 2fe6818ae..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/python3
-# Copyright (c) 2026 Huawei Technologies Co., Ltd.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-
-import numpy as np
-
-from validation_runtime import (
-    bf16_to_float32,
-    float32_to_bf16,
-    load_case_meta,
-    load_int32_assignments,
-    load_strided_2d,
-    rng,
-    write_buffers,
-    write_golden,
-)
-
-
-def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
-    return generator.uniform(-scale, scale, size=count).astype(np.float32)
-
-
-def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
-    return float32_to_bf16(make_fp32(generator, count, scale=scale))
-
-
-def main():
-    meta = load_case_meta()
-    generator = rng()
-    o0 = load_int32_assignments()[0]
-
-    buffers = {
-        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01),
-        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01),
-        "v3": make_fp32(generator, meta.elem_counts["v3"], scale=0.01),
-        "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.01),
-        "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.01),
-        "v6": np.zeros(meta.elem_counts["v6"], dtype=meta.np_types["v6"]),
-    }
-
-    gate_acc = np.zeros((4, 64), dtype=np.float32)
-    up_acc = np.zeros((4, 64), dtype=np.float32)
-
-    for kb in range(40):
-        k0 = kb * 128
-        post_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=k0, rows=4, cols=128, row_stride=5120))
-        w_gate = bf16_to_float32(
-            load_strided_2d(buffers["v4"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600)
-        )
-        w_up = bf16_to_float32(
-            load_strided_2d(buffers["v5"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600)
-        )
-        gate_acc += post_chunk @ w_gate
-        up_acc += post_chunk @ w_up
-
-    sigmoid = np.reciprocal(1.0 + np.exp(-gate_acc))
-    mlp_chunk = gate_acc * sigmoid * up_acc
-    output = float32_to_bf16(mlp_chunk)
-
-    write_buffers(meta, buffers)
-    write_golden(meta, {"v6": output})
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
index 725a529d2..0a0172824 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
@@ -1,72 +1,90 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
   %c0i = arith.constant 0 : i64
-  %c16384 = arith.constant 16384 : i64
-  %c4 = arith.constant 4 : index
-  %c5120 = arith.constant 5120 : index
+  %c4096 = arith.constant 4096 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
   %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c25600 = arith.constant 25600 : index
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
-    %c128 = arith.constant 128 : index
-  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+      %c128 = arith.constant 128 : index
+  %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
   %qwen3_decode_layer_incore_14_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_14_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {
-    %0 = arith.muli %arg3, %c4 : index
-    %1 = arith.addi %0, %dob__ci_idx_v0 : index
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
+    %0 = arith.muli %arg5, %c8 : index
+    %1 = arith.addi %0, %ob__ci_idx_v0 : index
     %2 = arith.muli %1, %c1 : index
     %3 = arith.addi %c0, %2 : index
-    %4 = arith.muli %3, %c128 : index
-    %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
-    pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
-    pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_down_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    %4 = arith.muli %3, %c64 : index
+    scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
+      %5 = arith.muli %kb__idx_v0, %c128 : index
+      %a_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %attn_out__rv_v5_pview = pto.partition_view %attn_out__rv_v5_view, offsets = [%arg4, %5], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+      pto.tload ins(%attn_out__rv_v5_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%a_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+      pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%a_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%a_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    }
   }
   return
   }
-  func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c16384 = arith.constant 16384 : i64
-  %c4 = arith.constant 4 : index
-  %c5120 = arith.constant 5120 : index
+  func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c32768 = arith.constant 32768 : i64
+  %c40960 = arith.constant 40960 : i64
+  %c45056 = arith.constant 45056 : i64
+  %c36864 = arith.constant 36864 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
   %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c25600 = arith.constant 25600 : index
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
-    %c128 = arith.constant 128 : index
-  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 16384, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {
-    %0 = arith.muli %arg3, %c4 : index
-    %1 = arith.addi %0, %dob__ci_idx_v0 : index
-    %2 = arith.muli %1, %c1 : index
-    %3 = arith.addi %c0, %2 : index
-    %4 = arith.muli %3, %c128 : index
-    %down_prev__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x128xf32>
-    pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<4x128xf32>) outs(%down_prev__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %down_next__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tfree_from_aic {split = 0}
-    %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x128xf32>
-    pto.tstore ins(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.partition_tensor_view<4x128xf32>)
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %cst = arith.constant 0.000000e+00 : f32
+  %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 32768, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
+    %2 = arith.muli %arg5, %c8 : index
+    %3 = arith.addi %2, %ob__ci_idx_v0 : index
+    %4 = arith.muli %3, %c1 : index
+    %5 = arith.addi %c0, %4 : index
+    %6 = arith.muli %5, %c64 : index
+    %o_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texpands ins(%cst : f32) outs(%o_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
+      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %0 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%o_acc__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tfree_from_aic {split = 0}
+      %o_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tmov ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    }
+    %t__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %6], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+    pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %resid__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%o_acc__tile, %resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %resid1_tile__co_l1_iter_v4_pview = pto.partition_view %resid1_tile__co_l0_iter_v4_view, offsets = [%c0, %6], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid1_tile__co_l1_iter_v4_pview : !pto.partition_tensor_view<16x64xf32>)
   }
   return
   }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
deleted file mode 100644
index d913d746e..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/python3
-# Copyright (c) 2026 Huawei Technologies Co., Ltd.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-
-import numpy as np
-
-from validation_runtime import (
-    bf16_to_float32,
-    float32_to_bf16,
-    load_case_meta,
-    load_int32_assignments,
-    load_strided_2d,
-    rng,
-    store_strided_2d,
-    write_buffers,
-    write_golden,
-)
-
-
-def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
-    return generator.uniform(-scale, scale, size=count).astype(np.float32)
-
-
-def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray:
-    return float32_to_bf16(make_fp32(generator, count, scale=scale))
-
-
-def main():
-    meta = load_case_meta()
-    generator = rng()
-    dob, o0 = load_int32_assignments()[:2]
-
-    buffers = {
-        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01),
-        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01),
-        "v3": make_bf16(generator, meta.elem_counts["v3"], scale=0.01),
-    }
-
-    output = np.array(buffers["v1"], copy=True)
-    mlp_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=0, rows=4, cols=64, row_stride=64))
-
-    for dob_ci in range(4):
-        d0 = (dob * 4 + dob_ci) * 128
-        down_prev = load_strided_2d(output, offset=d0, rows=4, cols=128, row_stride=5120).astype(np.float32)
-        w_down = bf16_to_float32(
-            load_strided_2d(buffers["v3"], offset=o0 * 5120 + d0, rows=64, cols=128, row_stride=5120)
-        )
-        output = store_strided_2d(output, down_prev + mlp_chunk @ w_down, offset=d0, row_stride=5120)
-
-    write_buffers(meta, buffers)
-    write_golden(meta, {"v1": output})
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
new file mode 100644
index 000000000..a45c9a509
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
@@ -0,0 +1,47 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_15(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c64 = arith.constant 64 : i64
+  %c8256 = arith.constant 8256 : i64
+  %c16448 = arith.constant 16448 : i64
+  %c16512 = arith.constant 16512 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %5 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+  %cst_1 = arith.constant 1.220703e-04 : f32
+  %cst_2 = arith.constant 1.000000e-06 : f32
+  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %sq_sum__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%sq_sum__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %kb__idx_v0 = %c0 to %5 step %c1 {
+    %6 = arith.muli %kb__idx_v0, %c128 : index
+    %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %6], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %tmp_tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowsum ins(%t__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %2 = pto.alloc_tile addr = %c16512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%sq_sum__tile, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %sq_sum__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sq_sum__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  }
+  %3 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmuls ins(%sq_sum__tile, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadds ins(%3, %cst_2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.trsqrt ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x16xf32>
+  pto.tstore ins(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<1x16xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
new file mode 100644
index 000000000..f9fa660d1
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
@@ -0,0 +1,49 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_16(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c64 = arith.constant 64 : i64
+  %c8256 = arith.constant 8256 : i64
+  %c8768 = arith.constant 8768 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %2 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %cst = arith.constant 0.000000e+00 : f32
+  %down_proj_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %post_norm_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %post_rms_weight__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg4, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %inv_rms__ssa_v0_pview = pto.partition_view %inv_rms__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x16xf32>
+  pto.tload ins(%inv_rms__ssa_v0_pview : !pto.partition_tensor_view<1x16xf32>) outs(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %zi__idx_v0 = %c0 to %2 step %c1 {
+    %3 = arith.muli %zi__idx_v0, %c128 : index
+    %down_zero_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texpands ins(%cst : f32) outs(%down_zero_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %down_proj_tile__iter_v1_pview = pto.partition_view %down_proj_tile__ssa_v0_view, offsets = [%c0, %3], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tstore ins(%down_zero_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_proj_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>)
+  }
+  scf.for %kb__idx_v0 = %c0 to %2 step %c1 {
+    %4 = arith.muli %kb__idx_v0, %c128 : index
+    %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %gamma__tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %post_rms_weight__ssa_v0_pview = pto.partition_view %post_rms_weight__ssa_v0_view, offsets = [%c0, %4], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+    pto.tload ins(%post_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%x_chunk__tile, %t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %normed__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%0, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c8768 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %post_norm_tile__iter_v1_pview = pto.partition_view %post_norm_tile__ssa_v0_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%post_norm_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
new file mode 100644
index 000000000..ae6570c56
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
@@ -0,0 +1,104 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_17_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  %c0i = arith.constant 0 : i64
+  %c4096 = arith.constant 4096 : i64
+  %c20480 = arith.constant 20480 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c25600 = arith.constant 25600 : index
+  %c64 = arith.constant 64 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_17_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_17_aiv} -> i32
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
+    %1 = arith.muli %kb__idx_v0, %c128 : index
+    %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%post_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wg__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wu__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_17_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c32768 = arith.constant 32768 : i64
+  %c36864 = arith.constant 36864 : i64
+  %c45056 = arith.constant 45056 : i64
+  %c49152 = arith.constant 49152 : i64
+  %c40960 = arith.constant 40960 : i64
+  %c53248 = arith.constant 53248 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c25600 = arith.constant 25600 : index
+  %c64 = arith.constant 64 : index
+  %c0_i32 = arith.constant 0 : i32
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %cst_1 = arith.constant 1.000000e+00 : f32
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_17_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", size = 32768, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
+  %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%gate_acc__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
+    %1 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %2 = pto.alloc_tile addr = %c49152 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%up_acc__tile, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
+    %gate_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gate_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%up_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  }
+  %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tneg ins(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %3 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %4 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadds ins(%3, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.trecip ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %5 = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmul ins(%gate_acc__tile, %sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmul ins(%5, %up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c53248 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcvt ins(%mlp_chunk__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+  pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
new file mode 100644
index 000000000..3228a9f80
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
@@ -0,0 +1,75 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_18_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  %c0i = arith.constant 0 : i64
+  %c16384 = arith.constant 16384 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_18_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_18_aiv} -> i32
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index
+    %2 = arith.muli %1, %c1 : index
+    %3 = arith.addi %c0, %2 : index
+    %4 = arith.muli %3, %c128 : index
+    %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
+    pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+    pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_down_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_18_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c65536 = arith.constant 65536 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_18_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", size = 65536, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index
+    %2 = arith.muli %1, %c1 : index
+    %3 = arith.addi %c0, %2 : index
+    %4 = arith.muli %3, %c128 : index
+    %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
+    %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tstore ins(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.partition_tensor_view<16x128xf32>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
new file mode 100644
index 000000000..776c7aed2
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
@@ -0,0 +1,36 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_19(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c8192 = arith.constant 8192 : i64
+  %c16384 = arith.constant 16384 : i64
+  %c16 = arith.constant 16 : index
+  %2 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %out__co_l0_iter_v3_view = pto.make_tensor_view %arg1, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {
+    %3 = arith.muli %arg4, %c4 : index
+    %4 = arith.addi %3, %ob__ci_idx_v0 : index
+    %5 = arith.muli %4, %c1 : index
+    %6 = arith.addi %c0, %5 : index
+    %7 = arith.muli %6, %c128 : index
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_proj_tile__rv_v5_pview = pto.partition_view %down_proj_tile__rv_v5_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%down_proj_tile__rv_v5_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %down_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%down_acc__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %out__co_l1_iter_v3_pview = pto.partition_view %out__co_l0_iter_v3_view, offsets = [%arg3, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%out__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x128xbf16>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
deleted file mode 100644
index 3a7d64a75..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/python3
-# Copyright (c) 2026 Huawei Technologies Co., Ltd.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-
-import numpy as np
-
-from validation_runtime import (
-    bf16_to_float32,
-    float32_to_bf16,
-    load_case_meta,
-    load_int32_assignments,
-    load_strided_2d,
-    rng,
-    store_strided_2d,
-    write_buffers,
-    write_golden,
-)
-
-
-def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray:
-    if positive:
-        return generator.uniform(0.5, 1.5, size=count).astype(np.float32)
-    return generator.uniform(-scale, scale, size=count).astype(np.float32)
-
-
-def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
-    return float32_to_bf16(make_fp32(generator, count, scale=scale))
-
-
-def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray:
-    return bf16_to_float32(float32_to_bf16(values))
-
-
-def main():
-    meta = load_case_meta()
-    generator = rng()
-    b0, ob = load_int32_assignments()[:2]
-
-    buffers = {
-        "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05),
-        "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True),
-        "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True),
-        "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]),
-        "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.05),
-    }
-
-    inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1)
-    output = np.zeros_like(buffers["v4"])
-
-    for ob_ci in range(4):
-        q0 = (ob * 4 + ob_ci) * 64
-        acc = np.zeros((4, 64), dtype=np.float32)
-        for kb in range(40):
-            k0 = kb * 128
-            x_chunk = bf16_to_float32(
-                load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
-            )
-            gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32)
-            normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma)
-            w_chunk = bf16_to_float32(
-                load_strided_2d(buffers["v5"], offset=k0 * 5120 + q0, rows=128, cols=64, row_stride=5120)
-            )
-            acc += normed @ w_chunk
-        output = store_strided_2d(output, float32_to_bf16(acc), offset=b0 * 5120 + q0, row_stride=5120)
-
-    write_buffers(meta, buffers)
-    write_golden(meta, {"v4": output})
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
index dc6456847..9fbf4425d 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
@@ -1,147 +1,67 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_2_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: !pto.ptr<bf16>, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c4096 = arith.constant 4096 : i64
-  %c20480 = arith.constant 20480 : i64
+  func.func @qwen3_decode_layer_incore_2(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
   %c0i = arith.constant 0 : i64
+  %c64 = arith.constant 64 : i64
+  %c4160 = arith.constant 4160 : i64
+  %c12352 = arith.constant 12352 : i64
+  %c20544 = arith.constant 20544 : i64
+  %c20608 = arith.constant 20608 : i64
+  %c20672 = arith.constant 20672 : i64
   %c16 = arith.constant 16 : index
-  %c5120 = arith.constant 5120 : index
+  %c8192 = arith.constant 8192 : index
   %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %c1024 = arith.constant 1024 : index
+  %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
-  %c8 = arith.constant 8 : index
-    %c64 = arith.constant 64 : index
-    %c40 = arith.constant 40 : index
-      %c128 = arith.constant 128 : index
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_2_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", size = 4096, location = #pto.address_space<mat>, auto = false, base = 0} -> i32
-  %qwen3_decode_layer_incore_2_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer : i32)
-  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
-    %1 = arith.muli %arg8, %c8 : index
-    %2 = arith.addi %1, %ob__ci_idx_v0 : index
-    %3 = arith.muli %2, %c1 : index
-    %4 = arith.addi %c0, %3 : index
-    %5 = arith.muli %4, %c64 : index
-    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-      %6 = arith.muli %kb__idx_v0, %c128 : index
-      %wk_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-      pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wk_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      %wv_chunk__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-      pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wv_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      %normed_bf16__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %normed_bf16__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      pto.tmov ins(%normed_bf16__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%normed_bf16__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      pto.tfree_from_aiv {split = 0}
-      %wk_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-      pto.tmov ins(%wk_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wk_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-      pto.tmatmul ins(%normed_bf16__tile_Left, %wk_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-      %wv_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-      pto.tmov ins(%wv_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wv_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-      %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-      pto.tmatmul ins(%normed_bf16__tile_Left, %wv_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-      pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-    }
+  %10 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+  %cst_1 = arith.constant 1.220703e-04 : f32
+  %cst_2 = arith.constant 1.000000e-06 : f32
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %normed_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %partial_sq__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%partial_sq__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %kb__idx_v0 = %c0 to %10 step %c1 {
+    %11 = arith.muli %kb__idx_v0, %c128 : index
+    %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %11], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %x_chunk__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %tmp_tile = pto.alloc_tile addr = %c12352 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %1 = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowsum ins(%0, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    %2 = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %3 = pto.alloc_tile addr = %c20608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%partial_sq__tile, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %partial_sq__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%partial_sq__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
   }
-  return
-  }
-  func.func @qwen3_decode_layer_incore_2_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: !pto.ptr<bf16>, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c4096 = arith.constant 4096 : i64
-  %c4128 = arith.constant 4128 : i64
-  %c5152 = arith.constant 5152 : i64
-  %c6176 = arith.constant 6176 : i64
-  %c7200 = arith.constant 7200 : i64
-  %c9248 = arith.constant 9248 : i64
-  %c9760 = arith.constant 9760 : i64
-  %c11808 = arith.constant 11808 : i64
-  %c12832 = arith.constant 12832 : i64
-  %c13856 = arith.constant 13856 : i64
-  %c16 = arith.constant 16 : index
-  %c5120 = arith.constant 5120 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %c1024 = arith.constant 1024 : index
-  %c0 = arith.constant 0 : index
-  %c8 = arith.constant 8 : index
-    %c64 = arith.constant 64 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %c40 = arith.constant 40 : index
-      %c128 = arith.constant 128 : index
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_2_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aic} -> i32
-  %qwen3_decode_layer_incore_2_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", size = 4096, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer_import : i32)
-  %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<4x1xf32>
-  pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
-    %8 = arith.muli %arg8, %c8 : index
-    %9 = arith.addi %8, %ob__ci_idx_v0 : index
-    %10 = arith.muli %9, %c1 : index
-    %11 = arith.addi %c0, %10 : index
-    %12 = arith.muli %11, %c64 : index
-    %k_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %v_acc__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmuls ins(%k_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmuls ins(%v_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
-      %13 = arith.muli %kb__idx_v0, %c128 : index
-      %t__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg7, %13], sizes = [%c4, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x128xbf16>
-      pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %x_chunk__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %gamma__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-      pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %2 = pto.alloc_tile addr = %c7200 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=1, v_row=4, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %normed__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tcolexpandmul ins(%2, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %normed_bf16__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %normed_bf16__tile_nz = pto.alloc_tile addr = %c9760 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      pto.tmov ins(%normed_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_bf16__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      pto.tpush_to_aic(%normed_bf16__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=128, v_row=4, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
-      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %3 = pto.alloc_tile addr = %c11808 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      pto.tfree_from_aic {split = 0}
-      %4 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %5 = pto.alloc_tile addr = %c12832 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tadd ins(%1, %4 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      pto.tfree_from_aic {split = 0}
-      %k_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      %v_acc__tile_mv = pto.alloc_tile addr = %c5152 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tmov ins(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    }
-    %6 = pto.alloc_tile addr = %c13856 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%0{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %k_proj__co_l1_iter_v3_pview = pto.partition_view %k_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
-    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>)
-    %7 = pto.alloc_tile addr = %c13856 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%1{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %v_proj__co_l1_iter_v3_pview = pto.partition_view %v_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<4x64xbf16>
-    pto.tstore ins(%7 : !pto.tile_buf<loc=vec, dtype=bf16, rows=4, cols=64, v_row=4, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>)
+  %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmuls ins(%partial_sq__tile, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %5 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadds ins(%4, %cst_2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %variance__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  scf.for %12 = %c0 to %10 step %c1 {
+    %13 = arith.muli %12, %c128 : index
+    %6 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %14 = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %13], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%14 : !pto.partition_tensor_view<16x128xbf16>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %7 = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%6{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %gamma__tile = pto.alloc_tile addr = %c20672 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+    pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %8 = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%7, %variance__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%8 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %normed__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%8, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%9 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %normed_tile__iter_v1_pview = pto.partition_view %normed_tile__ssa_v0_view, offsets = [%c0, %13], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tstore ins(%9 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>)
   }
   return
   }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
deleted file mode 100644
index 347ca7c0a..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/python3
-# Copyright (c) 2026 Huawei Technologies Co., Ltd.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-
-import numpy as np
-
-from validation_runtime import (
-    bf16_to_float32,
-    float32_to_bf16,
-    load_case_meta,
-    load_int32_assignments,
-    load_strided_2d,
-    rng,
-    store_strided_2d,
-    write_buffers,
-    write_golden,
-)
-
-
-def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray:
-    if positive:
-        return generator.uniform(0.5, 1.5, size=count).astype(np.float32)
-    return generator.uniform(-scale, scale, size=count).astype(np.float32)
-
-
-def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
-    return float32_to_bf16(make_fp32(generator, count, scale=scale))
-
-
-def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray:
-    return bf16_to_float32(float32_to_bf16(values))
-
-
-def main():
-    meta = load_case_meta()
-    generator = rng()
-    b0, ob = load_int32_assignments()[:2]
-
-    buffers = {
-        "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05),
-        "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True),
-        "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True),
-        "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]),
-        "v5": np.zeros(meta.elem_counts["v5"], dtype=meta.np_types["v5"]),
-        "v6": make_bf16(generator, meta.elem_counts["v6"], scale=0.05),
-        "v7": make_bf16(generator, meta.elem_counts["v7"], scale=0.05),
-    }
-
-    inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1)
-    k_proj = np.zeros_like(buffers["v4"])
-    v_proj = np.zeros_like(buffers["v5"])
-
-    for ob_ci in range(8):
-        kv0 = (ob * 8 + ob_ci) * 64
-        k_acc = np.zeros((4, 64), dtype=np.float32)
-        v_acc = np.zeros((4, 64), dtype=np.float32)
-        for kb in range(40):
-            k0 = kb * 128
-            x_chunk = bf16_to_float32(
-                load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120)
-            )
-            gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32)
-            normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma)
-            wk_chunk = bf16_to_float32(
-                load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024)
-            )
-            wv_chunk = bf16_to_float32(
-                load_strided_2d(buffers["v7"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024)
-            )
-            k_acc += normed @ wk_chunk
-            v_acc += normed @ wv_chunk
-        k_proj = store_strided_2d(k_proj, float32_to_bf16(k_acc), offset=b0 * 1024 + kv0, row_stride=1024)
-        v_proj = store_strided_2d(v_proj, float32_to_bf16(v_acc), offset=b0 * 1024 + kv0, row_stride=1024)
-
-    write_buffers(meta, buffers)
-    write_golden(meta, {"v4": k_proj, "v5": v_proj})
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
new file mode 100644
index 000000000..f8bccc6ac
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
@@ -0,0 +1,45 @@
+module attributes {pto.target_arch = "a5"} {  // Module holding one kernel; target arch attribute is "a5".
+  func.func @qwen3_decode_layer_incore_3(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // Cube-kind kernel. %arg0 backs the normed_tile view (bf16), %arg1 the q_proj view (f32), %arg2 the wq view (bf16); %arg3/%arg4 are used below as row/column offsets. NOTE(review): presumably produces one 16x64 tile of normed_tile x wq summed over 64 blocks of 128 -- confirm against the pto op semantics.
+  %c0i = arith.constant 0 : i64  // i64 constants are only used as alloc_tile addresses.
+  %c4096 = arith.constant 4096 : i64  // Address of the second loc=mat tile (the first one is 16*128 bf16 elements).
+  %c16 = arith.constant 16 : index  // index constants below serve as shapes, strides, offsets and loop bounds.
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16 x 8192 bf16 view, row stride 8192.
+  %q_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 16 x 8192 f32 view, row stride 8192 (store target).
+  %wq__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 8192 x 8192 bf16 view, row stride 8192.
+  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>  // Block 0: 16x128 window at offset [0, 0].
+  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_b__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>  // Block 0: 128x64 window at offset [0, %arg4].
+  pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // loc=mat -> loc=left, same shape and layout attributes.
+  %tile_b__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_b__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_b__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // loc=mat -> loc=right; blayout/slayout differ between source and destination types.
+  %q_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // 16x64 f32 tile with loc=acc at address %c0i.
+  pto.tmatmul ins(%tile_a__tile_Left, %tile_b__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%q_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // Block 0 product; takes no accumulator input, unlike pto.tmatmul.acc in the loop.
+  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 {  // kb = 1..63 (upper bound is exclusive); block 0 was issued above.
+    %1 = arith.muli %kb__idx_v0, %c128 : index  // %1 = kb * 128, used as column offset into normed_tile and row offset into wq.
+    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // Same address and type as %tile_a__tile.
+    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_b_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // Same address and type as %tile_b__tile.
+    %3 = pto.partition_view %wq__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_b_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_b_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_b_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // Same address (%c0i) and type as %q_acc__tile; NOTE(review): presumably aliases that accumulator -- confirm.
+    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_b_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // %0 is both the accumulator input and the output.
+  }
+  %q_proj__iter_v6_pview = pto.partition_view %q_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>  // 16x64 output window at [%arg3, %arg4].
+  pto.tstore ins(%q_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%q_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>)  // Stores %q_acc__tile (not %0) into the q_proj window.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
new file mode 100644
index 000000000..9a2756f1c
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
@@ -0,0 +1,46 @@
+module attributes {pto.target_arch = "a5"} {  // Module holding one kernel; target arch attribute is "a5".
+  func.func @qwen3_decode_layer_incore_4(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // Cube-kind kernel. %arg0 backs the k_proj view (f32), %arg1 the normed_tile view (bf16), %arg2 the wk view (bf16); %arg3/%arg4 are used below as row/column offsets. NOTE(review): presumably produces one 16x64 tile of normed_tile x wk summed over 64 blocks of 128 -- confirm against the pto op semantics.
+  %c0i = arith.constant 0 : i64  // i64 constants are only used as alloc_tile addresses.
+  %c4096 = arith.constant 4096 : i64  // Address of the second loc=mat tile (the first one is 16*128 bf16 elements).
+  %c16 = arith.constant 16 : index  // index constants below serve as shapes, strides, offsets and loop bounds.
+  %c1024 = arith.constant 1024 : index
+  %c1 = arith.constant 1 : index
+  %c8192 = arith.constant 8192 : index
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %k_proj__iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 16 x 1024 f32 view, row stride 1024 (store target).
+  %normed_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16 x 8192 bf16 view, row stride 8192.
+  %wk__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 8192 x 1024 bf16 view, row stride 1024.
+  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>  // Block 0: 16x128 window at offset [0, 0].
+  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_wk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>  // Block 0: 128x64 window at offset [0, %arg4].
+  pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // loc=mat -> loc=left, same shape and layout attributes.
+  %tile_wk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_wk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // loc=mat -> loc=right; blayout/slayout differ between source and destination types.
+  %k_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // 16x64 f32 tile with loc=acc at address %c0i.
+  pto.tmatmul ins(%tile_a__tile_Left, %tile_wk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // Block 0 product; takes no accumulator input, unlike pto.tmatmul.acc in the loop.
+  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 {  // kb = 1..63 (upper bound is exclusive); block 0 was issued above.
+    %1 = arith.muli %kb__idx_v0, %c128 : index  // %1 = kb * 128, used as column offset into normed_tile and row offset into wk.
+    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // Same address and type as %tile_a__tile.
+    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wk_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // Same address and type as %tile_wk__tile.
+    %3 = pto.partition_view %wk__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wk_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_wk_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wk_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // Same address (%c0i) and type as %k_acc__tile; NOTE(review): presumably aliases that accumulator -- confirm.
+    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wk_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // %0 is both the accumulator input and the output.
+  }
+  %k_proj__iter_v6_pview = pto.partition_view %k_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>  // 16x64 output window at [%arg3, %arg4].
+  pto.tstore ins(%k_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%k_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>)  // Stores %k_acc__tile (not %0) into the k_proj window.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
new file mode 100644
index 000000000..db88c9a68
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
@@ -0,0 +1,46 @@
+module attributes {pto.target_arch = "a5"} {  // Module holding one kernel; target arch attribute is "a5".
+  func.func @qwen3_decode_layer_incore_5(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // Cube-kind kernel. %arg0 backs the normed_tile view (bf16), %arg1 the v_proj view (f32), %arg2 the wv view (bf16); %arg3/%arg4 are used below as row/column offsets. NOTE(review): presumably produces one 16x64 tile of normed_tile x wv summed over 64 blocks of 128 -- confirm against the pto op semantics.
+  %c0i = arith.constant 0 : i64  // i64 constants are only used as alloc_tile addresses.
+  %c4096 = arith.constant 4096 : i64  // Address of the second loc=mat tile (the first one is 16*128 bf16 elements).
+  %c16 = arith.constant 16 : index  // index constants below serve as shapes, strides, offsets and loop bounds.
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16 x 8192 bf16 view, row stride 8192.
+  %v_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 16 x 1024 f32 view, row stride 1024 (store target).
+  %wv__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 8192 x 1024 bf16 view, row stride 1024.
+  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>  // Block 0: 16x128 window at offset [0, 0].
+  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_wv__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>  // Block 0: 128x64 window at offset [0, %arg4].
+  pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // loc=mat -> loc=left, same shape and layout attributes.
+  %tile_wv__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_wv__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wv__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // loc=mat -> loc=right; blayout/slayout differ between source and destination types.
+  %v_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // 16x64 f32 tile with loc=acc at address %c0i.
+  pto.tmatmul ins(%tile_a__tile_Left, %tile_wv__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%v_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // Block 0 product; takes no accumulator input, unlike pto.tmatmul.acc in the loop.
+  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 {  // kb = 1..63 (upper bound is exclusive); block 0 was issued above.
+    %1 = arith.muli %kb__idx_v0, %c128 : index  // %1 = kb * 128, used as column offset into normed_tile and row offset into wv.
+    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // Same address and type as %tile_a__tile.
+    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wv_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // Same address and type as %tile_wv__tile.
+    %3 = pto.partition_view %wv__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wv_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_wv_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wv_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // Same address (%c0i) and type as %v_acc__tile; NOTE(review): presumably aliases that accumulator -- confirm.
+    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wv_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // %0 is both the accumulator input and the output.
+  }
+  %v_proj__iter_v6_pview = pto.partition_view %v_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>  // 16x64 output window at [%arg3, %arg4].
+  pto.tstore ins(%v_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%v_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>)  // Stores %v_acc__tile (not %0) into the v_proj window.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
new file mode 100644
index 000000000..4443956bc
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
@@ -0,0 +1,88 @@
+module attributes {pto.target_arch = "a5"} { // Module tagged for target arch "a5"; contains a single vector kernel.
+  func.func @qwen3_decode_layer_incore_6(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<bf16>, %arg7: !pto.ptr<f32>, %arg8: index, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // For row %arg8 of k_proj/v_proj: combines each of 8 K slices with the cos/sin rows, converts to bf16 and stores K and V into the caches (presumably rotary embedding + KV-cache write -- confirm with the generator).
+  %c0i = arith.constant 0 : i64 // The i64 constants below are only used as alloc_tile addresses (presumably byte offsets -- confirm).
+  %c256 = arith.constant 256 : i64
+  %c512 = arith.constant 512 : i64
+  %c768 = arith.constant 768 : i64
+  %c1024 = arith.constant 1024 : i64
+  %c1280 = arith.constant 1280 : i64
+  %c1536 = arith.constant 1536 : i64
+  %c1792 = arith.constant 1792 : i64
+  %c2048 = arith.constant 2048 : i64
+  %c2176 = arith.constant 2176 : i64
+  %c2688 = arith.constant 2688 : i64
+  %c1 = arith.constant 1 : index // The index constants below are shapes, strides, offsets and loop bounds.
+  %c64 = arith.constant 64 : index
+  %c524288 = arith.constant 524288 : index
+  %c128 = arith.constant 128 : index
+  %c16 = arith.constant 16 : index
+  %7 = arith.constant 1024 : index // Column count and row stride of the k_proj / v_proj views (16 x 1024).
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c4096 = arith.constant 4096 : index // Multiplier used when computing the cache row index %15 inside the loop.
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg0: 1 x 64 f32.
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg1: 1 x 64 f32.
+  %k_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg2: 524288 x 128 bf16, written by this kernel.
+  %k_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg3: 16 x 1024 f32, read only.
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg4: 1 x 64 f32.
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg5: 1 x 64 f32.
+  %v_cache__iter_v1_view = pto.make_tensor_view %arg6, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg6: 524288 x 128 bf16, written by this kernel.
+  %v_proj__rv_v5_view = pto.make_tensor_view %arg7, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg7: 16 x 1024 f32, read only.
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // The four cos/sin rows are loaded once here, before the loop, and reused by every iteration.
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { // 8 iterations; iteration ki handles columns [ki*128, ki*128+128) of row %arg8. Several tiles below share addresses, so statement order matters.
+    %8 = arith.muli %ki__idx_v0, %c128 : index // Column offset of this 128-wide slice.
+    %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %k_proj__rv_v5_pview = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %8], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%k_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // k_lo = first 64 columns of the slice.
+    %k_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %10 = arith.addi %8, %c64 : index // Column offset of the second 64-column half.
+    %9 = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %10], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%9 : !pto.partition_tensor_view<1x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // k_hi = last 64 columns of the slice.
+    %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // t = k_lo * cos_lo (both operands are 1x64, so presumably a plain elementwise product here).
+    %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %0 = k_hi * sin_lo.
+    %rot_lo__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %t__tile.
+    pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // rot_lo = k_lo*cos_lo - k_hi*sin_lo.
+    %1 = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %k_hi__tile; k_hi was already consumed for %0 above.
+    pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %1 = k_hi * cos_hi.
+    %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %k_lo__tile; k_lo was already consumed for t above.
+    pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %2 = k_lo * sin_hi.
+    %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %2.
+    pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // rot_hi = k_hi*cos_hi + k_lo*sin_hi.
+    %11 = arith.muli %arg8, %c8 : index // %11..%15 compute the cache row: (%arg8*8 + ki)*4096 + %arg9.
+    %12 = arith.muli %11, %c4096 : index
+    %13 = arith.muli %ki__idx_v0, %c4096 : index
+    %14 = arith.addi %12, %13 : index
+    %15 = arith.addi %14, %arg9 : index // Row index used for both k_cache and v_cache stores.
+    %3 = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%rot_lo__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 -> bf16 with round mode ROUND.
+    %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x64xbf16>) // k_cache[%15, 0:64] = bf16(rot_lo).
+    %4 = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %3, reused after the store above.
+    pto.tcvt ins(%rot_hi__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %k_cache__tile_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c64], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__tile_pview : !pto.partition_tensor_view<1x64xbf16>) // k_cache[%15, 64:128] = bf16(rot_hi).
+    %5 = pto.alloc_tile addr = %c2176 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %17 = arith.muli %ki__idx_v0, %c128 : index // Same value as %8 (recomputed).
+    %v_proj__rv_v5_pview = pto.partition_view %v_proj__rv_v5_view, offsets = [%arg8, %17], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+    pto.tload ins(%v_proj__rv_v5_pview : !pto.partition_tensor_view<1x128xf32>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // V slice is loaded as one 1x128 row; no cos/sin is applied to V.
+    %6 = pto.alloc_tile addr = %c2688 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%5{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
+    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) // v_cache[%15, 0:128] = bf16(v slice).
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
new file mode 100644
index 000000000..2f80eb162
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
@@ -0,0 +1,92 @@
+module attributes {pto.target_arch = "a5"} { // Module tagged for target arch "a5"; contains a single vector kernel.
+  func.func @qwen3_decode_layer_incore_7(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<f32>, %arg7: !pto.ptr<f32>, %arg8: !pto.ptr<f32>, %arg9: index, %arg10: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // Combines 8 Q slices of q_proj row %arg9 with the cos/sin rows, writes them as bf16 into rows 0..7 of q_padded, then stores three zero-filled outputs (ret0, ret1, ret2). Presumably rotary embedding plus accumulator initialisation -- confirm with the generator.
+  %c0i = arith.constant 0 : i64 // The i64 constants below are only used as alloc_tile addresses (presumably byte offsets -- confirm).
+  %c256 = arith.constant 256 : i64
+  %c512 = arith.constant 512 : i64
+  %c768 = arith.constant 768 : i64
+  %c1024 = arith.constant 1024 : i64
+  %c1280 = arith.constant 1280 : i64
+  %c1536 = arith.constant 1536 : i64
+  %c1792 = arith.constant 1792 : i64
+  %c2048 = arith.constant 2048 : i64
+  %c2176 = arith.constant 2176 : i64
+  %c2304 = arith.constant 2304 : i64
+  %c6400 = arith.constant 6400 : i64
+  %c6432 = arith.constant 6432 : i64
+  %c1 = arith.constant 1 : index // The index constants below are shapes, strides, offsets and loop bounds.
+  %c64 = arith.constant 64 : index
+  %c16 = arith.constant 16 : index
+  %c128 = arith.constant 128 : index
+  %c8192 = arith.constant 8192 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32 // Fill value for the three texpands ops after the loop.
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg0: 1 x 64 f32.
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg1: 1 x 64 f32.
+  %q_padded__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg2: 16 x 128 bf16; this kernel writes rows 0..7 only.
+  %q_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg3: 16 x 8192 f32, read only.
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg4: 1 x 64 f32.
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg5: 1 x 64 f32.
+  %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32> // %arg6: 8 x 1 f32, dn layout (stride 1 along dim 0, stride 8 along dim 1).
+  %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32> // %arg7: 8 x 1 f32, same layout as ret0.
+  %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg8: 8 x 128 f32.
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // The four cos/sin rows are loaded once here, before the loop, and reused by every iteration.
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %qi__idx_v0 = %c0 to %c8 step %c1 { // 8 iterations; iteration qi reads one 128-column slice of q_proj row %arg9 and writes row qi of q_padded. Several tiles below share addresses, so statement order matters.
+    %5 = arith.addi %arg10, %qi__idx_v0 : index // Slice number = %arg10 + qi.
+    %6 = arith.muli %5, %c128 : index // Column offset of this 128-wide slice.
+    %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %q_proj__rv_v5_pview = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %6], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%q_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%q_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // q_lo = first 64 columns of the slice.
+    %q_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %8 = arith.addi %6, %c64 : index // Column offset of the second 64-column half.
+    %7 = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %8], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%7 : !pto.partition_tensor_view<1x64xf32>) outs(%q_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // q_hi = last 64 columns of the slice.
+    %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // t = q_lo * cos_lo (both operands are 1x64, so presumably a plain elementwise product here).
+    %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %0 = q_hi * sin_lo.
+    %1 = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %t__tile.
+    pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %1 = q_lo*cos_lo - q_hi*sin_lo.
+    %rot_lo_bf16__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%1{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 -> bf16 with round mode ROUND.
+    %2 = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %q_hi__tile; q_hi was already consumed for %0 above.
+    pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %2 = q_hi * cos_hi.
+    %3 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %q_lo__tile; q_lo was already consumed for t above.
+    pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %3 = q_lo * sin_hi.
+    %4 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // Same address as %3.
+    pto.tadd ins(%2, %3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %4 = q_hi*cos_hi + q_lo*sin_hi.
+    %rot_hi_bf16__tile = pto.alloc_tile addr = %c2176 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%4{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %q_padded__iter_v1_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__iter_v1_pview : !pto.partition_tensor_view<1x64xbf16>) // q_padded[qi, 0:64] = bf16 low half.
+    %q_padded__tile_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c64], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__tile_pview : !pto.partition_tensor_view<1x64xbf16>) // q_padded[qi, 64:128] = bf16 high half.
+  }
+  %oi__tile = pto.alloc_tile addr = %c2304 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // After the loop: build three tiles filled with %cst (0.0) and store them to ret2/ret0/ret1.
+  pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // oi (8x128) filled with %cst.
+  %li_flat__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // li_flat (1x8) filled with %cst.
+  %li__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0> // Same address as %li_flat__tile, declared as 8x1 col_major; presumably a reshaped view of the same 8 values -- confirm.
+  %mi_flat__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // mi_flat (1x8) filled with %cst.
+  %mi__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0> // Same address as %mi_flat__tile, declared as 8x1 col_major.
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) // ret0 = li.
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) // ret1 = mi.
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tstore ins(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) // ret2 = oi.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
new file mode 100644
index 000000000..53988ea99
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} { // Module tagged for target arch "a5"; contains a single cube kernel.
+  func.func @qwen3_decode_layer_incore_8(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // Multiplies the 16x128 bf16 q_padded tile by a 128x64 bf16 tile taken from k_cache at column offset %arg3 and stores the 16x64 f32 result to ret0.
+  %c0i = arith.constant 0 : i64 // The i64 constants are only used as alloc_tile addresses (presumably byte offsets -- confirm).
+  %c16384 = arith.constant 16384 : i64
+  %c524288 = arith.constant 524288 : index // The index constants below are shapes, strides and offsets.
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xbf16> // %arg0 viewed as 128 x 524288 with strides [1, 128]: element (i, j) sits at offset i + 128*j.
+  %q_padded__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg1: 16 x 128 bf16.
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg2: 16 x 64 f32 output.
+  %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16> // All 128 rows, 64 columns starting at column %arg3.
+  pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0> // Address 16384 = 128*64*2; presumably placed right after the bf16 K tile -- confirm.
+  %q_padded__rv_v2_pview = pto.partition_view %q_padded__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+  pto.tload ins(%q_padded__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0> // The left, right and acc tiles all use address 0 but differ in loc; presumably separate memories -- confirm.
+  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) // Move the Q tile from loc=mat to loc=left.
+  %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) // Move the K tile from loc=mat to loc=right.
+  %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // (16x128 bf16) x (128x64 bf16) -> 16x64 f32 in loc=acc.
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>) // Store the full 16x64 result to ret0.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto
new file mode 100644
index 000000000..eb677daf6
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto
@@ -0,0 +1,49 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_9(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c2048 = arith.constant 2048 : i64
+  %c4096 = arith.constant 4096 : i64
+  %c8192 = arith.constant 8192 : i64
+  %c8224 = arith.constant 8224 : i64
+  %c9248 = arith.constant 9248 : i64
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 8.838835e-02 : f32
+  %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret1__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+  pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%scores_padded__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, f32) outs(%scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tcvt ins(%exp_scores__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_fp32__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
+  pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  return
+  }
+}

From fa55d253f0ff7899ba4f44eda6512dd8ad6453cf Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 16:51:50 +0800
Subject: [PATCH 09/16] test: restore qwen3 tilelet M16 goldens

---
 test/samples/Qwen3Tilelet/README.md           |  12 +-
 .../qwen3_decode_layer_incore_0.pto           |  23 ---
 .../qwen3_decode_layer_incore_1.pto           | 123 +++++++++--
 .../qwen3_decode_layer_incore_10.pto          | 118 +++++++++--
 .../qwen3_decode_layer_incore_10_golden.py    |  69 +++++++
 .../qwen3_decode_layer_incore_11.pto          | 118 -----------
 .../qwen3_decode_layer_incore_12.pto          |  31 ---
 .../qwen3_decode_layer_incore_13.pto          | 119 +++++++++--
 .../qwen3_decode_layer_incore_13_golden.py    |  73 +++++++
 .../qwen3_decode_layer_incore_14.pto          | 126 +++++-------
 .../qwen3_decode_layer_incore_14_golden.py    |  61 ++++++
 .../qwen3_decode_layer_incore_15.pto          |  47 -----
 .../qwen3_decode_layer_incore_16.pto          |  49 -----
 .../qwen3_decode_layer_incore_17.pto          | 104 ----------
 .../qwen3_decode_layer_incore_18.pto          |  75 -------
 .../qwen3_decode_layer_incore_19.pto          |  36 ----
 .../qwen3_decode_layer_incore_1_golden.py     |  77 +++++++
 .../qwen3_decode_layer_incore_2.pto           | 194 ++++++++++++------
 .../qwen3_decode_layer_incore_2_golden.py     |  86 ++++++++
 .../qwen3_decode_layer_incore_3.pto           |  45 ----
 .../qwen3_decode_layer_incore_4.pto           |  46 -----
 .../qwen3_decode_layer_incore_5.pto           |  46 -----
 .../qwen3_decode_layer_incore_6.pto           |  88 --------
 .../qwen3_decode_layer_incore_7.pto           |  92 ---------
 .../qwen3_decode_layer_incore_8.pto           |  30 ---
 .../qwen3_decode_layer_incore_9.pto           |  49 -----
 26 files changed, 877 insertions(+), 1060 deletions(-)
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
 delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto

diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md
index 4f78ed37f..010e75623 100644
--- a/test/samples/Qwen3Tilelet/README.md
+++ b/test/samples/Qwen3Tilelet/README.md
@@ -1,10 +1,14 @@
 Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_tilelet.py`.
 
 Scope:
-- direct `ptoas` compile-regression inputs
+- compile-regression inputs for `ptoas`
 - A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS`
 
 Notes:
-- The current tilelet lowering emits 20 kernel fragments (`aiv`, `aic`, and mixed-kernel `.pto` files). This directory vendors those emitted `.pto` inputs directly, flattened into one sample directory for `runop.sh`.
-- These files are regenerated from the tilelet example with `BATCH_TILE=16` / M=16 lowering.
-- The directory is compile-regression focused; stale custom NPU-validation goldens for the old M=4 split are intentionally dropped here.
+- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs:
+  `qwen3_decode_layer_incore_1`, `qwen3_decode_layer_incore_2`,
+  `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`,
+  `qwen3_decode_layer_incore_14`.
+- This sample directory vendors only those direct `ptoas` regression inputs, regenerated from the tilelet source with `BATCH_TILE=16`.
+- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation.
+- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `<case>_golden.py`.
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
deleted file mode 100644
index 856f60659..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
+++ /dev/null
@@ -1,23 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_0(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %c64 = arith.constant 64 : index
-  %c0 = arith.constant 0 : index
-  %attn_out__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %q_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %zero_q__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%zero_q__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %zero_attn__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcvt ins(%zero_q__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%zero_attn__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %q_proj__iter_v1_pview = pto.partition_view %q_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%zero_q__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
-  %attn_out__iter_v1_pview = pto.partition_view %attn_out__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
-  pto.tstore ins(%zero_attn__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_out__iter_v1_pview : !pto.partition_tensor_view<16x64xbf16>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
index 2d0902b60..591063f0d 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto
@@ -1,23 +1,116 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_1(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @qwen3_decode_layer_incore_1_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  %c16384 = arith.constant 16384 : i64
   %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
   %c16 = arith.constant 16 : index
-  %c1024 = arith.constant 1024 : index
+  %c5120 = arith.constant 5120 : index
   %c1 = arith.constant 1 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %c64 = arith.constant 64 : index
   %c0 = arith.constant 0 : index
-  %k_proj__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %v_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %zero_k__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%zero_k__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %zero_v__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%zero_v__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %k_proj__iter_v1_pview = pto.partition_view %k_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%zero_k__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
-  %v_proj__iter_v1_pview = pto.partition_view %v_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%zero_v__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
+  %c4 = arith.constant 4 : index
+    %c64 = arith.constant 64 : index
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_1_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", size = 16384, location = #pto.address_space<mat>, auto = false, base = 0} -> i32
+  %qwen3_decode_layer_incore_1_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aiv} -> i32
+  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer : i32)
+  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {
+    %0 = arith.muli %arg6, %c4 : index
+    %1 = arith.addi %0, %ob__ci_idx_v0 : index
+    %2 = arith.muli %1, %c1 : index
+    %3 = arith.addi %c0, %2 : index
+    %4 = arith.muli %3, %c64 : index
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
+      %5 = arith.muli %kb__idx_v0, %c128 : index
+      %wq_chunk__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+      pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wq_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %t__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %t__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%t__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%t__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tfree_from_aiv {split = 0}
+      %wq_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%wq_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wq_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%t__tile_Left, %wq_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    }
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_1_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c16384 = arith.constant 16384 : i64
+  %c16448 = arith.constant 16448 : i64
+  %c20544 = arith.constant 20544 : i64
+  %c24640 = arith.constant 24640 : i64
+  %c32832 = arith.constant 32832 : i64
+  %c33344 = arith.constant 33344 : i64
+  %c41536 = arith.constant 41536 : i64
+  %c45632 = arith.constant 45632 : i64
+  %c16 = arith.constant 16 : index
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+    %c64 = arith.constant 64 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_1_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aic} -> i32
+  %qwen3_decode_layer_incore_1_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", size = 16384, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
+  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer_import : i32)
+  %inv_rms_tile__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x1xf32>
+  pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {
+    %5 = arith.muli %arg6, %c4 : index
+    %6 = arith.addi %5, %ob__ci_idx_v0 : index
+    %7 = arith.muli %6, %c1 : index
+    %8 = arith.addi %c0, %7 : index
+    %9 = arith.muli %8, %c64 : index
+    %q_acc__tile = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmuls ins(%q_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
+      %10 = arith.muli %kb__idx_v0, %c128 : index
+      %t__tile = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg5, %10], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+      pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %x_chunk__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %gamma__tile = pto.alloc_tile addr = %c32832 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %10], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+      pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %1 = pto.alloc_tile addr = %c24640 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %normed__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcolexpandmul ins(%1, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %2 = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %t__tile_nz = pto.alloc_tile addr = %c33344 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tpush_to_aic(%t__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
+      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %3 = pto.alloc_tile addr = %c41536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tfree_from_aic {split = 0}
+      %q_acc__tile_mv = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    }
+    %4 = pto.alloc_tile addr = %c45632 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%0{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %q_proj__co_l1_iter_v3_pview = pto.partition_view %q_proj__co_l0_iter_v3_view, offsets = [%arg5, %9], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+    pto.tstore ins(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x64xbf16>)
+  }
   return
   }
 }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
index bc49f96e4..9e7c5e51a 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto
@@ -1,30 +1,106 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_10(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  func.func @qwen3_decode_layer_incore_10_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // Cube half of the split kernel: per (ob, kb) step, matmul a 16x128 bf16 tile obtained via pto.tpop_from_aiv with a 128x64 tile of %arg3 and push the 16x64 f32 product back via pto.tpush_to_aiv.
+  %c16384 = arith.constant 16384 : i64
   %c0i = arith.constant 0 : i64
+  %c16 = arith.constant 16 : index
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // this view and the two on %arg1 / %arg2 below are not referenced again in this function; only the %arg3 view is used
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // 5120x5120 bf16 matrix, row stride 5120, unit column stride
+  %qwen3_decode_layer_incore_10_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", size = 16384, location = #pto.address_space<mat>, auto = false, base = 0} -> i32 // reserved in the mat address space at base 0; size 16384 = 4 * the slot_size used below; the _aiv function imports it by this name
+  %qwen3_decode_layer_incore_10_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aiv} -> i32 // handle to the buffer that @qwen3_decode_layer_incore_10_aiv reserves under this name
+  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer : i32) // NOTE(review): dir_mask = 3 presumably enables both transfer directions - confirm against the pto dialect definition
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { // 8 iterations; %4 below is the column offset (%arg5 * 8 + %ob__ci_idx_v0) * 64 into the %arg3 view
+    %0 = arith.muli %arg5, %c8 : index
+    %1 = arith.addi %0, %ob__ci_idx_v0 : index
+    %2 = arith.muli %1, %c1 : index // multiply by 1 and (next line) add 0: identity arithmetic, so %3 == %1
+    %3 = arith.addi %c0, %2 : index
+    %4 = arith.muli %3, %c64 : index
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { // 40 steps of 128 rows each: 40 * 128 = 5120, the full row extent of the %arg3 view
+      %5 = arith.muli %kb__idx_v0, %c128 : index
+      %w_chunk__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16> // 128x64 window at row kb * 128, column %4
+      pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %a_chunk__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0> // 16x128 bf16 operand; presumably the tile the _aiv function sends with pto.tpush_to_aic - confirm
+      %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%a_chunk__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%a_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tfree_from_aiv {split = 0} // issued only after the tmov above has copied the popped tile into the left buffer; presumably releases the slot - confirm
+      %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // (16x128 bf16) x (128x64 bf16) -> 16x64 f32
+      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0} // each kb step pushes its own product; no accumulation over kb happens in this function
+    }
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_10_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // Vector half of the split kernel: converts 16x128 f32 tiles of %arg0 to bf16 and pushes them via pto.tpush_to_aic, sums the 16x64 f32 tiles popped back over kb, adds a 16x64 tile of %arg1 (converted to f32) and stores the f32 result to %arg2.
   %c16384 = arith.constant 16384 : i64
+  %c20480 = arith.constant 20480 : i64
+  %c28672 = arith.constant 28672 : i64
+  %c32768 = arith.constant 32768 : i64
+  %c40960 = arith.constant 40960 : i64
+  %c45056 = arith.constant 45056 : i64
+  %c36864 = arith.constant 36864 : i64
   %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
+  %c5120 = arith.constant 5120 : index
   %c1 = arith.constant 1 : index
-  %c524288 = arith.constant 524288 : index
-  %c128 = arith.constant 128 : index
   %c0 = arith.constant 0 : index
-  %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
-  pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
-  pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  pto.tmov ins(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%v_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-  pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-  pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>)
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // not referenced again in this function; the %arg3 tiles are loaded by the _aic function
+  %qwen3_decode_layer_incore_10_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aic} -> i32 // handle to the buffer that @qwen3_decode_layer_incore_10_aic reserves under this name
+  %qwen3_decode_layer_incore_10_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", size = 16384, location = #pto.address_space<vec>, auto = false, base = 0} -> i32 // reserved in the vec address space at base 0; size 16384 = 4 * the slot_size used below; every alloc_tile in this function uses addr >= 16384
+  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer_import : i32) // same dir_mask / slot_size attributes as the pto.aic_initialize_pipe in the _aic function
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { // 8 iterations; %8 below is the column offset (%arg5 * 8 + %ob__ci_idx_v0) * 64
+    %4 = arith.muli %arg5, %c8 : index
+    %5 = arith.addi %4, %ob__ci_idx_v0 : index
+    %6 = arith.muli %5, %c1 : index // multiply by 1 and (next line) add 0: identity arithmetic, so %7 == %5
+    %7 = arith.addi %c0, %6 : index
+    %8 = arith.muli %7, %c64 : index
+    %o_acc__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr and tile type as %o_acc__tile above
+    pto.tmuls ins(%o_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // multiplies by %cst = 0.0 to reset the accumulator. NOTE(review): nothing visible in this function writes addr 16384 before the first pass; if IEEE semantics apply, NaN/Inf already in that memory would not become 0 - confirm
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { // 40 steps of 128 columns each: 40 * 128 = 5120, the full column extent of the %arg0 view
+      %9 = arith.muli %kb__idx_v0, %c128 : index
+      %t__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %attn_out__rv_v2_pview = pto.partition_view %attn_out__rv_v2_view, offsets = [%arg4, %9], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32> // 16x128 window at row %arg4, column kb * 128
+      pto.tload ins(%attn_out__rv_v2_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %a_chunk__tile = pto.alloc_tile addr = %c28672 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%a_chunk__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 -> bf16
+      %a_chunk__tile_nz = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%a_chunk__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%a_chunk__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) // same shape and dtype; only blayout/slayout differ (row_major/none_box -> col_major/row_major)
+      pto.tpush_to_aic(%a_chunk__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
+      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // 16x64 f32 tile; presumably the matmul product the _aic function sends with pto.tpush_to_aiv - confirm
+      %1 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %1 = %0 + popped tile
+      pto.tfree_from_aic {split = 0} // issued only after the tadd above has read the popped tile
+      %o_acc__tile_mv = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr as %0
+      pto.tmov ins(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // copies the new sum back to addr 16384; %0 is never reassigned, so the running sum is carried to the next kb step only through this shared address
+    }
+    %2 = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %8], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16> // 16x64 window at row %arg4, column %8
+    pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%2 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %resid__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%2{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // bf16 -> f32
+    %3 = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr as %0, so the add below writes over its own first operand
+    pto.tadd ins(%0, %resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // %0 names addr 16384, which at this point holds whatever the last tmov in the kb loop wrote
+    %resid1_tile__co_l1_iter_v1_pview = pto.partition_view %resid1_tile__co_l0_iter_v1_view, offsets = [%c0, %8], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32> // row offset is the constant %c0 here, whereas both loads above use %arg4
+    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid1_tile__co_l1_iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
+  }
   return
   }
 }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
new file mode 100644
index 000000000..c286795b8
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py
@@ -0,0 +1,69 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.05) -> np.ndarray:  # 1-D float32 array of `count` samples from generator.uniform(-scale, scale)
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)  # cast to float32 after sampling
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:  # same sampling as make_fp32, then passed through float32_to_bf16
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))  # result dtype is whatever float32_to_bf16 returns (defined in validation_runtime, not shown here)
+
+
+def main() -> None:  # generate random inputs v1/v2/v4, compute the reference result for v3, and write both via write_buffers / write_golden
+    rows = 16  # row count of every tile read or written below
+    meta = load_case_meta()
+    generator = rng()
+    b0, ob = load_int32_assignments()[:2]  # first two int32 scalars: b0 is the row offset, ob the output-block index (presumably the kernel's %arg4 / %arg5 - confirm)
+
+    buffers = {
+        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.05),  # f32 operand, rounded to bf16 before the matmul below
+        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.05),  # bf16 tile added to the matmul result
+        "v3": np.zeros(meta.elem_counts["v3"], dtype=meta.np_types["v3"]),  # output buffer, zero-filled on input
+        "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.05),  # bf16 weight matrix, read with row stride 5120
+    }
+
+    output = np.zeros_like(buffers["v3"])
+
+    for ob_ci in range(8):  # 8 column blocks of width 64
+        o0 = (ob * 8 + ob_ci) * 64  # column offset of this block
+        acc = np.zeros((rows, 64), dtype=np.float32)
+        for kb in range(40):  # 40 chunks of 128 cover the 5120-wide reduction dimension
+            k0 = kb * 128
+            attn_chunk = load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=rows, cols=128, row_stride=5120)
+            attn_chunk = bf16_to_float32(float32_to_bf16(attn_chunk))  # round-trip f32 -> bf16 -> f32 before the matmul
+            w_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v4"], offset=k0 * 5120 + o0, rows=128, cols=64, row_stride=5120)
+            )
+            acc += attn_chunk @ w_chunk  # (rows x 128) @ (128 x 64), accumulated in float32
+        resid = bf16_to_float32(
+            load_strided_2d(buffers["v2"], offset=b0 * 5120 + o0, rows=rows, cols=64, row_stride=5120)
+        )
+        output = store_strided_2d(output, acc + resid, offset=o0, row_stride=5120)  # offset has no b0 term: written starting at row 0, unlike the loads above
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v3": output})  # only v3 is emitted as golden data
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
deleted file mode 100644
index 9a8a29a01..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
+++ /dev/null
@@ -1,118 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_11(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<f32>, %arg7: !pto.ptr<f32>, %arg8: !pto.ptr<f32>, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c32 = arith.constant 32 : i64
-  %c64 = arith.constant 64 : i64
-  %c96 = arith.constant 96 : i64
-  %c128 = arith.constant 128 : i64
-  %c4224 = arith.constant 4224 : i64
-  %c8320 = arith.constant 8320 : i64
-  %c12416 = arith.constant 12416 : i64
-  %c12448 = arith.constant 12448 : i64
-  %c12480 = arith.constant 12480 : i64
-  %c12512 = arith.constant 12512 : i64
-  %c12544 = arith.constant 12544 : i64
-  %c12576 = arith.constant 12576 : i64
-  %c12608 = arith.constant 12608 : i64
-  %c8 = arith.constant 8 : index
-  %c1 = arith.constant 1 : index
-  %7 = arith.constant 128 : index
-  %c16 = arith.constant 16 : index
-  %c0 = arith.constant 0 : index
-  %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %li__phi_v5 = pto.alloc_tile addr = %c12416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %mi__phi_v5 = pto.alloc_tile addr = %c12448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %oi__phi_v5 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %8 = arith.cmpi eq, %arg9, %c0 : index
-  scf.if %8 {
-    %oi__ssa_v3 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %li__ssa_v3 = pto.alloc_tile addr = %c12416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %mi__ssa_v3 = pto.alloc_tile addr = %c12448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%li__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  } else {
-    %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %mi_new__tile = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v5 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %alpha__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v10 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v10 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%beta__row_major_tmp_v12 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %beta__tile = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v15 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v15 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %t__row_major_tmp_v18 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v18 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %2 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %li__row_major_tmp_v21 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%li__row_major_tmp_v21 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %3 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%4, %5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %mi__ssa_v4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tmov ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  }
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tstore ins(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
deleted file mode 100644
index a9c4f9bee..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
+++ /dev/null
@@ -1,31 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_12(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c32 = arith.constant 32 : i64
-  %c4128 = arith.constant 4128 : i64
-  %c1 = arith.constant 1 : index
-  %c8192 = arith.constant 8192 : index
-  %c8 = arith.constant 8 : index
-  %c128 = arith.constant 128 : index
-  %c0 = arith.constant 0 : index
-  %c1024 = arith.constant 1024 : index
-  %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %ctx_flat_bf16__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcvt ins(%ctx_flat__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx_flat_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %0 = arith.muli %arg3, %c128 : index
-  %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x1024xbf16>
-  pto.tstore ins(%ctx_flat_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xbf16>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
index 8b38aaf7e..a93fb36df 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto
@@ -1,21 +1,116 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_13(%arg0: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @qwen3_decode_layer_incore_13_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
   %c0i = arith.constant 0 : i64
+  %c4096 = arith.constant 4096 : i64
+  %c20480 = arith.constant 20480 : i64
   %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
+  %c64 = arith.constant 64 : index
   %c1 = arith.constant 1 : index
+  %c5120 = arith.constant 5120 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
-  %c128 = arith.constant 128 : index
-    %c64 = arith.constant 64 : index
-    %cst = arith.constant 0.000000e+00 : f32
-  %resid1_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  scf.for %ob__idx_v0 = %c0 to %c128 step %c1 {
-    %0 = arith.muli %ob__idx_v0, %c64 : index
-    %zero_resid1__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.texpands ins(%cst : f32) outs(%zero_resid1__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %resid1_tile__iter_v1_pview = pto.partition_view %resid1_tile__ssa_v0_view, offsets = [%c0, %0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-    pto.tstore ins(%zero_resid1__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid1_tile__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)
+  %c40 = arith.constant 40 : index
+    %c128 = arith.constant 128 : index
+  %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_13_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_13_aiv} -> i32
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
+    %1 = arith.muli %kb__idx_v0, %c128 : index
+    %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%post_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wg__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wu__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
   }
   return
   }
+  func.func @qwen3_decode_layer_incore_13_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c32768 = arith.constant 32768 : i64
+  %c36864 = arith.constant 36864 : i64
+  %c45056 = arith.constant 45056 : i64
+  %c49152 = arith.constant 49152 : i64
+  %c40960 = arith.constant 40960 : i64
+  %c53248 = arith.constant 53248 : i64
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c5120 = arith.constant 5120 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c40 = arith.constant 40 : index
+  %cst_1 = arith.constant 1.000000e+00 : f32
+  %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_13_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", size = 32768, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
+  %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %gate_acc__ssa_v0_pview = pto.partition_view %gate_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tload ins(%gate_acc__ssa_v0_pview : !pto.partition_tensor_view<16x64xf32>) outs(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %up_acc__ssa_v0_pview = pto.partition_view %up_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tload ins(%up_acc__ssa_v0_pview : !pto.partition_tensor_view<16x64xf32>) outs(%up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %0 = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmuls ins(%gate_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %1 = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmuls ins(%up_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %2 = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
+    %3 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %4 = pto.alloc_tile addr = %c49152 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%1, %3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
+    %gate_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gate_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%up_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  }
+  %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tneg ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %5 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %6 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadds ins(%5, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.trecip ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %7 = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmul ins(%0, %sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmul ins(%7, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c53248 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcvt ins(%mlp_chunk__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+  pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>)
+  return
+  }
 }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
new file mode 100644
index 000000000..61c671b01
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray:  # returns `count` values from generator.uniform(-scale, scale), cast to float32
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray:  # same draw as make_fp32, then passed through float32_to_bf16
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))
+
+
+def main():  # builds the random input buffers and the expected bf16 contents of "v6" for this case
+    rows = 16
+    meta = load_case_meta()
+    generator = rng()
+    o0 = load_int32_assignments()[0]  # first int32 assignment; added to the v4/v5 load offsets below
+
+    buffers = {
+        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01),  # v1 and v3 are generated and written out, but not read by the reference math below
+        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01),
+        "v3": make_fp32(generator, meta.elem_counts["v3"], scale=0.01),
+        "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.01),
+        "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.01),
+        "v6": np.zeros(meta.elem_counts["v6"], dtype=meta.np_types["v6"]),  # zero-filled here; its expected contents go to write_golden at the end
+    }
+
+    gate_acc = np.zeros((rows, 64), dtype=np.float32)  # float32 accumulators for the two (rows x 64) products
+    up_acc = np.zeros((rows, 64), dtype=np.float32)
+
+    for kb in range(40):  # 40 chunks of 128: 40 * 128 = 5120, the row_stride used for v2
+        k0 = kb * 128
+        post_chunk = bf16_to_float32(
+            load_strided_2d(buffers["v2"], offset=k0, rows=rows, cols=128, row_stride=5120)
+        )
+        w_gate = bf16_to_float32(
+            load_strided_2d(buffers["v4"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600)
+        )
+        w_up = bf16_to_float32(
+            load_strided_2d(buffers["v5"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600)
+        )
+        gate_acc += post_chunk @ w_gate  # (rows x 128) @ (128 x 64), accumulated over all chunks
+        up_acc += post_chunk @ w_up
+
+    sigmoid = np.reciprocal(1.0 + np.exp(-gate_acc))  # 1 / (1 + exp(-gate_acc))
+    mlp_chunk = gate_acc * sigmoid * up_acc  # gate * sigmoid(gate) * up, elementwise
+    output = float32_to_bf16(mlp_chunk)
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v6": output})  # only "v6" is given an expected value
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
index 0a0172824..5c06dda13 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto
@@ -1,90 +1,74 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // cube kernel: per loop step, one (16x64) x (64x128) bf16 matmul whose f32 result is pushed to the vector kernel
   %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
+  %c16384 = arith.constant 16384 : i64
   %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
+  %c5120 = arith.constant 5120 : index
   %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
-  %c8 = arith.constant 8 : index
-    %c64 = arith.constant 64 : index
-      %c128 = arith.constant 128 : index
-  %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
   %qwen3_decode_layer_incore_14_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_14_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
-    %0 = arith.muli %arg5, %c8 : index
-    %1 = arith.addi %0, %ob__ci_idx_v0 : index
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) // NOTE(review): 8192 = 16 * 128 * 4 bytes, i.e. the size of one 16x128 f32 tile; presumably one pushed tile per slot - confirm
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { // 4 iterations; each derives column offset %4 = (%arg3 * 4 + iv) * 128
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index
     %2 = arith.muli %1, %c1 : index
     %3 = arith.addi %c0, %2 : index
-    %4 = arith.muli %3, %c64 : index
-    scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
-      %5 = arith.muli %kb__idx_v0, %c128 : index
-      %a_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %attn_out__rv_v5_pview = pto.partition_view %attn_out__rv_v5_view, offsets = [%arg4, %5], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-      pto.tload ins(%attn_out__rv_v5_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%a_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-      pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-      pto.tmov ins(%a_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%a_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-      %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-      pto.tmov ins(%w_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-      pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-    }
+    %4 = arith.muli %3, %c128 : index
+    %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16> // 64x128 window of w_down at row %arg4, column %4
+    pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16> // offsets are constant, so the same 16x64 window is loaded on every iteration
+    pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_down_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0} // one 16x128 f32 tile pushed per iteration; the _aiv kernel pops once per iteration of its matching loop
   }
   return
   }
-  func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c32768 = arith.constant 32768 : i64
-  %c40960 = arith.constant 40960 : i64
-  %c45056 = arith.constant 45056 : i64
-  %c36864 = arith.constant 36864 : i64
+  func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // vector kernel: adds each tile popped from the cube kernel into the matching 16x128 window of %arg0
+  %c65536 = arith.constant 65536 : i64
   %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
+  %c5120 = arith.constant 5120 : index
   %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
-  %c8 = arith.constant 8 : index
-    %c64 = arith.constant 64 : index
-    %cst = arith.constant 0.000000e+00 : f32
-  %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 32768, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
-    %2 = arith.muli %arg5, %c8 : index
-    %3 = arith.addi %2, %ob__ci_idx_v0 : index
-    %4 = arith.muli %3, %c1 : index
-    %5 = arith.addi %c0, %4 : index
-    %6 = arith.muli %5, %c64 : index
-    %o_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.texpands ins(%cst : f32) outs(%o_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
-      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      %0 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tadd ins(%o_acc__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-      pto.tfree_from_aic {split = 0}
-      %o_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tmov ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    }
-    %t__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %6], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
-    pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %resid__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%o_acc__tile, %resid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %resid1_tile__co_l1_iter_v4_pview = pto.partition_view %resid1_tile__co_l0_iter_v4_view, offsets = [%c0, %6], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%resid1_tile__co_l1_iter_v4_pview : !pto.partition_tensor_view<16x64xf32>)
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 65536, location = #pto.address_space<vec>, auto = false, base = 0} -> i32 // size 65536 = 8 * 8192 (the slot_size below); tiles in the loop are allocated at %c65536, the end of this base-0 reservation
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) // same dir_mask and slot_size as the aic_initialize_pipe call in the _aic kernel
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { // same bounds and column-offset math as the loop in the _aic kernel
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index
+    %2 = arith.muli %1, %c1 : index
+    %3 = arith.addi %c0, %2 : index
+    %4 = arith.muli %3, %c128 : index
+    %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // read the current 16x128 window of %arg0 at column %4
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // allocated at the same address (%c65536) as %down_prev__tile
+    pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
+    %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32> // same offsets and sizes as the view loaded at the top of the iteration
+    pto.tstore ins(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.partition_tensor_view<16x128xf32>)
   }
   return
   }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
new file mode 100644
index 000000000..9994d6990
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray:  # returns `count` values from generator.uniform(-scale, scale), cast to float32
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray:  # same draw as make_fp32, then passed through float32_to_bf16
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))
+
+
+def main():  # builds the random input buffers and the expected post-update contents of "v1" for this case
+    rows = 16
+    meta = load_case_meta()
+    generator = rng()
+    dob, o0 = load_int32_assignments()[:2]  # first two int32 assignments: dob feeds the column offset d0, o0 is scaled by the 5120 row stride in the v3 offset
+
+    buffers = {
+        "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01),
+        "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01),
+        "v3": make_bf16(generator, meta.elem_counts["v3"], scale=0.01),
+    }
+
+    output = np.array(buffers["v1"], copy=True)  # golden starts as a copy of v1, so buffers["v1"] itself is written out unmodified
+    mlp_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=0, rows=rows, cols=64, row_stride=64))
+
+    for dob_ci in range(4):  # 4 column blocks of width 128 per invocation
+        d0 = (dob * 4 + dob_ci) * 128
+        down_prev = load_strided_2d(output, offset=d0, rows=rows, cols=128, row_stride=5120).astype(np.float32)
+        w_down = bf16_to_float32(
+            load_strided_2d(buffers["v3"], offset=o0 * 5120 + d0, rows=64, cols=128, row_stride=5120)
+        )
+        output = store_strided_2d(output, down_prev + mlp_chunk @ w_down, offset=d0, row_stride=5120)  # accumulate (rows x 64) @ (64 x 128) into the window that was just read
+
+    write_buffers(meta, buffers)
+    write_golden(meta, {"v1": output})  # "v1" is both an input buffer and the golden output
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
deleted file mode 100644
index a45c9a509..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
+++ /dev/null
@@ -1,47 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_15(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c64 = arith.constant 64 : i64
-  %c8256 = arith.constant 8256 : i64
-  %c16448 = arith.constant 16448 : i64
-  %c16512 = arith.constant 16512 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %5 = arith.constant 64 : index
-    %c128 = arith.constant 128 : index
-  %cst_1 = arith.constant 1.220703e-04 : f32
-  %cst_2 = arith.constant 1.000000e-06 : f32
-  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sq_sum__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%sq_sum__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %kb__idx_v0 = %c0 to %5 step %c1 {
-    %6 = arith.muli %kb__idx_v0, %c128 : index
-    %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %6], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %tmp_tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowsum ins(%t__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %2 = pto.alloc_tile addr = %c16512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%sq_sum__tile, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %sq_sum__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sq_sum__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  }
-  %3 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmuls ins(%sq_sum__tile, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tadds ins(%3, %cst_2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.trsqrt ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x16xf32>
-  pto.tstore ins(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<1x16xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
deleted file mode 100644
index f9fa660d1..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
+++ /dev/null
@@ -1,49 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_16(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c64 = arith.constant 64 : i64
-  %c8256 = arith.constant 8256 : i64
-  %c8768 = arith.constant 8768 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %2 = arith.constant 64 : index
-    %c128 = arith.constant 128 : index
-    %cst = arith.constant 0.000000e+00 : f32
-  %down_proj_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %inv_rms__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %post_norm_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %post_rms_weight__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg4, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %inv_rms__ssa_v0_pview = pto.partition_view %inv_rms__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x16xf32>
-  pto.tload ins(%inv_rms__ssa_v0_pview : !pto.partition_tensor_view<1x16xf32>) outs(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %zi__idx_v0 = %c0 to %2 step %c1 {
-    %3 = arith.muli %zi__idx_v0, %c128 : index
-    %down_zero_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.texpands ins(%cst : f32) outs(%down_zero_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %down_proj_tile__iter_v1_pview = pto.partition_view %down_proj_tile__ssa_v0_view, offsets = [%c0, %3], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-    pto.tstore ins(%down_zero_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_proj_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>)
-  }
-  scf.for %kb__idx_v0 = %c0 to %2 step %c1 {
-    %4 = arith.muli %kb__idx_v0, %c128 : index
-    %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %gamma__tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %post_rms_weight__ssa_v0_pview = pto.partition_view %post_rms_weight__ssa_v0_view, offsets = [%c0, %4], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-    pto.tload ins(%post_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    %0 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowexpandmul ins(%x_chunk__tile, %t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %normed__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%0, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c8768 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %post_norm_tile__iter_v1_pview = pto.partition_view %post_norm_tile__ssa_v0_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%post_norm_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>)
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
deleted file mode 100644
index ae6570c56..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
+++ /dev/null
@@ -1,104 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_17_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c20480 = arith.constant 20480 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c25600 = arith.constant 25600 : index
-  %c64 = arith.constant 64 : index
-  %c0_i32 = arith.constant 0 : i32
-  %c0 = arith.constant 0 : index
-    %c128 = arith.constant 128 : index
-  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_17_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_17_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
-    %1 = arith.muli %kb__idx_v0, %c128 : index
-    %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-    pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-    pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%post_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wg__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-    %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wu__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-    pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-  }
-  return
-  }
-  func.func @qwen3_decode_layer_incore_17_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c32768 = arith.constant 32768 : i64
-  %c36864 = arith.constant 36864 : i64
-  %c45056 = arith.constant 45056 : i64
-  %c49152 = arith.constant 49152 : i64
-  %c40960 = arith.constant 40960 : i64
-  %c53248 = arith.constant 53248 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c25600 = arith.constant 25600 : index
-  %c64 = arith.constant 64 : index
-  %c0_i32 = arith.constant 0 : i32
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %cst_1 = arith.constant 1.000000e+00 : f32
-  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_17_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", size = 32768, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
-  %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %kb__idx_v0 = %c0 to %c64 step %c1 {
-    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %0 = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%gate_acc__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tfree_from_aic {split = 0}
-    %1 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %2 = pto.alloc_tile addr = %c49152 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%up_acc__tile, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tfree_from_aic {split = 0}
-    %gate_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gate_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%up_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  }
-  %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tneg ins(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %3 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %4 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tadds ins(%3, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.trecip ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %5 = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmul ins(%gate_acc__tile, %sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmul ins(%5, %up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c53248 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tcvt ins(%mlp_chunk__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
-  pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
deleted file mode 100644
index 3228a9f80..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
+++ /dev/null
@@ -1,75 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_18_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c16384 = arith.constant 16384 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c25600 = arith.constant 25600 : index
-  %c0_i32 = arith.constant 0 : i32
-  %c0 = arith.constant 0 : index
-  %c4 = arith.constant 4 : index
-    %c128 = arith.constant 128 : index
-  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_18_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_18_aiv} -> i32
-  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {
-    %0 = arith.muli %arg3, %c4 : index
-    %1 = arith.addi %0, %dob__ci_idx_v0 : index
-    %2 = arith.muli %1, %c1 : index
-    %3 = arith.addi %c0, %2 : index
-    %4 = arith.muli %3, %c128 : index
-    %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
-    pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
-    pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_down_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-  }
-  return
-  }
-  func.func @qwen3_decode_layer_incore_18_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c65536 = arith.constant 65536 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c25600 = arith.constant 25600 : index
-  %c0_i32 = arith.constant 0 : i32
-  %c0 = arith.constant 0 : index
-  %c4 = arith.constant 4 : index
-    %c128 = arith.constant 128 : index
-  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %qwen3_decode_layer_incore_18_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", size = 65536, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
-  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
-  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 {
-    %0 = arith.muli %arg3, %c4 : index
-    %1 = arith.addi %0, %dob__ci_idx_v0 : index
-    %2 = arith.muli %1, %c1 : index
-    %3 = arith.addi %c0, %2 : index
-    %4 = arith.muli %3, %c128 : index
-    %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-    pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    pto.tfree_from_aic {split = 0}
-    %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-    pto.tstore ins(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.partition_tensor_view<16x128xf32>)
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
deleted file mode 100644
index 776c7aed2..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
+++ /dev/null
@@ -1,36 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_19(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c8192 = arith.constant 8192 : i64
-  %c16384 = arith.constant 16384 : i64
-  %c16 = arith.constant 16 : index
-  %2 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %c4 = arith.constant 4 : index
-    %c128 = arith.constant 128 : index
-  %down_proj_tile__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %out__co_l0_iter_v3_view = pto.make_tensor_view %arg1, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 {
-    %3 = arith.muli %arg4, %c4 : index
-    %4 = arith.addi %3, %ob__ci_idx_v0 : index
-    %5 = arith.muli %4, %c1 : index
-    %6 = arith.addi %c0, %5 : index
-    %7 = arith.muli %6, %c128 : index
-    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %down_proj_tile__rv_v5_pview = pto.partition_view %down_proj_tile__rv_v5_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-    pto.tload ins(%down_proj_tile__rv_v5_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
-    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %down_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%down_acc__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %out__co_l1_iter_v3_pview = pto.partition_view %out__co_l0_iter_v3_view, offsets = [%arg3, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%out__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x128xbf16>)
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
new file mode 100644
index 000000000..0952f032b
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray:
+    # positive=True draws between 0.5 and 1.5 (scale is unused); otherwise between -scale and scale.
+    low, high = (0.5, 1.5) if positive else (-scale, scale)
+    return generator.uniform(low, high, size=count).astype(np.float32)
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))  # fp32 draw within +/-scale, then float32_to_bf16
+
+
+def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray:
+    return bf16_to_float32(float32_to_bf16(values))  # round-trip through bf16; presumably leaves fp32 values at bf16 precision
+
+
+def main():
+    """Build the input buffers and the expected "v4" output, then pass both to the writers."""
+    case = load_case_meta()
+    gen = rng()
+    b0, ob = load_int32_assignments()[:2]
+    n_rows, hidden, k_tile, q_tile = 16, 5120, 128, 64
+
+    # Filled in this order so the draws from the shared generator keep their sequence.
+    buffers = {}
+    buffers["v1"] = make_bf16(gen, case.elem_counts["v1"], scale=0.05)
+    buffers["v2"] = make_fp32(gen, case.elem_counts["v2"], positive=True)
+    buffers["v3"] = make_fp32(gen, case.elem_counts["v3"], positive=True)
+    buffers["v4"] = np.zeros(case.elem_counts["v4"], dtype=case.np_types["v4"])
+    buffers["v5"] = make_bf16(gen, case.elem_counts["v5"], scale=0.05)
+
+    inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(n_rows, 1)
+    golden = np.zeros_like(buffers["v4"])
+
+    # Four q_tile-wide column tiles, each accumulated in fp32 over forty k_tile-deep chunks.
+    for q0 in ((ob * 4 + tile) * q_tile for tile in range(4)):
+        acc = np.zeros((n_rows, q_tile), dtype=np.float32)
+        for k0 in range(0, 40 * k_tile, k_tile):
+            x_bits = load_strided_2d(
+                buffers["v1"], offset=b0 * hidden + k0, rows=n_rows, cols=k_tile, row_stride=hidden
+            )
+            gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=k_tile, row_stride=hidden)
+            # Scale by inv_rms and gamma, then round-trip the product through bf16.
+            normed = round_fp32_to_bf16_fp32(bf16_to_float32(x_bits) * inv_rms * gamma.astype(np.float32))
+            w_bits = load_strided_2d(buffers["v5"], offset=k0 * hidden + q0, rows=k_tile, cols=q_tile, row_stride=hidden)
+            acc += normed @ bf16_to_float32(w_bits)
+        # Each finished tile is converted with float32_to_bf16 and stored at its column offset.
+        golden = store_strided_2d(golden, float32_to_bf16(acc), offset=b0 * hidden + q0, row_stride=hidden)
+
+    write_buffers(case, buffers)
+    write_golden(case, {"v4": golden})
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
index 9fbf4425d..795b5dee7 100644
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto
@@ -1,67 +1,145 @@
 module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_2(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @qwen3_decode_layer_incore_2_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: !pto.ptr<bf16>, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  %c16384 = arith.constant 16384 : i64
+  %c32768 = arith.constant 32768 : i64
   %c0i = arith.constant 0 : i64
-  %c64 = arith.constant 64 : i64
-  %c4160 = arith.constant 4160 : i64
-  %c12352 = arith.constant 12352 : i64
-  %c20544 = arith.constant 20544 : i64
-  %c20608 = arith.constant 20608 : i64
-  %c20672 = arith.constant 20672 : i64
   %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
+  %c5120 = arith.constant 5120 : index
   %c1 = arith.constant 1 : index
-  %cst = arith.constant 0.000000e+00 : f32
+  %c1024 = arith.constant 1024 : index
   %c0 = arith.constant 0 : index
-  %10 = arith.constant 64 : index
-    %c128 = arith.constant 128 : index
-  %cst_1 = arith.constant 1.220703e-04 : f32
-  %cst_2 = arith.constant 1.000000e-06 : f32
-  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %normed_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %partial_sq__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%partial_sq__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %kb__idx_v0 = %c0 to %10 step %c1 {
-    %11 = arith.muli %kb__idx_v0, %c128 : index
-    %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %11], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %x_chunk__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %tmp_tile = pto.alloc_tile addr = %c12352 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %1 = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowsum ins(%0, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-    %2 = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %3 = pto.alloc_tile addr = %c20608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%partial_sq__tile, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %partial_sq__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%partial_sq__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_2_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", size = 16384, location = #pto.address_space<mat>, auto = false, base = 0} -> i32
+  %qwen3_decode_layer_incore_2_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aiv} -> i32
+  pto.aic_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer : i32)
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
+    %1 = arith.muli %arg8, %c8 : index
+    %2 = arith.addi %1, %ob__ci_idx_v0 : index
+    %3 = arith.muli %2, %c1 : index
+    %4 = arith.addi %c0, %3 : index
+    %5 = arith.muli %4, %c64 : index
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
+      %6 = arith.muli %kb__idx_v0, %c128 : index
+      %wk_chunk__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+      pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wk_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %wv_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+      pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wv_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      %normed_bf16__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      %normed_bf16__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%normed_bf16__tile_Left_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%normed_bf16__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tfree_from_aiv {split = 0}
+      %wk_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%wk_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wk_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%normed_bf16__tile_Left, %wk_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+      %wv_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+      pto.tmov ins(%wv_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wv_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+      %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tmatmul ins(%normed_bf16__tile_Left, %wv_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    }
+  }
+  return
   }
-  %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tmuls ins(%partial_sq__tile, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %5 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.tadds ins(%4, %cst_2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %variance__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  scf.for %12 = %c0 to %10 step %c1 {
-    %13 = arith.muli %12, %c128 : index
-    %6 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %14 = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %13], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tload ins(%14 : !pto.partition_tensor_view<16x128xbf16>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %7 = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%6{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %gamma__tile = pto.alloc_tile addr = %c20672 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-    pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %8 = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.trowexpandmul ins(%7, %variance__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%8 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %normed__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%8, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %9 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%9 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %normed_tile__iter_v1_pview = pto.partition_view %normed_tile__ssa_v0_view, offsets = [%c0, %13], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tstore ins(%9 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>)
+  func.func @qwen3_decode_layer_incore_2_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<bf16>, %arg5: !pto.ptr<bf16>, %arg6: !pto.ptr<bf16>, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c16384 = arith.constant 16384 : i64
+  %c16448 = arith.constant 16448 : i64
+  %c20544 = arith.constant 20544 : i64
+  %c24640 = arith.constant 24640 : i64
+  %c28736 = arith.constant 28736 : i64
+  %c36928 = arith.constant 36928 : i64
+  %c37440 = arith.constant 37440 : i64
+  %c45632 = arith.constant 45632 : i64
+  %c49728 = arith.constant 49728 : i64
+  %c53824 = arith.constant 53824 : i64
+  %c16 = arith.constant 16 : index
+  %c5120 = arith.constant 5120 : index
+  %c1 = arith.constant 1 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %c40 = arith.constant 40 : index
+      %c128 = arith.constant 128 : index
+  %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_2_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aic} -> i32
+  %qwen3_decode_layer_incore_2_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", size = 16384, location = #pto.address_space<vec>, auto = false, base = 0} -> i32
+  pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer_import : i32)
+  %inv_rms_tile__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x1xf32>
+  pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 {
+    %8 = arith.muli %arg8, %c8 : index
+    %9 = arith.addi %8, %ob__ci_idx_v0 : index
+    %10 = arith.muli %9, %c1 : index
+    %11 = arith.addi %c0, %10 : index
+    %12 = arith.muli %11, %c64 : index
+    %k_acc__tile = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %v_acc__tile = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmuls ins(%k_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmuls ins(%v_acc__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    scf.for %kb__idx_v0 = %c0 to %c40 step %c1 {
+      %13 = arith.muli %kb__idx_v0, %c128 : index
+      %t__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg7, %13], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+      pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %x_chunk__tile = pto.alloc_tile addr = %c28736 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %gamma__tile = pto.alloc_tile addr = %c36928 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+      pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %2 = pto.alloc_tile addr = %c28736 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %normed__tile = pto.alloc_tile addr = %c28736 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcolexpandmul ins(%2, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %normed_bf16__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %normed_bf16__tile_nz = pto.alloc_tile addr = %c37440 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+      pto.tmov ins(%normed_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed_bf16__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      pto.tpush_to_aic(%normed_bf16__tile_nz : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) {split = 0}
+      %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %3 = pto.alloc_tile addr = %c45632 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tfree_from_aic {split = 0}
+      %4 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %5 = pto.alloc_tile addr = %c49728 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tadd ins(%1, %4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tfree_from_aic {split = 0}
+      %k_acc__tile_mv = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      %v_acc__tile_mv = pto.alloc_tile addr = %c20544 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tmov ins(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    }
+    %6 = pto.alloc_tile addr = %c53824 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%0{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %k_proj__co_l1_iter_v3_pview = pto.partition_view %k_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x64xbf16>)
+    %7 = pto.alloc_tile addr = %c53824 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%1{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%7 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %v_proj__co_l1_iter_v3_pview = pto.partition_view %v_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
+    pto.tstore ins(%7 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x64xbf16>)
   }
   return
   }
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
new file mode 100644
index 000000000..a7ffaa1e0
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    bf16_to_float32,
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    load_strided_2d,
+    rng,
+    store_strided_2d,
+    write_buffers,
+    write_golden,
+)
+
+
+def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray:  # Draw `count` samples with generator.uniform and cast them to float32.
+    if positive:  # positive=True ignores `scale` and uses the fixed bounds 0.5 and 1.5
+        return generator.uniform(0.5, 1.5, size=count).astype(np.float32)
+    return generator.uniform(-scale, scale, size=count).astype(np.float32)  # default: bounds symmetric around zero, -scale and +scale
+
+
+def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray:  # make_fp32 samples (bounds -scale/+scale) passed through float32_to_bf16.
+    return float32_to_bf16(make_fp32(generator, count, scale=scale))  # NOTE(review): the returned dtype is whatever validation_runtime.float32_to_bf16 produces - confirm there
+
+
+def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray:  # Convert with float32_to_bf16 and immediately back with bf16_to_float32.
+    return bf16_to_float32(float32_to_bf16(values))  # presumably emulates bf16 rounding while keeping a float32 array - helpers live in validation_runtime, confirm
+
+
+def main():  # Build random inputs v1-v3, v6, v7, compute the expected v4/v5 contents with NumPy, and write both out.
+    rows = 16  # row count of every activation / accumulator tile below
+    meta = load_case_meta()
+    generator = rng()
+    b0, ob = load_int32_assignments()[:2]  # b0 is multiplied by the row stride (acts as a starting row); ob picks a group of 8 column blocks
+
+    buffers = {
+        "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05),  # activations, read below as rows x 128 chunks with row stride 5120
+        "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True),  # gamma, read below as 1 x 128 slices
+        "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True),  # inv_rms, reshaped to (rows, 1) below, so it must hold exactly `rows` values
+        "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]),  # zeroed buffer that receives k_proj
+        "v5": np.zeros(meta.elem_counts["v5"], dtype=meta.np_types["v5"]),  # zeroed buffer that receives v_proj
+        "v6": make_bf16(generator, meta.elem_counts["v6"], scale=0.05),  # wk weights, read below as 128 x 64 chunks with row stride 1024
+        "v7": make_bf16(generator, meta.elem_counts["v7"], scale=0.05),  # wv weights, same access pattern as v6
+    }
+
+    inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(rows, 1)  # column vector so it broadcasts along the 128 columns of x_chunk
+    k_proj = np.zeros_like(buffers["v4"])
+    v_proj = np.zeros_like(buffers["v5"])
+
+    for ob_ci in range(8):  # 8 output column blocks, each 64 wide, per value of `ob`
+        kv0 = (ob * 8 + ob_ci) * 64  # first output column of this block
+        k_acc = np.zeros((rows, 64), dtype=np.float32)  # float32 accumulators, reset for every column block
+        v_acc = np.zeros((rows, 64), dtype=np.float32)
+        for kb in range(40):  # 40 chunks of 128 columns = 5120, the row stride used for v1
+            k0 = kb * 128  # first reduction index of this chunk
+            x_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=rows, cols=128, row_stride=5120)
+            )
+            gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32)
+            normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma)  # scale rows by inv_rms and columns by gamma, then bf16 round-trip before the matmul
+            wk_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024)
+            )
+            wv_chunk = bf16_to_float32(
+                load_strided_2d(buffers["v7"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024)
+            )
+            k_acc += normed @ wk_chunk  # in-place add keeps the running sum in k_acc's float32 dtype
+            v_acc += normed @ wv_chunk
+        k_proj = store_strided_2d(k_proj, float32_to_bf16(k_acc), offset=b0 * 1024 + kv0, row_stride=1024)  # converted to bf16 once, after all 40 chunks are summed
+        v_proj = store_strided_2d(v_proj, float32_to_bf16(v_acc), offset=b0 * 1024 + kv0, row_stride=1024)
+
+    write_buffers(meta, buffers)  # inputs are written with v4/v5 still all-zero
+    write_golden(meta, {"v4": k_proj, "v5": v_proj})  # expected results keyed by the two output buffer names
+
+
+if __name__ == "__main__":  # run the generator only when executed as a script, not on import
+    main()
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
deleted file mode 100644
index f8bccc6ac..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
+++ /dev/null
@@ -1,45 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_3(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %c128 = arith.constant 128 : index
-  %c64 = arith.constant 64 : index
-  %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %q_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %wq__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_b__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-  pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_b__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  pto.tmov ins(%tile_b__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_b__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %q_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-  pto.tmatmul ins(%tile_a__tile_Left, %tile_b__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%q_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 {
-    %1 = arith.muli %kb__idx_v0, %c128 : index
-    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_b_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %3 = pto.partition_view %wq__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_b_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%tile_b_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_b_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_b_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  }
-  %q_proj__iter_v6_pview = pto.partition_view %q_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%q_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%q_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
deleted file mode 100644
index 9a2756f1c..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
+++ /dev/null
@@ -1,46 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_4(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c16 = arith.constant 16 : index
-  %c1024 = arith.constant 1024 : index
-  %c1 = arith.constant 1 : index
-  %c8192 = arith.constant 8192 : index
-  %c0 = arith.constant 0 : index
-  %c128 = arith.constant 128 : index
-  %c64 = arith.constant 64 : index
-  %k_proj__iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %normed_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %wk__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_wk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-  pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_wk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  pto.tmov ins(%tile_wk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %k_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-  pto.tmatmul ins(%tile_a__tile_Left, %tile_wk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 {
-    %1 = arith.muli %kb__idx_v0, %c128 : index
-    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_wk_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %3 = pto.partition_view %wk__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_wk_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%tile_wk_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wk_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wk_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  }
-  %k_proj__iter_v6_pview = pto.partition_view %k_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%k_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%k_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
deleted file mode 100644
index db88c9a68..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
+++ /dev/null
@@ -1,46 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_5(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c16 = arith.constant 16 : index
-  %c8192 = arith.constant 8192 : index
-  %c1 = arith.constant 1 : index
-  %c1024 = arith.constant 1024 : index
-  %c0 = arith.constant 0 : index
-  %c128 = arith.constant 128 : index
-  %c64 = arith.constant 64 : index
-  %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %v_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %wv__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_wv__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-  pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %tile_wv__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  pto.tmov ins(%tile_wv__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wv__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %v_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-  pto.tmatmul ins(%tile_a__tile_Left, %tile_wv__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%v_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 {
-    %1 = arith.muli %kb__idx_v0, %c128 : index
-    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_wv_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %3 = pto.partition_view %wv__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-    %tile_wv_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-    pto.tmov ins(%tile_wv_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wv_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wv_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  }
-  %v_proj__iter_v6_pview = pto.partition_view %v_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%v_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%v_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
deleted file mode 100644
index 4443956bc..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
+++ /dev/null
@@ -1,88 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_6(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<bf16>, %arg7: !pto.ptr<f32>, %arg8: index, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c256 = arith.constant 256 : i64
-  %c512 = arith.constant 512 : i64
-  %c768 = arith.constant 768 : i64
-  %c1024 = arith.constant 1024 : i64
-  %c1280 = arith.constant 1280 : i64
-  %c1536 = arith.constant 1536 : i64
-  %c1792 = arith.constant 1792 : i64
-  %c2048 = arith.constant 2048 : i64
-  %c2176 = arith.constant 2176 : i64
-  %c2688 = arith.constant 2688 : i64
-  %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c524288 = arith.constant 524288 : index
-  %c128 = arith.constant 128 : index
-  %c16 = arith.constant 16 : index
-  %7 = arith.constant 1024 : index
-  %c0 = arith.constant 0 : index
-  %c8 = arith.constant 8 : index
-    %c4096 = arith.constant 4096 : index
-  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %k_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %k_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %v_cache__iter_v1_view = pto.make_tensor_view %arg6, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %v_proj__rv_v5_view = pto.make_tensor_view %arg7, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {
-    %8 = arith.muli %ki__idx_v0, %c128 : index
-    %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %k_proj__rv_v5_pview = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %8], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-    pto.tload ins(%k_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %k_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %10 = arith.addi %8, %c64 : index
-    %9 = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %10], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-    pto.tload ins(%9 : !pto.partition_tensor_view<1x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %rot_lo__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %11 = arith.muli %arg8, %c8 : index
-    %12 = arith.muli %11, %c4096 : index
-    %13 = arith.muli %ki__idx_v0, %c4096 : index
-    %14 = arith.addi %12, %13 : index
-    %15 = arith.addi %14, %arg9 : index
-    %3 = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%rot_lo__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
-    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x64xbf16>)
-    %4 = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%rot_hi__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %k_cache__tile_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c64], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
-    pto.tstore ins(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__tile_pview : !pto.partition_tensor_view<1x64xbf16>)
-    %5 = pto.alloc_tile addr = %c2176 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %17 = arith.muli %ki__idx_v0, %c128 : index
-    %v_proj__rv_v5_pview = pto.partition_view %v_proj__rv_v5_view, offsets = [%arg8, %17], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
-    pto.tload ins(%v_proj__rv_v5_pview : !pto.partition_tensor_view<1x128xf32>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %6 = pto.alloc_tile addr = %c2688 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%5{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
-    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)
-  }
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
deleted file mode 100644
index 2f80eb162..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
+++ /dev/null
@@ -1,92 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_7(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<f32>, %arg7: !pto.ptr<f32>, %arg8: !pto.ptr<f32>, %arg9: index, %arg10: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c256 = arith.constant 256 : i64
-  %c512 = arith.constant 512 : i64
-  %c768 = arith.constant 768 : i64
-  %c1024 = arith.constant 1024 : i64
-  %c1280 = arith.constant 1280 : i64
-  %c1536 = arith.constant 1536 : i64
-  %c1792 = arith.constant 1792 : i64
-  %c2048 = arith.constant 2048 : i64
-  %c2176 = arith.constant 2176 : i64
-  %c2304 = arith.constant 2304 : i64
-  %c6400 = arith.constant 6400 : i64
-  %c6432 = arith.constant 6432 : i64
-  %c1 = arith.constant 1 : index
-  %c64 = arith.constant 64 : index
-  %c16 = arith.constant 16 : index
-  %c128 = arith.constant 128 : index
-  %c8192 = arith.constant 8192 : index
-  %c8 = arith.constant 8 : index
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %q_padded__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %q_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  scf.for %qi__idx_v0 = %c0 to %c8 step %c1 {
-    %5 = arith.addi %arg10, %qi__idx_v0 : index
-    %6 = arith.muli %5, %c128 : index
-    %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %q_proj__rv_v5_pview = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %6], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-    pto.tload ins(%q_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%q_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %q_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %8 = arith.addi %6, %c64 : index
-    %7 = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %8], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
-    pto.tload ins(%7 : !pto.partition_tensor_view<1x64xf32>) outs(%q_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %1 = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %rot_lo_bf16__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%1{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %2 = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %3 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %4 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%2, %3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %rot_hi_bf16__tile = pto.alloc_tile addr = %c2176 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tcvt ins(%4{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-    %q_padded__iter_v1_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
-    pto.tstore ins(%rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__iter_v1_pview : !pto.partition_tensor_view<1x64xbf16>)
-    %q_padded__tile_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c64], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
-    pto.tstore ins(%rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__tile_pview : !pto.partition_tensor_view<1x64xbf16>)
-  }
-  %oi__tile = pto.alloc_tile addr = %c2304 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %li_flat__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %li__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %mi_flat__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  %mi__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
-  pto.tstore ins(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
deleted file mode 100644
index 53988ea99..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
+++ /dev/null
@@ -1,30 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_8(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-  %c0i = arith.constant 0 : i64
-  %c16384 = arith.constant 16384 : i64
-  %c524288 = arith.constant 524288 : index
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
-  %c0 = arith.constant 0 : index
-  %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xbf16>
-  %q_padded__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
-  pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  %q_padded__rv_v2_pview = pto.partition_view %q_padded__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
-  pto.tload ins(%q_padded__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
-  %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
-  pto.tmov ins(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
-  %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-  pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
-  pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>)
-  return
-  }
-}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto
deleted file mode 100644
index eb677daf6..000000000
--- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto
+++ /dev/null
@@ -1,49 +0,0 @@
-module attributes {pto.target_arch = "a5"} {
-  func.func @qwen3_decode_layer_incore_9(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-  %c0i = arith.constant 0 : i64
-  %c2048 = arith.constant 2048 : i64
-  %c4096 = arith.constant 4096 : i64
-  %c8192 = arith.constant 8192 : i64
-  %c8224 = arith.constant 8224 : i64
-  %c9248 = arith.constant 9248 : i64
-  %c16 = arith.constant 16 : index
-  %c64 = arith.constant 64 : index
-  %c1 = arith.constant 1 : index
-  %c8 = arith.constant 8 : index
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 8.838835e-02 : f32
-  %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
-  %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
-  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %ret1__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
-  %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
-  pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
-  pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%scores_padded__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, f32) outs(%scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tcvt ins(%exp_scores__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
-  pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_fp32__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
-  %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-  %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
-  pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
-  %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
-  pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
-  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
-  pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
-  return
-  }
-}

From c5eff1ff9fb1197b961c34c85c752b843579c422 Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Wed, 8 Apr 2026 17:12:49 +0800
Subject: [PATCH 10/16] test: emit mixed wrappers in kernel TU

---
 .../scripts/generate_testcase.py              | 88 +++++++++----------
 1 file changed, 43 insertions(+), 45 deletions(-)

diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py
index f1e3aef77..bf5dedf79 100644
--- a/test/npu_validation/scripts/generate_testcase.py
+++ b/test/npu_validation/scripts/generate_testcase.py
@@ -226,6 +226,24 @@ def _describe_kernel_source(text: str):
     }
 
 
+def _append_mixed_kernel_wrapper(
+    kernel_text: str,
+    kernel_name: str,
+    raw_params: list[str],
+    aic_name: str,
+    aiv_name: str,
+) -> str:
+    wrapper_call_args = ", ".join(_extract_cpp_name(param) for param in raw_params)
+    wrapper = (
+        "\n\n"
+        f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n"
+        f"    {aic_name}({wrapper_call_args});\n"
+        f"    {aiv_name}({wrapper_call_args});\n"
+        "}\n"
+    )
+    return kernel_text.rstrip() + wrapper
+
+
 def _is_gm_pointer_param(param: str) -> bool:
     return "__gm__" in param and "*" in param
 
@@ -1613,6 +1631,15 @@ def generate_testcase(
                     logical_elem_count=logical_elem_count,
                 )
 
+    if is_mixed_kernel:
+        kernel_text_out = _append_mixed_kernel_wrapper(
+            kernel_text_out,
+            kernel_name,
+            raw_params,
+            kernel_info["aic_name"],
+            kernel_info["aiv_name"],
+        )
+
     kernel_out = output_dir / f"{testcase}_kernel.cpp"
     kernel_out.write_text(_replace_includes(kernel_text_out), encoding="utf-8")
 
@@ -1631,51 +1658,22 @@ def generate_testcase(
     kernel_call_args_host = ", ".join(kernel_call_args_host)
     raw_params_host = [_rewrite_host_unsupported_types(p) for p in raw_params]
     launch_block_count = _infer_launch_block_count(raw_kernel_for_analysis, testcase)
-    if is_mixed_kernel:
-        wrapper_call_args = ", ".join([p["name"] for p in params])
-        launch_cpp = (
-            INCLUDE_REPLACEMENT
-            + "\n"
-            "#if defined(__CCE_AICORE__)\n"
-            f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params)});\n"
-            f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params)});\n"
-            f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n"
-            f"    {kernel_info['aic_name']}({wrapper_call_args});\n"
-            f"    {kernel_info['aiv_name']}({wrapper_call_args});\n"
-            "}\n"
-            "#else\n"
-            f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params_host)});\n"
-            f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params_host)});\n"
-            f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)}) {{\n"
-            f"    {kernel_info['aic_name']}({wrapper_call_args});\n"
-            f"    {kernel_info['aiv_name']}({wrapper_call_args});\n"
-            "}\n"
-            "#endif\n\n"
-            f"void {launch_name}({launch_fn_params}) {{\n"
-            "#if defined(__CCE_AICORE__)\n"
-            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n"
-            "#else\n"
-            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n"
-            "#endif\n"
-            f"}}\n"
-        )
-    else:
-        launch_cpp = (
-            INCLUDE_REPLACEMENT
-            + "\n"
-            "#if defined(__CCE_AICORE__)\n"
-            f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n"
-            "#else\n"
-            f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n"
-            "#endif\n\n"
-            f"void {launch_name}({launch_fn_params}) {{\n"
-            "#if defined(__CCE_AICORE__)\n"
-            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n"
-            "#else\n"
-            f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n"
-            "#endif\n"
-            f"}}\n"
-        )
+    launch_cpp = (
+        INCLUDE_REPLACEMENT
+        + "\n"
+        "#if defined(__CCE_AICORE__)\n"
+        f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n"
+        "#else\n"
+        f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n"
+        "#endif\n\n"
+        f"void {launch_name}({launch_fn_params}) {{\n"
+        "#if defined(__CCE_AICORE__)\n"
+        f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n"
+        "#else\n"
+        f"    {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n"
+        "#endif\n"
+        f"}}\n"
+    )
     (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8")
 
     # pto-isa selects instruction implementations based on MEMORY_BASE vs

From 941c760bb1d7c5e8afd88669a3f48ff85321f31b Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Thu, 9 Apr 2026 09:23:04 +0800
Subject: [PATCH 11/16] test: restore full qwen3 tilelet PTO set

---
 test/samples/Qwen3Tilelet/README.md           |  10 +-
 .../qwen3_decode_layer_incore_0.pto           |  23 ++++
 .../qwen3_decode_layer_incore_11.pto          | 118 ++++++++++++++++++
 .../qwen3_decode_layer_incore_12.pto          |  31 +++++
 .../qwen3_decode_layer_incore_15.pto          |  47 +++++++
 .../qwen3_decode_layer_incore_16.pto          |  49 ++++++++
 .../qwen3_decode_layer_incore_17.pto          | 104 +++++++++++++++
 .../qwen3_decode_layer_incore_18.pto          |  75 +++++++++++
 .../qwen3_decode_layer_incore_19.pto          |  36 ++++++
 .../qwen3_decode_layer_incore_3.pto           |  45 +++++++
 .../qwen3_decode_layer_incore_4.pto           |  46 +++++++
 .../qwen3_decode_layer_incore_5.pto           |  46 +++++++
 .../qwen3_decode_layer_incore_6.pto           |  88 +++++++++++++
 .../qwen3_decode_layer_incore_7.pto           |  92 ++++++++++++++
 .../qwen3_decode_layer_incore_8.pto           |  30 +++++
 .../qwen3_decode_layer_incore_9.pto           |  49 ++++++++
 16 files changed, 882 insertions(+), 7 deletions(-)
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
 create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto

diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md
index 010e75623..b713c4e28 100644
--- a/test/samples/Qwen3Tilelet/README.md
+++ b/test/samples/Qwen3Tilelet/README.md
@@ -5,10 +5,6 @@ Scope:
 - A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS`
 
 Notes:
-- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs:
-  `qwen3_decode_layer_incore_1`, `qwen3_decode_layer_incore_2`,
-  `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`,
-  `qwen3_decode_layer_incore_14`.
-- This sample directory vendors only those direct `ptoas` regression inputs, regenerated from the tilelet source with `BATCH_TILE=16`.
-- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation.
-- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `<case>_golden.py`.
+- The source PyPTO program lowers to 20 `qwen3_decode_layer_incore_*.pto` fragments; this directory vendors the full emitted `.pto` set regenerated from the tilelet source with `BATCH_TILE=16`.
+- `test/npu_validation/scripts/generate_testcase.py` appends a `__global__` mixed-kernel wrapper that calls the paired `_aic`/`_aiv` entrypoints to the generated `<case>_kernel.cpp` for board validation when the lowered fragment contains split cube/vector entrypoints.
+- Custom golden assets currently exist only for the board-validation cases that need them and live beside the `.pto` files as `<case>_golden.py`.
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
new file mode 100644
index 000000000..856f60659
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto
@@ -0,0 +1,23 @@
+module attributes {pto.target_arch = "a5"} {  // Kernel summary: builds a 16x64 f32 tile from the constant 0.0 via texpands, converts it to bf16, and stores the f32 tile into %arg1 and the bf16 tile into %arg0 at column offset %arg2.
+  func.func @qwen3_decode_layer_incore_0(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // %arg0: bf16 buffer, %arg1: f32 buffer, %arg2: column offset used by both partition views below
+  %c0i = arith.constant 0 : i64  // tile address used for %zero_q__tile
+  %c4096 = arith.constant 4096 : i64  // tile address used for %zero_attn__tile
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index  // column count and row stride of both tensor views
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32  // scalar operand of texpands
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %attn_out__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg0 viewed as 16x8192 bf16 with strides [8192, 1]
+  %q_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg1 viewed as 16x8192 f32 with strides [8192, 1]
+  %zero_q__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%zero_q__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // presumably broadcasts the scalar 0.0 across the whole tile -- TODO confirm texpands semantics
+  %zero_attn__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): 4096 = 16*64*4, i.e. directly after the f32 tile if tile addresses are byte offsets -- confirm
+  pto.tcvt ins(%zero_q__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%zero_attn__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion of the tile, round mode ROUND
+  %q_proj__iter_v1_pview = pto.partition_view %q_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>  // 16x64 window of the f32 view at offsets [0, %arg2]
+  pto.tstore ins(%zero_q__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>)  // store the f32 tile into that window
+  %attn_out__iter_v1_pview = pto.partition_view %attn_out__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>  // 16x64 window of the bf16 view at offsets [0, %arg2]
+  pto.tstore ins(%zero_attn__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_out__iter_v1_pview : !pto.partition_tensor_view<16x64xbf16>)  // store the bf16 tile into that window
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
new file mode 100644
index 000000000..9a8a29a01
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto
@@ -0,0 +1,118 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_11(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<f32>, %arg7: !pto.ptr<f32>, %arg8: !pto.ptr<f32>, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c32 = arith.constant 32 : i64
+  %c64 = arith.constant 64 : i64
+  %c96 = arith.constant 96 : i64
+  %c128 = arith.constant 128 : i64
+  %c4224 = arith.constant 4224 : i64
+  %c8320 = arith.constant 8320 : i64
+  %c12416 = arith.constant 12416 : i64
+  %c12448 = arith.constant 12448 : i64
+  %c12480 = arith.constant 12480 : i64
+  %c12512 = arith.constant 12512 : i64
+  %c12544 = arith.constant 12544 : i64
+  %c12576 = arith.constant 12576 : i64
+  %c12608 = arith.constant 12608 : i64
+  %c8 = arith.constant 8 : index
+  %c1 = arith.constant 1 : index
+  %7 = arith.constant 128 : index
+  %c16 = arith.constant 16 : index
+  %c0 = arith.constant 0 : index
+  %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %li__phi_v5 = pto.alloc_tile addr = %c12416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %mi__phi_v5 = pto.alloc_tile addr = %c12448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %oi__phi_v5 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %8 = arith.cmpi eq, %arg9, %c0 : index
+  scf.if %8 {
+    %oi__ssa_v3 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__ssa_v3 = pto.alloc_tile addr = %c12416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %mi__ssa_v3 = pto.alloc_tile addr = %c12448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%li__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  } else {
+    %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi_new__tile = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v5 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %alpha__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c12480 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v10 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v10 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%beta__row_major_tmp_v12 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %beta__tile = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c12512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v15 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v15 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c12544 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v18 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v18 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %2 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c12608 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__row_major_tmp_v21 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%li__row_major_tmp_v21 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %3 = pto.alloc_tile addr = %c12576 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%4, %5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi__ssa_v4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmov ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tmov ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  }
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%li__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%mi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tstore ins(%oi__phi_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
new file mode 100644
index 000000000..a9c4f9bee
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto
@@ -0,0 +1,31 @@
+module attributes {pto.target_arch = "a5"} { // One vector-kind kernel for arch "a5": tload two f32 tiles, trowexpanddiv, tcvt to bf16, tstore. Notes are trailing-only so the enclosing patch hunk keeps its line count.
+  func.func @qwen3_decode_layer_incore_12(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // %arg0: bf16 buffer that is stored to; %arg1/%arg2: f32 buffers that are loaded; %arg3: index multiplied by 128 to form the store column offset
+  %c0i = arith.constant 0 : i64 // the i64 constants are used only as alloc_tile addr operands
+  %c32 = arith.constant 32 : i64
+  %c4128 = arith.constant 4128 : i64 // 4128 = 32 + 8*128*4
+  %c1 = arith.constant 1 : index // the index constants feed view shapes, strides, offsets, sizes and the offset multiply
+  %c8192 = arith.constant 8192 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c1024 = arith.constant 1024 : index
+  %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c32) and type as %oi__tile
+  pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // presumably divides each row of %oi__tile by the matching %li__tile element (going by the op name) - TODO confirm against the PTO op definition
+  %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c32) again, now typed 1x1024 (8*128 = 1024 elements)
+  %ctx_flat_bf16__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0> // assumes addr is a byte offset, which would place this right after the 4096-byte f32 tile at %c32 - TODO confirm
+  pto.tcvt ins(%ctx_flat__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx_flat_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %0 = arith.muli %arg3, %c128 : index // store column offset = %arg3 * 128
+  %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x1024xbf16> // NOTE(review): offset advances 128 columns per unit of %arg3 while 1024 columns are written - confirm the units of %arg3 with the caller
+  pto.tstore ins(%ctx_flat_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
new file mode 100644
index 000000000..a45c9a509
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto
@@ -0,0 +1,47 @@
+module attributes {pto.target_arch = "a5"} { // One vector-kind kernel for arch "a5": accumulate tmul(x, x) + trowsum over 64 column chunks, then tmuls, tadds, trsqrt and tstore. Notes are trailing-only so the enclosing patch hunk keeps its line count.
+  func.func @qwen3_decode_layer_incore_15(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // %arg0: f32 buffer viewed as 16x8192 and loaded chunk by chunk; %arg1: f32 buffer viewed as 1x16 and stored to
+  %c0i = arith.constant 0 : i64 // the i64 constants are used only as alloc_tile addr operands
+  %c64 = arith.constant 64 : i64
+  %c8256 = arith.constant 8256 : i64 // 8256 = 64 + 16*128*4
+  %c16448 = arith.constant 16448 : i64 // 16448 = 8256 + 16*128*4
+  %c16512 = arith.constant 16512 : i64 // 16512 = 16448 + 16*4
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32 // operand of the texpands that initialises %sq_sum__tile
+  %c0 = arith.constant 0 : index
+  %5 = arith.constant 64 : index // loop upper bound: 64 chunks * 128 columns = 8192 columns
+    %c128 = arith.constant 128 : index
+  %cst_1 = arith.constant 1.220703e-04 : f32 // ~= 1/8192; multiplied into the accumulated tile after the loop
+  %cst_2 = arith.constant 1.000000e-06 : f32 // added after that scaling, before trsqrt
+  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %sq_sum__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%sq_sum__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // presumably fills the tile with the scalar %cst (0.0) - TODO confirm texpands semantics
+  scf.for %kb__idx_v0 = %c0 to %5 step %c1 {
+    %6 = arith.muli %kb__idx_v0, %c128 : index // column offset of this chunk
+    %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %6], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c64) and type as %x_chunk__tile
+    pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // both inputs are the same tile
+    %tmp_tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // not written anywhere in this function before trowsum reads it - presumably scratch space for the reduction, TODO confirm
+    %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowsum ins(%t__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c16448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c16448) as %0, typed 1x16 row_major instead of 16x1 col_major
+    %2 = pto.alloc_tile addr = %c16512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%sq_sum__tile, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %sq_sum__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c0i) as %sq_sum__tile
+    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sq_sum__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // presumably carries the running sum into the next iteration via the shared address - TODO confirm
+  }
+  %3 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> // %3, %4 and %inv_rms__tile all use addr operand %c0i, the same as %sq_sum__tile
+  pto.tmuls ins(%sq_sum__tile, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadds ins(%3, %cst_2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.trsqrt ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x16xf32>
+  pto.tstore ins(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<1x16xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
new file mode 100644
index 000000000..f9fa660d1
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto
@@ -0,0 +1,49 @@
+module attributes {pto.target_arch = "a5"} { // One vector-kind kernel for arch "a5" with two 64-iteration loops: the first stores a texpands-filled tile to every chunk of %arg0, the second loads chunks of %arg4/%arg3, applies trowexpandmul + tcolexpandmul, converts to bf16 and stores to %arg2. Notes are trailing-only so the enclosing patch hunk keeps its line count.
+  func.func @qwen3_decode_layer_incore_16(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // stored to: %arg0 (f32, 16x8192 view) and %arg2 (bf16, 16x8192 view); loaded from: %arg1 (1x16), %arg3 (1x8192), %arg4 (16x8192)
+  %c0i = arith.constant 0 : i64 // the i64 constants are used only as alloc_tile addr operands
+  %c64 = arith.constant 64 : i64
+  %c8256 = arith.constant 8256 : i64 // 8256 = 64 + 16*128*4
+  %c8768 = arith.constant 8768 : i64 // 8768 = 8256 + 128*4
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %2 = arith.constant 64 : index // upper bound of both loops: 64 chunks * 128 columns = 8192 columns
+    %c128 = arith.constant 128 : index
+    %cst = arith.constant 0.000000e+00 : f32 // operand of the texpands in the first loop
+  %down_proj_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %post_norm_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %post_rms_weight__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg4, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %inv_rms__ssa_v0_pview = pto.partition_view %inv_rms__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x16xf32>
+  pto.tload ins(%inv_rms__ssa_v0_pview : !pto.partition_tensor_view<1x16xf32>) outs(%inv_rms__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=16, v_row=1, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // loaded once, before either loop
+  scf.for %zi__idx_v0 = %c0 to %2 step %c1 { // loop 1: store the texpands-produced tile to each 16x128 chunk of the %arg0 view
+    %3 = arith.muli %zi__idx_v0, %c128 : index // column offset of this chunk
+    %down_zero_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texpands ins(%cst : f32) outs(%down_zero_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // presumably fills the tile with the scalar %cst (0.0) - TODO confirm texpands semantics
+    %down_proj_tile__iter_v1_pview = pto.partition_view %down_proj_tile__ssa_v0_view, offsets = [%c0, %3], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tstore ins(%down_zero_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_proj_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>)
+  }
+  scf.for %kb__idx_v0 = %c0 to %2 step %c1 { // loop 2: per 128-column chunk, load, scale, convert to bf16 and store
+    %4 = arith.muli %kb__idx_v0, %c128 : index // column offset shared by the two loads and the store below
+    %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %gamma__tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %post_rms_weight__ssa_v0_pview = pto.partition_view %post_rms_weight__ssa_v0_view, offsets = [%c0, %4], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+    pto.tload ins(%post_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c0i) as %inv_rms__tile, typed 16x1 col_major instead of 1x16 row_major
+    %0 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c64) and type as %x_chunk__tile
+    pto.trowexpandmul ins(%x_chunk__tile, %t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=1, v_row=16, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // presumably multiplies each row of the chunk by the matching %t__tile element (going by the op name) - TODO confirm
+    %normed__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c64) again
+    pto.tcolexpandmul ins(%0, %gamma__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%normed__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // presumably multiplies each column by the matching %gamma__tile element (going by the op name) - TODO confirm
+    %1 = pto.alloc_tile addr = %c8768 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // assumes addr is a byte offset, which would place this right after the 512-byte %gamma__tile - TODO confirm
+    pto.tcvt ins(%normed__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %post_norm_tile__iter_v1_pview = pto.partition_view %post_norm_tile__ssa_v0_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%post_norm_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
new file mode 100644
index 000000000..ae6570c56
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto
@@ -0,0 +1,104 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_17_aic(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // Cube-kind half of incore_17: for each of 64 K-blocks it multiplies a 16x128 chunk of %arg0 by 128x64 chunks of %arg1 and %arg2 and pushes both f32 products to the _aiv peer.
+  %c0i = arith.constant 0 : i64 // the i64 constants in this function are used only as alloc_tile addresses
+  %c4096 = arith.constant 4096 : i64
+  %c20480 = arith.constant 20480 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c25600 = arith.constant 25600 : index
+  %c64 = arith.constant 64 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // not referenced again in this function; the store to %arg3 happens in the _aiv peer
+  %qwen3_decode_layer_incore_17_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_17_aiv} -> i32 // the buffer of this name is declared by pto.reserve_buffer in the _aiv peer
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { // 64 iterations of 128 elements cover the 8192-long dimension shared by %arg0 (columns) and the weights (rows)
+    %1 = arith.muli %kb__idx_v0, %c128 : index // element offset of this block: column offset into %arg0, row offset into %arg1/%arg2
+    %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16> // %arg4 is the column offset of the 64-wide weight slice
+    pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0> // loc=left/right/acc tiles all reuse address %c0i; presumably each loc is its own address space - TODO confirm
+    pto.tmov ins(%post_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%post_chunk__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%wg__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wg__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0} // first push of the iteration: activation chunk x w_gate chunk
+    %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0> // same loc=right address as %wg__tile_Right, which is no longer read after the first matmul
+    pto.tmov ins(%wu__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%wu__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0} // second push of the iteration: activation chunk x w_up chunk
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_17_aiv(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // Vector-kind half of incore_17: sums the tiles popped from the _aic peer into gate/up accumulators over 64 iterations, then computes gate * 1/(1+exp(-gate)) * up, converts to bf16 and stores the 16x64 result to %arg3.
+  %c32768 = arith.constant 32768 : i64 // the i64 constants in this function are used only as alloc_tile addresses
+  %c36864 = arith.constant 36864 : i64
+  %c45056 = arith.constant 45056 : i64
+  %c49152 = arith.constant 49152 : i64
+  %c40960 = arith.constant 40960 : i64
+  %c53248 = arith.constant 53248 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c25600 = arith.constant 25600 : index
+  %c64 = arith.constant 64 : index
+  %c0_i32 = arith.constant 0 : i32
+  %cst = arith.constant 0.000000e+00 : f32 // initial value written into both accumulators
+  %c0 = arith.constant 0 : index
+  %cst_1 = arith.constant 1.000000e+00 : f32 // the 1.0 added to exp(-gate) after the loop
+  %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // this view and the two weight views below are not referenced again in this function
+  %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_17_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", size = 32768, location = #pto.address_space<vec>, auto = false, base = 0} -> i32 // base 0, size 32768; every alloc_tile address below is >= 32768
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
+  %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // presumably fills the whole tile with the scalar 0.0 - TODO confirm texpands semantics
+  %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { // same trip count as the _aic loop; two pops here per iteration match its two pushes
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // presumably the gate product (first push of the _aic iteration) - assumes in-order delivery, TODO confirm
+    %0 = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%gate_acc__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0} // issued only after the popped tile has been consumed by the tadd above
+    %1 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // presumably the up product (second push of the _aic iteration)
+    %2 = pto.alloc_tile addr = %c49152 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%up_acc__tile, %1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
+    %gate_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %gate_acc__tile: the tmov below carries the new sum into the next iteration
+    pto.tmov ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gate_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %up_acc__tile
+    pto.tmov ins(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%up_acc__tile_mv : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  }
+  %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // %t__tile, %3 and %4 share address %c40960, so neg, exp and +1 run in place
+  pto.tneg ins(%gate_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %3 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %4 = pto.alloc_tile addr = %c40960 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tadds ins(%3, %cst_1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // reuses the address of the loop temporary %0
+  pto.trecip ins(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %5 = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %gate_acc__tile: gate * sigmoid(gate) overwrites the accumulator
+  pto.tmul ins(%gate_acc__tile, %sigmoid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tmul ins(%5, %up_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c53248 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcvt ins(%mlp_chunk__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16> // covers the whole 16x64 output view
+  pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
new file mode 100644
index 000000000..3228a9f80
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto
@@ -0,0 +1,75 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_18_aic(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // Cube-kind half of incore_18: for 4 consecutive 128-wide column blocks of %arg2 it multiplies the 16x64 tile of %arg1 by a 64x128 weight chunk and pushes each f32 product to the _aiv peer.
+  %c0i = arith.constant 0 : i64 // the i64 constants in this function are used only as alloc_tile addresses
+  %c16384 = arith.constant 16384 : i64
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // not referenced again in this function; %arg0 is read and written by the _aiv peer
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_18_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_18_aiv} -> i32 // the buffer of this name is declared by pto.reserve_buffer in the _aiv peer
+  pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { // 4 column blocks per call, starting at block %arg3 * 4
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index // column-block index = %arg3 * 4 + loop index
+    %2 = arith.muli %1, %c1 : index // multiply by 1: value unchanged
+    %3 = arith.addi %c0, %2 : index // add 0: value unchanged
+    %4 = arith.muli %3, %c128 : index // column offset in elements (block index * 128)
+    %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16> // %arg4 is the row offset of the 64-row weight slice
+    pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16> // NOTE(review): offsets are loop-invariant, so the same 16x64 tile is reloaded on every iteration
+    pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0> // loc=left/right/acc tiles all reuse address %c0i; presumably each loc is its own address space - TODO confirm
+    pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%w_down_chunk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%t__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0} // one push per iteration, matched by one tpop_from_aic in the _aiv loop
+  }
+  return
+  }
+  func.func @qwen3_decode_layer_incore_18_aiv(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // Vector-kind half of incore_18: for the same 4 column blocks as the _aic peer it loads a 16x128 f32 tile of %arg0, adds the tile popped from the peer, and stores the sum back to the same location.
+  %c65536 = arith.constant 65536 : i64 // used only as an alloc_tile address
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c25600 = arith.constant 25600 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // this view and the %arg2 view below are not referenced again in this function
+  %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %qwen3_decode_layer_incore_18_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", size = 65536, location = #pto.address_space<vec>, auto = false, base = 0} -> i32 // base 0, size 65536; the only alloc_tile address below is 65536
+  pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32)
+  scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { // same trip count and offset arithmetic as the _aic peer loop
+    %0 = arith.muli %arg3, %c4 : index
+    %1 = arith.addi %0, %dob__ci_idx_v0 : index // column-block index = %arg3 * 4 + loop index
+    %2 = arith.muli %1, %c1 : index // multiply by 1: value unchanged
+    %3 = arith.addi %c0, %2 : index // add 0: value unchanged
+    %4 = arith.muli %3, %c128 : index // column offset in elements (block index * 128)
+    %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %down_prev__tile, so the tadd below writes in place
+    pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0} // issued only after the popped tile has been consumed by the tadd above
+    %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32> // same offsets and sizes as the partition loaded above: the store overwrites what was read
+    pto.tstore ins(%down_next__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.partition_tensor_view<16x128xf32>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
new file mode 100644
index 000000000..776c7aed2
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto
@@ -0,0 +1,36 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_19(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // Vector kernel: for 4 column blocks it adds a 16x128 f32 tile of %arg0 to the matching tile of %arg2, converts the sum to bf16 and stores it into %arg1 at row offset %arg3.
+  %c0i = arith.constant 0 : i64 // the i64 constants in this function are used only as alloc_tile addresses
+  %c8192 = arith.constant 8192 : i64 // address constant; the index-typed 8192 used for shapes is %2 below
+  %c16384 = arith.constant 16384 : i64
+  %c16 = arith.constant 16 : index
+  %2 = arith.constant 8192 : index // second dimension and row stride of all three tensor views
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+    %c128 = arith.constant 128 : index
+  %down_proj_tile__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %out__co_l0_iter_v3_view = pto.make_tensor_view %arg1, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { // 4 column blocks per call, starting at block %arg4 * 4
+    %3 = arith.muli %arg4, %c4 : index
+    %4 = arith.addi %3, %ob__ci_idx_v0 : index // column-block index = %arg4 * 4 + loop index
+    %5 = arith.muli %4, %c1 : index // multiply by 1: value unchanged
+    %6 = arith.addi %c0, %5 : index // add 0: value unchanged
+    %7 = arith.muli %6, %c128 : index // column offset in elements (block index * 128)
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %down_proj_tile__rv_v5_pview = pto.partition_view %down_proj_tile__rv_v5_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%down_proj_tile__rv_v5_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
+    pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %down_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same address as %t__tile, so the tadd below writes in place
+    pto.tadd ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%down_acc__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%down_acc__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %out__co_l1_iter_v3_pview = pto.partition_view %out__co_l0_iter_v3_view, offsets = [%arg3, %7], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16> // NOTE(review): the view is declared 16 rows tall and the partition is 16 rows, so a nonzero %arg3 would exceed the declared shape - confirm callers pass 0 or that the shape is nominal
+    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%out__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x128xbf16>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
new file mode 100644
index 000000000..f8bccc6ac
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto
@@ -0,0 +1,45 @@
+module attributes {pto.target_arch = "a5"} { // module attribute sets pto.target_arch to "a5"
+  func.func @qwen3_decode_layer_incore_3(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // cube kernel: multiplies the 16x8192 bf16 view of %arg0 by a 64-column slice (starting at column %arg4) of the 8192x8192 bf16 view of %arg2, in 64 K-blocks of 128, and stores a 16x64 f32 tile into the %arg1 view at [%arg3, %arg4]
+  %c0i = arith.constant 0 : i64 // i64 zero; used only as an alloc_tile address
+  %c4096 = arith.constant 4096 : i64 // i64; used only as the address of the weight (B) mat tiles
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index // innermost stride, and loop lower bound / step below
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index // K-block width
+  %c64 = arith.constant 64 : index // output column count per call, and loop upper bound below
+  %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // activation input: 16x8192, row stride 8192
+  %q_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // f32 output: 16x8192, row stride 8192
+  %wq__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // weight input: 8192x8192, row stride 8192
+  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16> // K-block 0 of the activations; row offset is the constant 0, not %arg3
+  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_b__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16> // K-block 0 of the weights: rows 0..127, 64 columns starting at %arg4
+  pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) // mat -> left operand tile
+  %tile_b__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_b__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_b__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) // mat -> right operand tile; the destination type swaps blayout/slayout relative to the source
+  %q_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%tile_a__tile_Left, %tile_b__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%q_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // K-block 0: takes no accumulator input, writes %q_acc__tile
+  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { // K-blocks 1..63; block 0 was handled by the pto.tmatmul above
+    %1 = arith.muli %kb__idx_v0, %c128 : index // K offset of this block: activation column offset and weight row offset
+    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_b_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %3 = pto.partition_view %wq__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_b_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_b_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_b_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0> // same address (%c0i) and type as %q_acc__tile; presumably aliases that accumulator - TODO confirm alloc_tile aliasing semantics
+    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_b_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // %0 is both the accumulator input and the output
+  }
+  %q_proj__iter_v6_pview = pto.partition_view %q_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32> // NOTE(review): the view is declared with 16 rows and this partition takes 16 rows at row offset %arg3, so presumably callers pass %arg3 = 0 - confirm
+  pto.tstore ins(%q_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%q_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) // stores %q_acc__tile, not the loop-local %0; NOTE(review): the loop's contributions reach this store only if both acc tiles alias - confirm
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
new file mode 100644
index 000000000..9a2756f1c
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto
@@ -0,0 +1,46 @@
+module attributes {pto.target_arch = "a5"} { // module attribute sets pto.target_arch to "a5"
+  func.func @qwen3_decode_layer_incore_4(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // cube kernel: multiplies the 16x8192 bf16 view of %arg1 by a 64-column slice (starting at column %arg4) of the 8192x1024 bf16 view of %arg2, in 64 K-blocks of 128, and stores a 16x64 f32 tile into the %arg0 view at [%arg3, %arg4]
+  %c0i = arith.constant 0 : i64 // i64 zero; used only as an alloc_tile address
+  %c4096 = arith.constant 4096 : i64 // i64; used only as the address of the weight mat tiles
+  %c16 = arith.constant 16 : index
+  %c1024 = arith.constant 1024 : index
+  %c1 = arith.constant 1 : index // innermost stride, and loop lower bound / step below
+  %c8192 = arith.constant 8192 : index
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index // K-block width
+  %c64 = arith.constant 64 : index // output column count per call, and loop upper bound below
+  %k_proj__iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // f32 output: 16x1024, row stride 1024
+  %normed_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // activation input: 16x8192, row stride 8192
+  %wk__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // weight input: 8192x1024, row stride 1024
+  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16> // K-block 0 of the activations; row offset is the constant 0, not %arg3
+  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_wk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16> // K-block 0 of the weights: rows 0..127, 64 columns starting at %arg4
+  pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) // mat -> left operand tile
+  %tile_wk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_wk__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wk__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) // mat -> right operand tile; the destination type swaps blayout/slayout relative to the source
+  %k_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%tile_a__tile_Left, %tile_wk__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // K-block 0: takes no accumulator input, writes %k_acc__tile
+  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { // K-blocks 1..63; block 0 was handled by the pto.tmatmul above
+    %1 = arith.muli %kb__idx_v0, %c128 : index // K offset of this block: activation column offset and weight row offset
+    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wk_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %3 = pto.partition_view %wk__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wk_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_wk_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wk_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0> // same address (%c0i) and type as %k_acc__tile; presumably aliases that accumulator - TODO confirm alloc_tile aliasing semantics
+    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wk_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // %0 is both the accumulator input and the output
+  }
+  %k_proj__iter_v6_pview = pto.partition_view %k_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32> // NOTE(review): the view is declared with 16 rows and this partition takes 16 rows at row offset %arg3, so presumably callers pass %arg3 = 0 - confirm
+  pto.tstore ins(%k_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%k_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) // stores %k_acc__tile, not the loop-local %0; NOTE(review): the loop's contributions reach this store only if both acc tiles alias - confirm
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
new file mode 100644
index 000000000..db88c9a68
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto
@@ -0,0 +1,46 @@
+module attributes {pto.target_arch = "a5"} { // module attribute sets pto.target_arch to "a5"
+  func.func @qwen3_decode_layer_incore_5(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { // cube kernel: multiplies the 16x8192 bf16 view of %arg0 by a 64-column slice (starting at column %arg4) of the 8192x1024 bf16 view of %arg2, in 64 K-blocks of 128, and stores a 16x64 f32 tile into the %arg1 view at [%arg3, %arg4]
+  %c0i = arith.constant 0 : i64 // i64 zero; used only as an alloc_tile address
+  %c4096 = arith.constant 4096 : i64 // i64; used only as the address of the weight mat tiles
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c1 = arith.constant 1 : index // innermost stride, and loop lower bound / step below
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index // K-block width
+  %c64 = arith.constant 64 : index // output column count per call, and loop upper bound below
+  %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // activation input: 16x8192, row stride 8192
+  %v_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // f32 output: 16x1024, row stride 1024
+  %wv__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // weight input: 8192x1024, row stride 1024
+  %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16> // K-block 0 of the activations; row offset is the constant 0, not %arg3
+  pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_wv__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16> // K-block 0 of the weights: rows 0..127, 64 columns starting at %arg4
+  pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_a__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) // mat -> left operand tile
+  %tile_wv__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%tile_wv__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wv__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) // mat -> right operand tile; the destination type swaps blayout/slayout relative to the source
+  %v_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%tile_a__tile_Left, %tile_wv__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%v_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // K-block 0: takes no accumulator input, writes %v_acc__tile
+  scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { // K-blocks 1..63; block 0 was handled by the pto.tmatmul above
+    %1 = arith.muli %kb__idx_v0, %c128 : index // K offset of this block: activation column offset and weight row offset
+    %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+    pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wv_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %3 = pto.partition_view %wv__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+    pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_a_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_a_i__tile_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    %tile_wv_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    pto.tmov ins(%tile_wv_i__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%tile_wv_i__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0> // same address (%c0i) and type as %v_acc__tile; presumably aliases that accumulator - TODO confirm alloc_tile aliasing semantics
+    pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wv_i__tile_Right : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) // %0 is both the accumulator input and the output
+  }
+  %v_proj__iter_v6_pview = pto.partition_view %v_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32> // NOTE(review): the view is declared with 16 rows and this partition takes 16 rows at row offset %arg3, so presumably callers pass %arg3 = 0 - confirm
+  pto.tstore ins(%v_acc__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%v_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) // stores %v_acc__tile, not the loop-local %0; NOTE(review): the loop's contributions reach this store only if both acc tiles alias - confirm
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
new file mode 100644
index 000000000..4443956bc
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto
@@ -0,0 +1,88 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @qwen3_decode_layer_incore_6(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<bf16>, %arg7: !pto.ptr<f32>, %arg8: index, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c256 = arith.constant 256 : i64
+  %c512 = arith.constant 512 : i64
+  %c768 = arith.constant 768 : i64
+  %c1024 = arith.constant 1024 : i64
+  %c1280 = arith.constant 1280 : i64
+  %c1536 = arith.constant 1536 : i64
+  %c1792 = arith.constant 1792 : i64
+  %c2048 = arith.constant 2048 : i64
+  %c2176 = arith.constant 2176 : i64
+  %c2688 = arith.constant 2688 : i64
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c524288 = arith.constant 524288 : index
+  %c128 = arith.constant 128 : index
+  %c16 = arith.constant 16 : index
+  %7 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+    %c4096 = arith.constant 4096 : index
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %k_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %k_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %v_cache__iter_v1_view = pto.make_tensor_view %arg6, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %v_proj__rv_v5_view = pto.make_tensor_view %arg7, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {
+    %8 = arith.muli %ki__idx_v0, %c128 : index
+    %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %k_proj__rv_v5_pview = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %8], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%k_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %k_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %10 = arith.addi %8, %c64 : index
+    %9 = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %10], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%9 : !pto.partition_tensor_view<1x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rot_lo__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %11 = arith.muli %arg8, %c8 : index
+    %12 = arith.muli %11, %c4096 : index
+    %13 = arith.muli %ki__idx_v0, %c4096 : index
+    %14 = arith.addi %12, %13 : index
+    %15 = arith.addi %14, %arg9 : index
+    %3 = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%rot_lo__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x64xbf16>)
+    %4 = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%rot_hi__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %k_cache__tile_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c64], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%4 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__tile_pview : !pto.partition_tensor_view<1x64xbf16>)
+    %5 = pto.alloc_tile addr = %c2176 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %17 = arith.muli %ki__idx_v0, %c128 : index
+    %v_proj__rv_v5_pview = pto.partition_view %v_proj__rv_v5_view, offsets = [%arg8, %17], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
+    pto.tload ins(%v_proj__rv_v5_pview : !pto.partition_tensor_view<1x128xf32>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %6 = pto.alloc_tile addr = %c2688 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%5{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
+    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
new file mode 100644
index 000000000..2f80eb162
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto
@@ -0,0 +1,92 @@
+module attributes {pto.target_arch = "a5"} {  // Vector kernel: per-row mul/sub/add over 64-wide halves of q_proj, bf16 results stored into q_padded; then texpands(0.0) tiles stored to ret0/ret1/ret2
+  func.func @qwen3_decode_layer_incore_7(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: !pto.ptr<f32>, %arg7: !pto.ptr<f32>, %arg8: !pto.ptr<f32>, %arg9: index, %arg10: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64  // the i64 constants (through %c6432) are only used as alloc_tile addr operands
+  %c256 = arith.constant 256 : i64
+  %c512 = arith.constant 512 : i64
+  %c768 = arith.constant 768 : i64
+  %c1024 = arith.constant 1024 : i64
+  %c1280 = arith.constant 1280 : i64
+  %c1536 = arith.constant 1536 : i64
+  %c1792 = arith.constant 1792 : i64
+  %c2048 = arith.constant 2048 : i64
+  %c2176 = arith.constant 2176 : i64
+  %c2304 = arith.constant 2304 : i64
+  %c6400 = arith.constant 6400 : i64
+  %c6432 = arith.constant 6432 : i64
+  %c1 = arith.constant 1 : index  // the index constants feed view shapes/strides, partition offsets/sizes and the loop bounds
+  %c64 = arith.constant 64 : index
+  %c16 = arith.constant 16 : index
+  %c128 = arith.constant 128 : index
+  %c8192 = arith.constant 8192 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32  // scalar passed to the three texpands ops after the loop
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %q_padded__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %q_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  scf.for %qi__idx_v0 = %c0 to %c8 step %c1 {  // 8 iterations reusing the cos/sin tiles loaded above; NOTE(review): the mul/sub/add pattern looks like a rotary-embedding rotation - confirm
+    %5 = arith.addi %arg10, %qi__idx_v0 : index  // %arg10 + loop index
+    %6 = arith.muli %5, %c128 : index  // column offset into the q_proj view for the first 64-wide half
+    %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %q_proj__rv_v5_pview = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %6], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%q_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%q_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %q_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %8 = arith.addi %6, %c64 : index  // column offset of the second half (first-half offset + 64)
+    %7 = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %8], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+    pto.tload ins(%7 : !pto.partition_tensor_view<1x64xf32>) outs(%q_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c1536 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same addr as %t__tile, the first tsub input
+    pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rot_lo_bf16__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%1{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %2 = pto.alloc_tile addr = %c1280 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same addr as %q_hi__tile
+    pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %3 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same addr as %q_lo__tile
+    pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %4 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same addr as %3, the second tadd input
+    pto.tadd ins(%2, %3 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rot_hi_bf16__tile = pto.alloc_tile addr = %c2176 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%4{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %q_padded__iter_v1_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__iter_v1_pview : !pto.partition_tensor_view<1x64xbf16>)
+    %q_padded__tile_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c64], sizes = [%c1, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+    pto.tstore ins(%rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__tile_pview : !pto.partition_tensor_view<1x64xbf16>)
+  }
+  %oi__tile = pto.alloc_tile addr = %c2304 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %li_flat__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %li__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>  // 8x1 col_major tile at the same addr as %li_flat__tile; this one is stored to ret0
+  %mi_flat__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mi__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>  // 8x1 col_major tile at the same addr as %mi_flat__tile; this one is stored to ret1
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tstore ins(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
new file mode 100644
index 000000000..53988ea99
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} {  // Cube kernel: tmatmul of the 16x128 bf16 q_padded tile with a 128x64 bf16 k_cache tile (column offset %arg3); 16x64 f32 result stored to ret0
+  func.func @qwen3_decode_layer_incore_8(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+  %c0i = arith.constant 0 : i64  // the two i64 constants are only used as alloc_tile addr operands
+  %c16384 = arith.constant 16384 : i64  // 16384 = 128*64*2; NOTE(review): presumably a byte address right after the bf16 k tile at addr 0 - confirm
+  %c524288 = arith.constant 524288 : index  // index constants feed view shapes/strides and partition offsets/sizes
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xbf16>
+  %q_padded__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>
+  pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %q_padded__rv_v2_pview = pto.partition_view %q_padded__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>
+  pto.tload ins(%q_padded__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // addr 0 is reused for loc=mat/left/right/acc tiles; NOTE(review): presumably each loc is its own address space - confirm
+  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+  %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+  %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>
+  pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto
new file mode 100644
index 000000000..eb677daf6
--- /dev/null
+++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto
@@ -0,0 +1,49 @@
+module attributes {pto.target_arch = "a5"} {  // Vector kernel: scales an 8x64 score tile, then trowmax / trowexpandsub / texp / trowsum; stores bf16 exp values plus the row-sum (ret0) and row-max (ret1) tiles
+  func.func @qwen3_decode_layer_incore_9(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64  // the i64 constants (through %c9248) are only used as alloc_tile addr operands
+  %c2048 = arith.constant 2048 : i64
+  %c4096 = arith.constant 4096 : i64
+  %c8192 = arith.constant 8192 : i64
+  %c8224 = arith.constant 8224 : i64
+  %c9248 = arith.constant 9248 : i64
+  %c16 = arith.constant 16 : index  // index constants feed view shapes/strides, partition offsets/sizes and valid-shape operands
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 8.838835e-02 : f32  // scalar operand of tmuls; numerically about 1/sqrt(128). NOTE(review): presumably the attention scale - confirm
+  %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %ret1__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+  pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // issued after the full 8x64 load: valid shape operands become (%c8, %arg4)
+  %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // NOTE(review): pad=3 here vs pad=0 on the tfillpad input; the meaning of the pad code is not visible in this file
+  pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%scores_padded__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // same addr as %scores_padded__tile
+  pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, f32) outs(%scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // same addr as %scores__tile
+  pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // same addr as %t__tile
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>
+  pto.tcvt ins(%exp_scores__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // bf16 values are converted back to f32 at the addr of %exp_scores__tile; trowsum reads this tile
+  pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_fp32__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)
+  %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same addr and type as %tmp_tile
+  %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
+  pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)
+  return
+  }
+}

From 53776e53ce0395905c632d3834d557d4477aeda0 Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Thu, 9 Apr 2026 12:00:48 +0800
Subject: [PATCH 12/16] test: harden mixed-kernel wrapper generation

---
 .../scripts/generate_testcase.py              | 230 +++++++++++++++++-
 1 file changed, 221 insertions(+), 9 deletions(-)

diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py
index bf5dedf79..f1afba770 100644
--- a/test/npu_validation/scripts/generate_testcase.py
+++ b/test/npu_validation/scripts/generate_testcase.py
@@ -152,6 +152,85 @@ def _find_matching_brace(text: str, open_brace_index: int) -> Optional[int]:
     return None
 
 
+def _extract_function_body(function_text: str) -> str:
+    """Return the stripped text between the first ``{`` and its matching ``}``.
+
+    A trailing ``return;`` is removed.  The result is an empty string when
+    there is no ``{`` or when ``_find_matching_brace`` returns ``None``.
+    """
+    brace_index = function_text.find("{")
+    if brace_index < 0:
+        return ""
+    end_index = _find_matching_brace(function_text, brace_index)
+    if end_index is None:
+        return ""
+    body = function_text[brace_index + 1:end_index].strip()
+    body = re.sub(r"\breturn\s*;\s*$", "", body, flags=re.S).rstrip()
+    return body
+
+
+def _strip_ptoas_auto_sync_tail(body: str) -> tuple[str, bool]:
+    """Drop a ``ptoas_auto_sync_tail(...);`` call that ends ``body``.
+
+    Returns the right-stripped body together with a flag that is true when
+    such a call was removed.
+    """
+    pattern = re.compile(
+        r"\n?\s*ptoas_auto_sync_tail\s*\([^;]*\)\s*;\s*$",
+        re.S,
+    )
+    updated = pattern.sub("", body.rstrip())
+    return updated.rstrip(), updated != body.rstrip()
+
+
+def _indent_block(text: str, spaces: int = 4) -> str:
+    """Prefix every non-empty line of ``text`` with ``spaces`` spaces."""
+    prefix = " " * spaces
+    return "\n".join((prefix + line) if line else "" for line in text.splitlines())
+
+
+def _split_cpp_args(text: str) -> list[str]:
+    """Split a C++ argument list at its top-level commas.
+
+    Commas nested inside ``<>``, ``()``, ``{}`` or ``[]`` do not split the
+    list.  Pieces are stripped and empty pieces are dropped.  String and
+    character literals are not recognised.
+    """
+    text = text.strip()
+    if not text:
+        return []
+    parts = []
+    depth_angle = 0
+    depth_paren = 0
+    depth_brace = 0
+    depth_bracket = 0
+    start = 0
+    for idx, ch in enumerate(text):
+        if ch == "<":
+            depth_angle += 1
+        elif ch == ">":
+            # The ">" of "->" is member access, not a closing angle bracket.
+            if idx == 0 or text[idx - 1] != "-":
+                depth_angle = max(depth_angle - 1, 0)
+        elif ch == "(":
+            depth_paren += 1
+        elif ch == ")":
+            depth_paren = max(depth_paren - 1, 0)
+        elif ch == "{":
+            depth_brace += 1
+        elif ch == "}":
+            depth_brace = max(depth_brace - 1, 0)
+        elif ch == "[":
+            depth_bracket += 1
+        elif ch == "]":
+            depth_bracket = max(depth_bracket - 1, 0)
+        elif ch == "," and depth_angle == 0 and depth_paren == 0 and depth_brace == 0 and depth_bracket == 0:
+            parts.append(text[start:idx].strip())
+            start = idx + 1
+    parts.append(text[start:].strip())
+    return [part for part in parts if part]
+
+
 def _extract_aicore_functions(text: str):
     pattern = re.compile(
         r"(?P<global>__global__\s+)?AICORE\s+void\s+(?P<name>\w+)\s*\((?P<params>[^)]*)\)\s*\{",
@@ -211,8 +271,8 @@ def _describe_kernel_source(text: str):
                 "raw_params": params,
                 "analysis_texts": [group["aic"]["text"], group["aiv"]["text"]],
                 "writer_texts": [group["aiv"]["text"]],
-                "aic_name": group["aic"]["name"],
-                "aiv_name": group["aiv"]["name"],
+                "aic_text": group["aic"]["text"],
+                "aiv_text": group["aiv"]["text"],
                 "call_text": group["aiv"]["text"],
             }
 
@@ -230,15 +290,167 @@ def _append_mixed_kernel_wrapper(
     kernel_text: str,
     kernel_name: str,
     raw_params: list[str],
-    aic_name: str,
-    aiv_name: str,
+    aic_text: str,  # AIC function text; its brace-delimited body is inlined into the wrapper
+    aiv_text: str,  # AIV function text; handled the same way
 ) -> str:
-    wrapper_call_args = ", ".join(_extract_cpp_name(param) for param in raw_params)
+    pipe_decl_pattern = re.compile(  # one-line `auto NAME = TPipe<...>(ARGS);` declarations
+        r"^(?P<indent>\s*)auto\s+(?P<name>\w+)\s*=\s*(?P<type>TPipe<[^;=]+>)\s*\((?P<args>[^;]*)\)\s*;\s*$",
+        re.M,
+    )
+    param_names = {_extract_cpp_name(param) for param in raw_params}  # names declared by the wrapper signature
+    safe_identifiers = {"nullptr", "NULL", "true", "false"}  # identifiers passed through without a lookup
+
+    def _find_decl_init(prefix: str, name: str):  # -> (type, init, offset) of the last `TYPE name = INIT;` line in prefix
+        pattern = re.compile(
+            rf"^\s*(?P<type>[^=\n;]+?)\s+{re.escape(name)}\s*=\s*(?P<init>[^;]+);\s*$",
+            re.M,
+        )
+        match = None
+        for current in pattern.finditer(prefix):  # keep only the last match
+            match = current
+        if match is None:
+            return None, None, None  # no such declaration in prefix
+        return match.group("type").strip(), match.group("init").strip(), match.start()
+
+    def _render_pointer_init(type_text: str, init_text: str) -> str:  # add a C-style cast for pointer types when needed
+        expr = init_text.strip()
+        if "*" not in type_text:
+            return expr
+        if expr.startswith("(") or expr.startswith("reinterpret_cast") or expr.startswith("static_cast"):
+            return expr  # already parenthesised or cast: left untouched
+        return f"({type_text}){expr}"
+
+    def _resolve_ctor_arg(arg_text: str, prefix: str, depth: int = 0):  # -> replacement text for one ctor arg, or None if unresolved
+        arg_text = arg_text.strip()
+        if not arg_text:
+            return None
+        if depth > 8:  # bound the chain of identifier-to-identifier lookups
+            return None
+        if not re.fullmatch(r"[A-Za-z_]\w*", arg_text):  # not a bare identifier
+            return arg_text  # NOTE(review): returned verbatim, may still mention kernel-local names - confirm
+        if arg_text in safe_identifiers:
+            return arg_text
+        if arg_text in param_names:
+            return arg_text
+        type_text, init_text, decl_start = _find_decl_init(prefix, arg_text)
+        if type_text is None or init_text is None:
+            return None
+        resolved_init = init_text
+        if (
+            re.fullmatch(r"[A-Za-z_]\w*", init_text)
+            and init_text not in param_names
+            and init_text not in safe_identifiers
+        ):
+            resolved_init = _resolve_ctor_arg(init_text, prefix[:decl_start], depth + 1)  # search only text before this declaration
+            if resolved_init is None:
+                return None
+        return _render_pointer_init(type_text, resolved_init)
+
+    def _extract_pipe_decls(body: str):  # pipe declarations in body whose ctor args all resolve
+        decls = []
+        for match in pipe_decl_pattern.finditer(body):
+            ctor_args = _split_cpp_args(match.group("args"))
+            prefix = body[:match.start()]  # text preceding the declaration
+            resolved_args = []
+            for arg in ctor_args:
+                resolved = _resolve_ctor_arg(arg, prefix)
+                if resolved is None:
+                    break  # one unresolved argument drops the whole declaration
+                resolved_args.append(resolved)
+            else:  # for-else: runs only when the loop finished without break
+                decls.append(
+                    {
+                        "name": match.group("name"),
+                        "type_text": match.group("type").strip(),
+                        "ctor_args": tuple(resolved_args),
+                        "span": match.span(),
+                    }
+                )
+        return decls
+
+    def _rewrite_body(body: str, replacements):  # delete the given declaration spans, then rename their variables
+        rewritten = body
+        for replacement in sorted(replacements, key=lambda item: item["span"][0], reverse=True):  # back to front keeps earlier offsets valid
+            start, end = replacement["span"]
+            rewritten = rewritten[:start] + rewritten[end:]
+        for replacement in replacements:
+            rewritten = re.sub(
+                rf"\b{re.escape(replacement['old_name'])}\b",
+                replacement["new_name"],
+                rewritten,
+            )
+        return rewritten.strip()
+
+    def _next_shared_name(seed: int, texts: list[str]) -> str:  # first __ptoas_shared_pipe<N>, N >= seed, not contained in any text
+        index = seed
+        while True:
+            name = f"__ptoas_shared_pipe{index}"
+            if all(name not in text for text in texts):
+                return name
+            index += 1
+
+    aic_body = _extract_function_body(aic_text)
+    aiv_body = _extract_function_body(aiv_text)
+    aic_body, aic_has_tail = _strip_ptoas_auto_sync_tail(aic_body)  # a tail call is re-emitted once at the end of the wrapper
+    aiv_body, aiv_has_tail = _strip_ptoas_auto_sync_tail(aiv_body)
+    aic_decls = _extract_pipe_decls(aic_body)
+    aiv_decls = _extract_pipe_decls(aiv_body)
+
+    shared_pairs = []  # (aic_decl, aiv_decl) pairs with equal type and resolved ctor args
+    aiv_by_key = {}
+    for decl in aiv_decls:
+        key = (decl["type_text"], decl["ctor_args"])
+        aiv_by_key.setdefault(key, []).append(decl)
+    for decl in aic_decls:
+        key = (decl["type_text"], decl["ctor_args"])
+        bucket = aiv_by_key.get(key)
+        if not bucket:
+            continue
+        shared_pairs.append((decl, bucket.pop(0)))  # first unused AIV declaration with the same key
+
+    shared_decls = []
+    aic_replacements = []
+    aiv_replacements = []
+    shared_seed = 0
+    texts_for_name_check = [kernel_text, aic_body, aiv_body]
+    for aic_decl, aiv_decl in shared_pairs:
+        shared_name = _next_shared_name(shared_seed, texts_for_name_check)
+        shared_seed += 1
+        texts_for_name_check.append(shared_name)  # reserve the name for later iterations
+        shared_decls.append(
+            f"  auto {shared_name} = {aic_decl['type_text']}({', '.join(aic_decl['ctor_args'])});"
+        )
+        aic_replacements.append(
+            {
+                "old_name": aic_decl["name"],
+                "new_name": shared_name,
+                "span": aic_decl["span"],
+            }
+        )
+        aiv_replacements.append(
+            {
+                "old_name": aiv_decl["name"],
+                "new_name": shared_name,
+                "span": aiv_decl["span"],
+            }
+        )
+
+    wrapper_blocks = []  # each non-empty rewritten body becomes its own brace scope
+    for body in (_rewrite_body(aic_body, aic_replacements), _rewrite_body(aiv_body, aiv_replacements)):
+        if not body:
+            continue
+        wrapper_blocks.append("  {\n" + _indent_block(body) + "\n  }")
+
+    if not wrapper_blocks:  # nothing to inline: kernel_text is returned unchanged
+        return kernel_text
+
     wrapper = (
         "\n\n"
         f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n"
-        f"    {aic_name}({wrapper_call_args});\n"
-        f"    {aiv_name}({wrapper_call_args});\n"
+        + ("\n".join(shared_decls) + ("\n\n" if shared_decls else ""))
+        + "\n".join(wrapper_blocks)
+        + ("\n  ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);" if (aic_has_tail or aiv_has_tail) else "")
+        + "\n"  # NOTE(review): the tail emitted above always uses kBarrierAll; the stripped calls' own arguments are dropped - confirm
         "}\n"
     )
     return kernel_text.rstrip() + wrapper
@@ -1636,8 +1848,8 @@ def generate_testcase(
             kernel_text_out,
             kernel_name,
             raw_params,
-            kernel_info["aic_name"],
-            kernel_info["aiv_name"],
+            kernel_info["aic_text"],
+            kernel_info["aiv_text"],
         )
 
     kernel_out = output_dir / f"{testcase}_kernel.cpp"

From df70b4ed49a4768aa468b91a4bfaca818257f3e6 Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Thu, 9 Apr 2026 14:32:59 +0800
Subject: [PATCH 13/16] fix(emitc): guard a5 nosplit vector pipe consumers

---
 lib/PTO/Transforms/PTOToEmitC.cpp             | 82 ++++++++++++++++++-
 .../basic/tpush_tpop_frontend_lowering_a5.pto |  2 +
 2 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp
index 43438a364..6c08d60d6 100644
--- a/lib/PTO/Transforms/PTOToEmitC.cpp
+++ b/lib/PTO/Transforms/PTOToEmitC.cpp
@@ -406,6 +406,7 @@ static Value emitCCast(ConversionPatternRewriter &rewriter, Location loc,
 static Value castSignlessIntToUnsignedSameWidth(ConversionPatternRewriter &rewriter,
                                                 Location loc, Value v,
                                                 unsigned bitWidth);
+static bool needsA5NoSplitVectorGuard(Operation *op);
 
 static FailureOr<std::string> getTileSplitToken(int64_t split) {
   switch (split) {
@@ -2517,6 +2518,9 @@ struct FuncToEmitC : public OpConversionPattern<func::FuncOp> {
       emitcFunc.setSpecifiersAttr(rewriter.getStrArrayAttr({"AICORE"}));
     }
 
+    std::optional<StringRef> kernelKindMacro = getKernelKindMacro(op);
+    bool needsNoSplitGuard = needsA5NoSplitVectorGuard(op.getOperation());
+
     // Inline the original body, then convert region/block argument types to
     // match the converted signature (also covers CFG blocks introduced by
     // pre-lowering, e.g. scf.while -> cf.br/cf.cond_br).
@@ -2531,8 +2535,6 @@ struct FuncToEmitC : public OpConversionPattern<func::FuncOp> {
                                            *getTypeConverter(), &entryConv)))
       return failure();
 
-    std::optional<StringRef> kernelKindMacro = getKernelKindMacro(op);
-
     // Preserve the existing function prologue shape. `kernel_kind` functions are
     // emitted with the same macro guard/reset sequence that used to come from
     // early pto.section wrapping, but only after SCF pre-lowering has finished.
@@ -2547,6 +2549,9 @@ struct FuncToEmitC : public OpConversionPattern<func::FuncOp> {
           rewriter.create<emitc::VerbatimOp>(op.getLoc(), "set_mask_norm();");
           rewriter.create<emitc::VerbatimOp>(op.getLoc(),
                                              "set_vector_mask(-1, -1);");
+          if (needsNoSplitGuard)
+            rewriter.create<emitc::VerbatimOp>(
+                op.getLoc(), "if (get_subblockid() == 0) {");
         }
       }
     }
@@ -2554,6 +2559,8 @@ struct FuncToEmitC : public OpConversionPattern<func::FuncOp> {
     if (kernelKindMacro) {
       Block &lastBlock = emitcFunc.getBody().back();
       rewriter.setInsertionPoint(lastBlock.getTerminator());
+      if (*kernelKindMacro == "__DAV_VEC__" && needsNoSplitGuard)
+        rewriter.create<emitc::VerbatimOp>(op.getLoc(), "}");
       std::string endMacro = "#endif // " + kernelKindMacro->str() + "\n";
       rewriter.create<emitc::VerbatimOp>(op.getLoc(), endMacro);
     }
@@ -8956,6 +8963,65 @@ class ArithCmpIToEmitC : public OpConversionPattern<arith::CmpIOp> {
 //===----------------------------------------------------------------------===//
 // Section Op Lowering
 //===----------------------------------------------------------------------===//
+/// Returns true when `op` is a `PipeOpTy` whose `getSplit()` equals 0.
+template <typename PipeOpTy>
+static bool isNoSplitPipeOpOfType(Operation *op) {
+  auto pipeOp = dyn_cast<PipeOpTy>(op);
+  return pipeOp && pipeOp.getSplit() == 0;
+}
+
+/// Returns true when `op` is one of the tpush/tpop/tfree pipe ops listed
+/// below and its `getSplit()` equals 0.
+static bool isA5NoSplitPipeOp(Operation *op) {
+  return isNoSplitPipeOpOfType<pto::TPushOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TPopOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TFreeOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TPushToAivOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TPushToAicOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TPopFromAicOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TPopFromAivOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TFreeFromAicOp>(op) ||
+         isNoSplitPipeOpOfType<pto::TFreeFromAivOp>(op);
+}
+
+/// Returns true when `op`, or any operation nested in it, is a
+/// `pto::GetSubBlockIdxOp` or `pto::GetSubBlockNumOp`.
+static bool hasExplicitSubblockControl(Operation *op) {
+  WalkResult outcome = op->walk([](Operation *nested) {
+    return isa<pto::GetSubBlockIdxOp, pto::GetSubBlockNumOp>(nested)
+               ? WalkResult::interrupt()
+               : WalkResult::advance();
+  });
+  return outcome.wasInterrupted();
+}
+
+/// Tells the lowering whether the body of `op` must be wrapped in
+/// `if (get_subblockid() == 0) { ... }`. True only when the target arch is
+/// A5, `op` is a vector scope (a `pto::SectionVectorOp`, or a `func::FuncOp`
+/// whose kernel-kind attribute is Vector), nothing inside it matches
+/// `hasExplicitSubblockControl`, and it contains an op accepted by
+/// `isA5NoSplitPipeOp`.
+static bool needsA5NoSplitVectorGuard(Operation *op) {
+  if (getTargetArch(op) != PTOArch::A5)
+    return false;
+
+  bool vectorScope = isa<pto::SectionVectorOp>(op);
+  if (auto funcOp = dyn_cast<func::FuncOp>(op)) {
+    auto kindAttr = funcOp->getAttrOfType<FunctionKernelKindAttr>(
+        FunctionKernelKindAttr::name);
+    if (kindAttr)
+      vectorScope = kindAttr.getKernelKind() == FunctionKernelKind::Vector;
+  }
+  if (!vectorScope || hasExplicitSubblockControl(op))
+    return false;
+
+  WalkResult outcome = op->walk([](Operation *nested) {
+    return isA5NoSplitPipeOp(nested) ? WalkResult::interrupt()
+                                     : WalkResult::advance();
+  });
+  return outcome.wasInterrupted();
+}
+
 template <typename SectionOpTy>
 struct SectionToEmitC : public OpConversionPattern<SectionOpTy> {
   using OpConversionPattern<SectionOpTy>::OpConversionPattern;
@@ -8972,6 +9041,7 @@ struct SectionToEmitC : public OpConversionPattern<SectionOpTy> {
   matchAndRewrite(SectionOpTy op, typename SectionOpTy::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
+    bool needsNoSplitGuard = needsA5NoSplitVectorGuard(op.getOperation());
 
     std::string startMacro = "\n#if defined(" + getMacroName() + ")";
     rewriter.create<emitc::VerbatimOp>(loc, startMacro);
@@ -8984,11 +9054,19 @@ struct SectionToEmitC : public OpConversionPattern<SectionOpTy> {
       rewriter.create<emitc::VerbatimOp>(loc, "set_vector_mask(-1, -1);");
     }
 
+    if (needsNoSplitGuard) {
+      rewriter.create<emitc::VerbatimOp>(
+          loc, "if (get_subblockid() == 0) {");
+    }
+
     Block &innerBlock = op.getBody().front();
     if (!innerBlock.empty()) {
       rewriter.inlineBlockBefore(&innerBlock, op.getOperation(), ValueRange{});
     }
 
+    if (needsNoSplitGuard)
+      rewriter.create<emitc::VerbatimOp>(loc, "}");
+
     std::string endMacro = "#endif // " + getMacroName() + "\n";
     rewriter.create<emitc::VerbatimOp>(loc, endMacro);
 
diff --git a/test/basic/tpush_tpop_frontend_lowering_a5.pto b/test/basic/tpush_tpop_frontend_lowering_a5.pto
index 84e20b799..9787217f6 100644
--- a/test/basic/tpush_tpop_frontend_lowering_a5.pto
+++ b/test/basic/tpush_tpop_frontend_lowering_a5.pto
@@ -66,6 +66,7 @@ module {
 // A5: TFREE<TPipe<0, Direction::DIR_BOTH, 1024, 4>, TileSplitAxis::TILE_NO_SPLIT>(
 
 // A5-LABEL: AICORE void vector_kernel(
+// A5: if (get_subblockid() == 0) {
 // A5: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_BOTH, 1024, 4>(
 // A5: Tile<TileType::Vec, float, 16, 16, BLayout::RowMajor, 16, 16, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> {{v[0-9]+}};
 // A5: Tile<TileType::Vec, float, 16, 16, BLayout::ColMajor, 16, 16, SLayout::RowMajor, 512, PadValue::Null, CompactMode::Null> {{v[0-9]+}};
@@ -75,3 +76,4 @@ module {
 // A5: Tile<TileType::Vec, float, 16, 16, BLayout::RowMajor, 16, 16, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> {{v[0-9]+}};
 // A5: TNEG(
 // A5: TFREE<TPipe<0, Direction::DIR_BOTH, 1024, 4>, TileSplitAxis::TILE_NO_SPLIT>(
+// A5: }

From f0bb59deb2d8252322f5ef4bdf76e2d89cdf5c4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=89=A2=E5=A4=A7?= <laoda@laodadeMacBook-Air.local>
Date: Thu, 9 Apr 2026 16:59:15 +0800
Subject: [PATCH 14/16] test: allow qwen3 tilelet cases on a3

---
 .github/workflows/ci.yml            | 2 +-
 test/samples/Qwen3Tilelet/README.md | 2 +-
 test/samples/runop.sh               | 8 ++------
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5e7b3bdf1..7e11b5fbf 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -293,7 +293,7 @@ jobs:
           # suite (RUN_ONLY_CASES is empty), skip the non-matching variant based
           # on SOC_VERSION to keep the remote validation portable.
           A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync"
-          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,qwen3_decode_layer_incore_1,qwen3_decode_layer_incore_2,qwen3_decode_layer_incore_10,qwen3_decode_layer_incore_13,qwen3_decode_layer_incore_14"
+          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5"
 
           sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')"
           is_a5=0
diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md
index b713c4e28..f3fbcdb95 100644
--- a/test/samples/Qwen3Tilelet/README.md
+++ b/test/samples/Qwen3Tilelet/README.md
@@ -2,7 +2,7 @@ Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_
 
 Scope:
 - compile-regression inputs for `ptoas`
-- A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS`
+- tilelet kernels that default to `--pto-arch a5 --pto-level=level3` in `runop.sh`, but can also be compiled on A3 when the caller overrides `PTOAS_FLAGS`
 
 Notes:
 - The source PyPTO program lowers to 20 `qwen3_decode_layer_incore_*.pto` fragments; this directory vendors the full emitted `.pto` set regenerated from the tilelet source with `BATCH_TILE=16`.
diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index 8ac6921a7..4d0d38c81 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -154,8 +154,8 @@ process_one_dir() {
     use_ptobc_roundtrip=1
   fi
   # Qwen3 tilelet kernels currently serve as direct ptoas compile-regression
-  # coverage. They require A5/level3 lowering, but are not expected to
-  # roundtrip through ptobc yet.
+  # coverage. Default them to A5/level3 lowering when the caller does not
+  # provide an explicit arch, but let A3/A5 callers override PTOAS_FLAGS.
   if [[ "$A" == "Qwen3Tilelet" ]]; then
     use_ptobc_roundtrip=0
   fi
@@ -942,10 +942,6 @@ PY
       if [[ "$A" == "Qwen3Tilelet" ]]; then
         cpp="${out_subdir}/${base}-pto.cpp"
       fi
-      if [[ "$A" == "Qwen3Tilelet" && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then
-        echo -e "${A}(${base}.pto)\tSKIP\trequires --pto-arch=a5"
-        continue
-      fi
       local sample_use_ptobc_roundtrip="$use_ptobc_roundtrip"
 
       # TODO(ptobc): decode of this regression currently fails with

From 4421585430bf4375949a5bfe91f961b29b409322 Mon Sep 17 00:00:00 2001
From: HecreReed <821896444@qq.com>
Date: Mon, 13 Apr 2026 11:49:31 +0800
Subject: [PATCH 15/16] test: mark qwen tilelet samples as a5-only

---
 .github/workflows/ci.yml | 23 ++++++++++++-----------
 test/samples/runop.sh    | 14 +++++++++++++-
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7e11b5fbf..e27c1c2dc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -288,12 +288,15 @@ jobs:
             fi
           fi
 
-          # Some validation samples have A3 vs A5 variants due to stricter
-          # pto-isa static checks on Ascend950 (A5). When running the full test
-          # suite (RUN_ONLY_CASES is empty), skip the non-matching variant based
-          # on SOC_VERSION to keep the remote validation portable.
+          # Some validation samples are arch-specific due to stricter pto-isa
+          # static checks and A5-only tile layouts. Always skip the
+          # non-matching variant based on SOC_VERSION, even for explicit
+          # RUN_ONLY_CASES requests, so remote validation does not try to force
+          # A5-only cases through an A3 flow or vice versa.
           A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync"
-          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5"
+          QWEN3_TILELET_A5_ONLY_CASES="$(printf 'qwen3_decode_layer_incore_%s,' {0..19})"
+          QWEN3_TILELET_A5_ONLY_CASES="${QWEN3_TILELET_A5_ONLY_CASES%,}"
+          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,${QWEN3_TILELET_A5_ONLY_CASES}"
 
           sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')"
           is_a5=0
@@ -301,12 +304,10 @@ jobs:
             is_a5=1
           fi
 
-          if [[ -z "${RUN_ONLY_CASES}" ]]; then
-            if [[ ${is_a5} -eq 1 ]]; then
-              SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A3_ONLY_CASES}"
-            else
-              SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A5_ONLY_CASES}"
-            fi
+          if [[ ${is_a5} -eq 1 ]]; then
+            SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A3_ONLY_CASES}"
+          else
+            SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A5_ONLY_CASES}"
           fi
 
           echo "STAGE=${STAGE}" >> "${GITHUB_ENV}"
diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index 4d0d38c81..866bcdd89 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -155,7 +155,8 @@ process_one_dir() {
   fi
   # Qwen3 tilelet kernels currently serve as direct ptoas compile-regression
   # coverage. Default them to A5/level3 lowering when the caller does not
-  # provide an explicit arch, but let A3/A5 callers override PTOAS_FLAGS.
+  # provide an explicit arch, and skip them entirely when the caller forces an
+  # A3 lowering path because the samples use A5-only matmul tile layouts.
   if [[ "$A" == "Qwen3Tilelet" ]]; then
     use_ptobc_roundtrip=0
   fi
@@ -221,6 +222,20 @@ process_one_dir() {
     echo -e "${A}\tSKIP\tMissing dir: $dir"
     return 0
   fi
+  # Qwen3Tilelet is only run when the target arch is a5. For any other arch,
+  # report each .pto sample in the directory as skipped and stop here.
+  if [[ "$A" == "Qwen3Tilelet" ]] &&
+     [[ "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then
+    local pto_sample
+    for pto_sample in "$dir"/*.pto; do
+      # Pass over non-files (e.g. an unmatched glob) and *-pto-ir.pto files.
+      if [[ ! -f "$pto_sample" || "$pto_sample" == *-pto-ir.pto ]]; then
+        continue
+      fi
+      echo -e "${A}($(basename "$pto_sample"))\tSKIP\trequires --pto-arch=a5"
+    done
+    return 0
+  fi
 
   # Run every .py file in this directory (no requirement that name matches folder).
   local f mlir ptobc_file decoded_pto cpp base overall=0

From b5c7fc455f12ad009edddac5f4ad33d65fdca113 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=89=A2=E5=A4=A7?= <laoda@laodadeMacBook-Air.local>
Date: Mon, 13 Apr 2026 15:02:00 +0800
Subject: [PATCH 16/16] test: preserve qwen tilelet level3 under arch overrides

---
 test/samples/runop.sh | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index 866bcdd89..fde9e29bb 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -180,6 +180,7 @@ process_one_dir() {
 
   local target_arch="a3"
   local has_pto_arch_override=0
+  local has_pto_level_override=0
   if ((${#ptoas_flags[@]})); then
     for ((idx=0; idx<${#ptoas_flags[@]}; ++idx)); do
       if [[ "${ptoas_flags[idx]}" == "--pto-arch" && $((idx + 1)) -lt ${#ptoas_flags[@]} ]]; then
@@ -188,12 +189,21 @@ process_one_dir() {
       elif [[ "${ptoas_flags[idx]}" == --pto-arch=* ]]; then
         target_arch="${ptoas_flags[idx]#--pto-arch=}"
         has_pto_arch_override=1
+      elif [[ "${ptoas_flags[idx]}" == "--pto-level" && $((idx + 1)) -lt ${#ptoas_flags[@]} ]]; then
+        has_pto_level_override=1
+      elif [[ "${ptoas_flags[idx]}" == --pto-level=* ]]; then
+        has_pto_level_override=1
       fi
     done
   fi
-  if [[ "$A" == "Qwen3Tilelet" && $has_pto_arch_override -eq 0 ]]; then
-    ptoas_flags+=(--pto-arch a5 --pto-level=level3)
-    target_arch="a5"
+  if [[ "$A" == "Qwen3Tilelet" ]]; then
+    if [[ $has_pto_arch_override -eq 0 ]]; then
+      ptoas_flags+=(--pto-arch a5)
+      target_arch="a5"
+    fi
+    if [[ $has_pto_level_override -eq 0 ]]; then
+      ptoas_flags+=(--pto-level=level3)
+    fi
   fi
   local expected_vec_barrier="pipe_barrier(PIPE_V)"
   local skip_vec_barrier=0