From 18caf5fd579de19af037c6635bed9a0bdaf91c8a Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 2 Apr 2026 17:01:26 +0800 Subject: [PATCH 01/16] test: add qwen3 scope2 pto kernels --- .github/workflows/ci.yml | 11 +- test/samples/Qwen3Scope2/README.md | 11 ++ .../Qwen3Scope2/decode_attention_incore_0.pto | 25 ++++ .../Qwen3Scope2/decode_attention_incore_1.pto | 58 +++++++++ .../decode_attention_incore_10.pto | 30 +++++ .../decode_attention_incore_11.pto | 111 ++++++++++++++++++ .../decode_attention_incore_12.pto | 28 +++++ .../Qwen3Scope2/decode_attention_incore_2.pto | 39 ++++++ .../Qwen3Scope2/decode_attention_incore_3.pto | 26 ++++ .../Qwen3Scope2/decode_attention_incore_4.pto | 64 ++++++++++ .../Qwen3Scope2/decode_attention_incore_5.pto | 30 +++++ .../Qwen3Scope2/decode_attention_incore_6.pto | 18 +++ .../Qwen3Scope2/decode_attention_incore_7.pto | 30 +++++ .../Qwen3Scope2/decode_attention_incore_8.pto | 49 ++++++++ .../Qwen3Scope2/decode_attention_incore_9.pto | 18 +++ test/samples/runop.sh | 11 +- 16 files changed, 555 insertions(+), 4 deletions(-) create mode 100644 test/samples/Qwen3Scope2/README.md create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_0.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_1.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_10.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_11.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_12.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_2.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_3.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_4.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_5.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_6.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_7.pto create mode 100644 
test/samples/Qwen3Scope2/decode_attention_incore_8.pto create mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_9.pto diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb4749ec0..01ab24d35 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ on: skip_cases: description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)" type: string - default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp" + default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,decode_attention_incore_0,decode_attention_incore_1,decode_attention_incore_2,decode_attention_incore_3,decode_attention_incore_4,decode_attention_incore_5,decode_attention_incore_6,decode_attention_incore_7,decode_attention_incore_8,decode_attention_incore_9,decode_attention_incore_10,decode_attention_incore_11,decode_attention_incore_12" run_only_cases: description: "Comma/space separated testcase names to run (empty = run all)" type: string @@ -261,7 +261,14 @@ jobs: # Temporary CI gate: skip cases that still error/flap on the remote NPU. # Update this list as we fix the underlying issues. 
DEFAULT_SKIP_CASES: >- - mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp + mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp, + decode_attention_incore_0,decode_attention_incore_1, + decode_attention_incore_2,decode_attention_incore_3, + decode_attention_incore_4,decode_attention_incore_5, + decode_attention_incore_6,decode_attention_incore_7, + decode_attention_incore_8,decode_attention_incore_9, + decode_attention_incore_10,decode_attention_incore_11, + decode_attention_incore_12 steps: - name: Resolve validation parameters shell: bash diff --git a/test/samples/Qwen3Scope2/README.md b/test/samples/Qwen3Scope2/README.md new file mode 100644 index 000000000..978e54ad0 --- /dev/null +++ b/test/samples/Qwen3Scope2/README.md @@ -0,0 +1,11 @@ +Qwen3 scope2 PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_scope2.py`. + +Scope: +- compile-regression inputs for `ptoas` +- A5-only kernels; `runop.sh` injects `--pto-arch a5` for this directory unless the caller already overrides `PTOAS_FLAGS` + +Notes: +- The source PyPTO program lowers to 13 kernel-level `.pto` files plus an orchestration C++ file. +- This sample directory vendors only the kernel `.pto` inputs. +- No custom `golden.py` or `compare.py` is included in this draft because those are tied to the full orchestration flow, not to individual kernel-only `.pto` files. +- The existing `test/npu_validation/scripts/generate_testcase.py` flow can still auto-generate generic validation assets for these kernels when needed. 
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_0.pto b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto new file mode 100644 index 000000000..d9df6b9eb --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto @@ -0,0 +1,25 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_0(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %k_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { + %1 = arith.muli %ki__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %k_proj__ssa_v0_pview = pto.partition_view %k_proj__ssa_v0_view, offsets = [%arg2, %1], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> + pto.tload ins(%k_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%0 : !pto.tile_buf) + %k_group__iter_v1_pview = pto.partition_view %k_group__ssa_v0_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tstore ins(%0 : !pto.tile_buf) outs(%k_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>) + } + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_1.pto 
b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto new file mode 100644 index 000000000..d79076b4f --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto @@ -0,0 +1,58 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c768 = arith.constant 768 : i64 + %c1024 = arith.constant 1024 : i64 + %c3072 = arith.constant 3072 : i64 + %c5120 = arith.constant 5120 : i64 + %c7168 = arith.constant 7168 : i64 + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_rot_tensor__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_hi__ssa_v0_pview : 
!pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) + %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) + %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf + %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) + %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf + %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) + %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + %k_group__rv_v2_pview = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%k_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%k_lo__tile : !pto.tile_buf) + %k_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf + %3 = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%k_hi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf + pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf + pto.tcolexpandmul 
ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf + pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%rot_lo__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf + pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tadd ins(%1, %2 : !pto.tile_buf, !pto.tile_buf) outs(%rot_hi__tile : !pto.tile_buf) + %k_rot_tensor__ssa_v0_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tstore ins(%rot_lo__tile : !pto.tile_buf) outs(%k_rot_tensor__ssa_v0_pview : !pto.partition_tensor_view<8x64xf32>) + %k_rot_tensor__tile_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tstore ins(%rot_hi__tile : !pto.tile_buf) outs(%k_rot_tensor__tile_pview : !pto.partition_tensor_view<8x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_10.pto b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto new file mode 100644 index 000000000..142c570b8 --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto @@ -0,0 +1,30 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_10(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c524288 = 
arith.constant 524288 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> + pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%v_tile__tile : !pto.tile_buf) outs(%v_tile__tile_Right : !pto.tile_buf) + %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%oi_tmp_pad__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>) + return + 
} +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_11.pto b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto new file mode 100644 index 000000000..17eae5c2b --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto @@ -0,0 +1,111 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_11(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c32 = arith.constant 32 : i64 + %c64 = arith.constant 64 : i64 + %c96 = arith.constant 96 : i64 + %c128 = arith.constant 128 : i64 + %c4224 = arith.constant 4224 : i64 + %c8320 = arith.constant 8320 : i64 + %c8352 = arith.constant 8352 : i64 + %c8384 = arith.constant 8384 : i64 + %c8416 = arith.constant 8416 : i64 + %c8448 = arith.constant 8448 : i64 + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view 
%cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf + %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %arg6, %c0 : index + %li__phi_v5, %mi__phi_v5, %oi__phi_v5 = scf.if %8 -> 
(!pto.tensor_view, !pto.tensor_view, !pto.tensor_view) { + %oi__ssa_v3 = pto.alloc_tile addr = %c4224 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c32 : !pto.tile_buf + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_li__tile : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<8x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_mi__tile : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<8x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tstore ins(%oi_tmp__tile : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<8x128xf32>) + scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view, !pto.tensor_view, !pto.tensor_view + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c8352 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c8352 : !pto.tile_buf + %alpha__row_major_tmp_v7 = 
pto.alloc_tile addr = %c8352 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c8384 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c8384 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c8384 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c8384 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c8384 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c8352 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c8416 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c8416 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c8384 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c8416 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c8416 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + 
%3 = pto.alloc_tile addr = %c8416 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %13 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%3 : !pto.tile_buf) outs(%13 : !pto.partition_tensor_view<8x1xf32>) + %15 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%mi_new__tile : !pto.tile_buf) outs(%15 : !pto.partition_tensor_view<8x1xf32>) + %17 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tstore ins(%6 : !pto.tile_buf) outs(%17 : !pto.partition_tensor_view<8x128xf32>) + scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view, !pto.tensor_view, !pto.tensor_view + } + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_12.pto b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto new file mode 100644 index 000000000..124078522 --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto @@ -0,0 +1,28 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_12(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c32 = arith.constant 32 : i64 + %c1 = arith.constant 1 : index + %c8192 = arith.constant 
8192 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) + %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf, !pto.tile_buf) outs(%ctx__tile : !pto.tile_buf) + %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %0 = arith.muli %arg3, %c128 : index + %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view -> !pto.partition_tensor_view<1x1024xf32> + pto.tstore ins(%ctx_flat__tile : !pto.tile_buf) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xf32>) + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_2.pto b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto new file mode 100644 index 000000000..5419f419a --- /dev/null +++ 
b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto @@ -0,0 +1,39 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_2(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c512 = arith.constant 512 : i64 + %c524288 = arith.constant 524288 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c4096 = arith.constant 4096 : index + %k_cache__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_rot_tensor__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { + %2 = arith.muli %arg4, %c8 : index + %3 = arith.muli %2, %c4096 : index + %4 = arith.muli %ki__idx_v0, %c4096 : index + %5 = arith.addi %3, %4 : index + %6 = arith.addi %5, %arg5 : index + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %k_rot_tensor__ssa_v2_pview = pto.partition_view %k_rot_tensor__ssa_v2_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%k_rot_tensor__ssa_v2_pview : !pto.partition_tensor_view<1x128xf32>) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c512 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%0 : !pto.tile_buf) + %k_cache__iter_v3_pview = 
pto.partition_view %k_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> + pto.tstore ins(%0 : !pto.tile_buf) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) + %1 = pto.alloc_tile addr = %c512 : !pto.tile_buf + %7 = arith.muli %ki__idx_v0, %c128 : index + %v_proj__ssa_v0_pview = pto.partition_view %v_proj__ssa_v0_view, offsets = [%arg4, %7], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> + pto.tload ins(%v_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%1 : !pto.tile_buf) + %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> + pto.tstore ins(%1 : !pto.tile_buf) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) + } + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_3.pto b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto new file mode 100644 index 000000000..143c98a4b --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto @@ -0,0 +1,26 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_3(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index + %q_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + scf.for %qi__idx_v0 = %c0 to %c8 step %c1 { + 
%1 = arith.addi %arg3, %qi__idx_v0 : index + %2 = arith.muli %1, %c128 : index + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %q_proj__ssa_v0_pview = pto.partition_view %q_proj__ssa_v0_view, offsets = [%arg2, %2], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> + pto.tload ins(%q_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%0 : !pto.tile_buf) + %q_group__iter_v1_pview = pto.partition_view %q_group__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tstore ins(%0 : !pto.tile_buf) outs(%q_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>) + } + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_4.pto b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto new file mode 100644 index 000000000..9de52a73c --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto @@ -0,0 +1,64 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_4(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c768 = arith.constant 768 : i64 + %c1024 = arith.constant 1024 : i64 + %c3072 = arith.constant 3072 : i64 + %c5120 = arith.constant 5120 : i64 + %c7168 = arith.constant 7168 : i64 + %c9216 = arith.constant 9216 : i64 + %c10240 = arith.constant 10240 : i64 + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = 
#pto.layout}: !pto.tensor_view + %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_rot_bf16__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) + %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) + %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf + %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) + %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf + %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload 
ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) + %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + %q_group__rv_v2_pview = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%q_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%q_lo__tile : !pto.tile_buf) + %q_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf + %3 = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%q_hi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf + pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf + pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %q_rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf + pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%q_rot_lo__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf + pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %q_rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tadd ins(%1, %2 : !pto.tile_buf, !pto.tile_buf) outs(%q_rot_hi__tile : !pto.tile_buf) + %q_rot_lo_bf16__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf + pto.tcvt ins(%q_rot_lo__tile{rmode = #pto} : !pto.tile_buf) outs(%q_rot_lo_bf16__tile : !pto.tile_buf) + %q_rot_hi_bf16__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf + pto.tcvt 
ins(%q_rot_hi__tile{rmode = #pto} : !pto.tile_buf) outs(%q_rot_hi_bf16__tile : !pto.tile_buf) + %q_rot_bf16__ssa_v0_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> + pto.tstore ins(%q_rot_lo_bf16__tile : !pto.tile_buf) outs(%q_rot_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) + %q_rot_bf16__tile_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> + pto.tstore ins(%q_rot_hi_bf16__tile : !pto.tile_buf) outs(%q_rot_bf16__tile_pview : !pto.partition_tensor_view<8x64xbf16>) + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_5.pto b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto new file mode 100644 index 000000000..28ad1932e --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto @@ -0,0 +1,30 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_5(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c4128 = arith.constant 4128 : i64 + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %ret0__out_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret2__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf) + %li_flat__tile = pto.alloc_tile 
addr = %c4096 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %mi_flat__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tstore ins(%oi__tile : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_6.pto b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto new file mode 100644 index 000000000..26e9555c4 --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto @@ -0,0 +1,18 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_6(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %q_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_rot_bf16__ssa_v2_view = pto.make_tensor_view %arg1, shape = 
[%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_bf16_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %q_rot_bf16__ssa_v2_pview = pto.partition_view %q_rot_bf16__ssa_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xbf16> + pto.tload ins(%q_rot_bf16__ssa_v2_pview : !pto.partition_tensor_view<8x128xbf16>) outs(%q_bf16_tile__tile : !pto.tile_buf) + %q_padded__ssa_v0_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xbf16> + pto.tstore ins(%q_bf16_tile__tile : !pto.tile_buf) outs(%q_padded__ssa_v0_pview : !pto.partition_tensor_view<8x128xbf16>) + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_7.pto b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto new file mode 100644 index 000000000..282f797e5 --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto @@ -0,0 +1,30 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_7(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 + %c524288 = arith.constant 524288 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout}: !pto.tensor_view + %q_padded__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + 
%k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %q_padded__ssa_v1_pview = pto.partition_view %q_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%q_padded__ssa_v1_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%k_tile__tile : !pto.tile_buf) outs(%k_tile__tile_Right : !pto.tile_buf) + %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%raw_scores_pad__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_8.pto b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto new file mode 100644 index 000000000..f968b1627 --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto @@ -0,0 +1,49 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_attention_incore_8(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c2048 = arith.constant 2048 : i64 + %c4096 = 
arith.constant 4096 : i64 + %c8192 = arith.constant 8192 : i64 + %c8224 = arith.constant 8224 : i64 + %c9248 = arith.constant 9248 : i64 + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 8.838835e-02 : f32 + %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret2__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf + %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf) + pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf + %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf) outs(%scores_padded__tile : !pto.tile_buf) + %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf, f32) outs(%scores__tile : !pto.tile_buf) + %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%cur_mi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + 
pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.texp ins(%t__tile : !pto.tile_buf) outs(%exp_scores__tile : !pto.tile_buf) + %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf + pto.tcvt ins(%exp_scores__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_bf16__tile : !pto.tile_buf) + %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_fp32__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf + pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%cur_li__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> + pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x64xbf16>) + return + } +} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_9.pto b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto new file mode 100644 index 000000000..0c16cfc61 --- /dev/null +++ b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto @@ -0,0 +1,18 @@ +module attributes {pto.target_arch = "a5"} { + 
func.func @decode_attention_incore_9(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %exp_scores_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %exp_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %exp_scores_bf16__ssa_v0_pview = pto.partition_view %exp_scores_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> + pto.tload ins(%exp_scores_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) outs(%exp_tile__tile : !pto.tile_buf) + %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> + pto.tstore ins(%exp_tile__tile : !pto.tile_buf) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) + return + } +} diff --git a/test/samples/runop.sh b/test/samples/runop.sh index a57efc8a6..a3839b029 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -19,7 +19,7 @@ PYTHON_BIN="${PYTHON_BIN:-}" PTOAS_OUT_DIR="${PTOAS_OUT_DIR:-}" PTOAS_ENABLE_INSERT_SYNC="${PTOAS_ENABLE_INSERT_SYNC:-1}" PTOAS_FLAGS="${PTOAS_FLAGS:-}" -PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync}" +PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3Scope2}" ENABLE_BC=0 usage() { @@ -36,7 +36,7 @@ Env: PTOAS_OUT_DIR # where generated *.mlir/*.cpp go (optional; defaults to a temp dir) PTOAS_FLAGS # extra flags passed to ptoas (e.g. 
--enable-insert-sync) PTOAS_ENABLE_INSERT_SYNC # 1 to append --enable-insert-sync to PTOAS_FLAGS (default: 1) - PTO_PTO_DIRS # space-separated dirs to run .pto directly (default: Sync) + PTO_PTO_DIRS # space-separated dirs to run .pto directly (default: Sync Qwen3Scope2) Flags: --enablebc # enable: python -> .pto -> ptobc -> .pto -> ptoas @@ -172,15 +172,22 @@ process_one_dir() { fi local target_arch="a3" + local has_pto_arch_override=0 if ((${#ptoas_flags[@]})); then for ((idx=0; idx<${#ptoas_flags[@]}; ++idx)); do if [[ "${ptoas_flags[idx]}" == "--pto-arch" && $((idx + 1)) -lt ${#ptoas_flags[@]} ]]; then target_arch="${ptoas_flags[idx + 1]}" + has_pto_arch_override=1 elif [[ "${ptoas_flags[idx]}" == --pto-arch=* ]]; then target_arch="${ptoas_flags[idx]#--pto-arch=}" + has_pto_arch_override=1 fi done fi + if [[ "$A" == "Qwen3Scope2" && $has_pto_arch_override -eq 0 ]]; then + ptoas_flags+=(--pto-arch a5 --pto-level=level3) + target_arch="a5" + fi local expected_vec_barrier="pipe_barrier(PIPE_V)" local skip_vec_barrier=0 if [[ "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" == "a5" ]]; then From 45dbf6790cf89059981cfd5330eee588addaf539 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 2 Apr 2026 19:27:17 +0800 Subject: [PATCH 02/16] fix(test): skip ptobc roundtrip for qwen scope2 --- test/samples/runop.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index a3839b029..b8c02ff00 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -153,6 +153,12 @@ process_one_dir() { if [[ "${ENABLE_BC}" == "1" ]]; then use_ptobc_roundtrip=1 fi + # Qwen3 scope2 kernels currently serve as direct ptoas compile-regression + # coverage. They require A5/level3 lowering, but are not expected to + # roundtrip through ptobc yet. 
+ if [[ "$A" == "Qwen3Scope2" ]]; then + use_ptobc_roundtrip=0 + fi local -a ptoas_flags=() if [[ -n "${PTOAS_FLAGS}" ]]; then # shellcheck disable=SC2206 @@ -910,7 +916,6 @@ PY if [[ "$base" == "test_if_else_tile_result" ]]; then sample_use_ptobc_roundtrip=0 fi - if [[ $sample_use_ptobc_roundtrip -eq 1 ]]; then # Allow generic escape for ops that are not yet in the compact v0 opcode table. if ! PTOBC_ALLOW_GENERIC=1 "$ptobc" encode "$f" -o "$ptobc_file" >/dev/null 2>&1; then From 14b4c236f3ff74e824d4ec7a5b1c06df909b8533 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 09:57:11 +0800 Subject: [PATCH 03/16] test: switch qwen PTO samples to tilelet --- test/samples/Qwen3Scope2/README.md | 11 -- .../Qwen3Scope2/decode_attention_incore_0.pto | 25 --- .../Qwen3Scope2/decode_attention_incore_1.pto | 58 ------- .../decode_attention_incore_10.pto | 30 ---- .../decode_attention_incore_11.pto | 111 ------------- .../decode_attention_incore_12.pto | 28 ---- .../Qwen3Scope2/decode_attention_incore_2.pto | 39 ----- .../Qwen3Scope2/decode_attention_incore_3.pto | 26 --- .../Qwen3Scope2/decode_attention_incore_4.pto | 64 -------- .../Qwen3Scope2/decode_attention_incore_5.pto | 30 ---- .../Qwen3Scope2/decode_attention_incore_6.pto | 18 --- .../Qwen3Scope2/decode_attention_incore_7.pto | 30 ---- .../Qwen3Scope2/decode_attention_incore_8.pto | 49 ------ .../Qwen3Scope2/decode_attention_incore_9.pto | 18 --- test/samples/Qwen3Tilelet/README.md | 13 ++ .../qwen3_decode_layer_incore_1.pto | 116 ++++++++++++++ .../qwen3_decode_layer_incore_10.pto | 108 +++++++++++++ .../qwen3_decode_layer_incore_13.pto | 116 ++++++++++++++ .../qwen3_decode_layer_incore_14.pto | 73 +++++++++ .../qwen3_decode_layer_incore_2.pto | 148 ++++++++++++++++++ test/samples/runop.sh | 17 +- 21 files changed, 586 insertions(+), 542 deletions(-) delete mode 100644 test/samples/Qwen3Scope2/README.md delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_0.pto delete mode 
100644 test/samples/Qwen3Scope2/decode_attention_incore_1.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_10.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_11.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_12.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_2.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_3.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_4.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_5.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_6.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_7.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_8.pto delete mode 100644 test/samples/Qwen3Scope2/decode_attention_incore_9.pto create mode 100644 test/samples/Qwen3Tilelet/README.md create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto diff --git a/test/samples/Qwen3Scope2/README.md b/test/samples/Qwen3Scope2/README.md deleted file mode 100644 index 978e54ad0..000000000 --- a/test/samples/Qwen3Scope2/README.md +++ /dev/null @@ -1,11 +0,0 @@ -Qwen3 scope2 PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_scope2.py`. - -Scope: -- compile-regression inputs for `ptoas` -- A5-only kernels; `runop.sh` injects `--pto-arch a5` for this directory unless the caller already overrides `PTOAS_FLAGS` - -Notes: -- The source PyPTO program lowers to 13 kernel-level `.pto` files plus an orchestration C++ file. -- This sample directory vendors only the kernel `.pto` inputs. 
-- No custom `golden.py` or `compare.py` is included in this draft because those are tied to the full orchestration flow, not to individual kernel-only `.pto` files. -- The existing `test/npu_validation/scripts/generate_testcase.py` flow can still auto-generate generic validation assets for these kernels when needed. diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_0.pto b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto deleted file mode 100644 index d9df6b9eb..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_0.pto +++ /dev/null @@ -1,25 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_0(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c256 = arith.constant 256 : i64 - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %k_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { - %1 = arith.muli %ki__idx_v0, %c128 : index - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %k_proj__ssa_v0_pview = pto.partition_view %k_proj__ssa_v0_view, offsets = [%arg2, %1], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> - pto.tload ins(%k_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf - pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%0 : !pto.tile_buf) - %k_group__iter_v1_pview = pto.partition_view %k_group__ssa_v0_view, offsets = [%ki__idx_v0, %c0], sizes = 
[%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> - pto.tstore ins(%0 : !pto.tile_buf) outs(%k_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>) - } - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_1.pto b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto deleted file mode 100644 index d79076b4f..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_1.pto +++ /dev/null @@ -1,58 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c256 = arith.constant 256 : i64 - %c512 = arith.constant 512 : i64 - %c768 = arith.constant 768 : i64 - %c1024 = arith.constant 1024 : i64 - %c3072 = arith.constant 3072 : i64 - %c5120 = arith.constant 5120 : i64 - %c7168 = arith.constant 7168 : i64 - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_rot_tensor__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - 
%cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) - %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf - %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) - %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf - %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) - %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf - %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) - %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - %k_group__rv_v2_pview = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tload ins(%k_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%k_lo__tile : !pto.tile_buf) - %k_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf - %3 = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) 
outs(%k_hi__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf - pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf - pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - %rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf - pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%rot_lo__tile : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf - pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) - %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tadd ins(%1, %2 : !pto.tile_buf, !pto.tile_buf) outs(%rot_hi__tile : !pto.tile_buf) - %k_rot_tensor__ssa_v0_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tstore ins(%rot_lo__tile : !pto.tile_buf) outs(%k_rot_tensor__ssa_v0_pview : !pto.partition_tensor_view<8x64xf32>) - %k_rot_tensor__tile_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tstore ins(%rot_hi__tile : !pto.tile_buf) outs(%k_rot_tensor__tile_pview : !pto.partition_tensor_view<8x64xf32>) - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_10.pto b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto deleted file mode 100644 index 142c570b8..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_10.pto +++ /dev/null @@ -1,30 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_10(%arg0: !pto.ptr, %arg1: 
!pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c16384 = arith.constant 16384 : i64 - %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c524288 = arith.constant 524288 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> - pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf) - %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf - %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> - pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) - %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) - %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%v_tile__tile : !pto.tile_buf) outs(%v_tile__tile_Right : !pto.tile_buf) - %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%oi_tmp_pad__tile : !pto.tile_buf) - 
%ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>) - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_11.pto b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto deleted file mode 100644 index 17eae5c2b..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_11.pto +++ /dev/null @@ -1,111 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_11(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c32 = arith.constant 32 : i64 - %c64 = arith.constant 64 : i64 - %c96 = arith.constant 96 : i64 - %c128 = arith.constant 128 : i64 - %c4224 = arith.constant 4224 : i64 - %c8320 = arith.constant 8320 : i64 - %c8352 = arith.constant 8352 : i64 - %c8384 = arith.constant 8384 : i64 - %c8416 = arith.constant 8416 : i64 - %c8448 = arith.constant 8448 : i64 - %c8 = arith.constant 8 : index - %c1 = arith.constant 1 : index - %7 = arith.constant 128 : index - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] 
{layout = #pto.layout}: !pto.tensor_view - %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view - %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf) - %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) - %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf - %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) - %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf - %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf) - %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf - %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) - %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf - %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = 
[%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) - %8 = arith.cmpi eq, %arg6, %c0 : index - %li__phi_v5, %mi__phi_v5, %oi__phi_v5 = scf.if %8 -> (!pto.tensor_view, !pto.tensor_view, !pto.tensor_view) { - %oi__ssa_v3 = pto.alloc_tile addr = %c4224 : !pto.tile_buf - %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf - %mi__ssa_v3 = pto.alloc_tile addr = %c32 : !pto.tile_buf - %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%cur_li__tile : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<8x1xf32>) - %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%cur_mi__tile : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<8x1xf32>) - %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tstore ins(%oi_tmp__tile : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<8x128xf32>) - scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view, !pto.tensor_view, !pto.tensor_view - } else { - %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf - %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf - %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c8320 : !pto.tile_buf - pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) - %mi_new__tile = pto.alloc_tile addr = %c8320 : !pto.tile_buf - %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf - %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf - %t__row_major_tmp_v5 = pto.alloc_tile addr = %c8352 : !pto.tile_buf - 
pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf - %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c8352 : !pto.tile_buf - %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c8352 : !pto.tile_buf - pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) - %alpha__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf - %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf - %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c8320 : !pto.tile_buf - %t__row_major_tmp_v10 = pto.alloc_tile addr = %c8384 : !pto.tile_buf - pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c8384 : !pto.tile_buf - %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c8384 : !pto.tile_buf - %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c8384 : !pto.tile_buf - pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) - %beta__tile = pto.alloc_tile addr = %c8384 : !pto.tile_buf - %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c8352 : !pto.tile_buf - %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf - %t__row_major_tmp_v15 = pto.alloc_tile addr = %c8416 : !pto.tile_buf - pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c8416 : !pto.tile_buf - %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c8384 : !pto.tile_buf - %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf - %t__row_major_tmp_v18 = pto.alloc_tile addr = %c8448 : !pto.tile_buf - pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf - %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c8416 : 
!pto.tile_buf - %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c8448 : !pto.tile_buf - %li__row_major_tmp_v21 = pto.alloc_tile addr = %c8416 : !pto.tile_buf - pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) - %3 = pto.alloc_tile addr = %c8416 : !pto.tile_buf - %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf - pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) - %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf - pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) - %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf - pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) - %mi__ssa_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf - %13 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%3 : !pto.tile_buf) outs(%13 : !pto.partition_tensor_view<8x1xf32>) - %15 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%mi_new__tile : !pto.tile_buf) outs(%15 : !pto.partition_tensor_view<8x1xf32>) - %17 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tstore ins(%6 : !pto.tile_buf) outs(%17 : !pto.partition_tensor_view<8x128xf32>) - scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view, !pto.tensor_view, !pto.tensor_view - } - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_12.pto b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto deleted file mode 100644 index 124078522..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_12.pto +++ /dev/null @@ -1,28 +0,0 @@ -module attributes {pto.target_arch 
= "a5"} { - func.func @decode_attention_incore_12(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c32 = arith.constant 32 : i64 - %c1 = arith.constant 1 : index - %c8192 = arith.constant 8192 : index - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) - %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) - %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf, !pto.tile_buf) outs(%ctx__tile : !pto.tile_buf) - %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - %0 = arith.muli %arg3, %c128 : index - %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view -> !pto.partition_tensor_view<1x1024xf32> - pto.tstore ins(%ctx_flat__tile : !pto.tile_buf) 
outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xf32>) - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_2.pto b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto deleted file mode 100644 index 5419f419a..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_2.pto +++ /dev/null @@ -1,39 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_2(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c512 = arith.constant 512 : i64 - %c524288 = arith.constant 524288 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c16 = arith.constant 16 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %c4096 = arith.constant 4096 : index - %k_cache__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_rot_tensor__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_proj__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { - %2 = arith.muli %arg4, %c8 : index - %3 = arith.muli %2, %c4096 : index - %4 = arith.muli %ki__idx_v0, %c4096 : index - %5 = arith.addi %3, %4 : index - %6 = arith.addi %5, %arg5 : index - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %k_rot_tensor__ssa_v2_pview = pto.partition_view %k_rot_tensor__ssa_v2_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> 
!pto.partition_tensor_view<1x128xf32> - pto.tload ins(%k_rot_tensor__ssa_v2_pview : !pto.partition_tensor_view<1x128xf32>) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c512 : !pto.tile_buf - pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%0 : !pto.tile_buf) - %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> - pto.tstore ins(%0 : !pto.tile_buf) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) - %1 = pto.alloc_tile addr = %c512 : !pto.tile_buf - %7 = arith.muli %ki__idx_v0, %c128 : index - %v_proj__ssa_v0_pview = pto.partition_view %v_proj__ssa_v0_view, offsets = [%arg4, %7], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> - pto.tload ins(%v_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%1 : !pto.tile_buf) - %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> - pto.tstore ins(%1 : !pto.tile_buf) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) - } - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_3.pto b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto deleted file mode 100644 index 143c98a4b..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_3.pto +++ /dev/null @@ -1,26 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_3(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c256 = arith.constant 256 : i64 - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c0 = arith.constant 0 : index - 
%q_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - scf.for %qi__idx_v0 = %c0 to %c8 step %c1 { - %1 = arith.addi %arg3, %qi__idx_v0 : index - %2 = arith.muli %1, %c128 : index - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %q_proj__ssa_v0_pview = pto.partition_view %q_proj__ssa_v0_view, offsets = [%arg2, %2], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> - pto.tload ins(%q_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf - pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%0 : !pto.tile_buf) - %q_group__iter_v1_pview = pto.partition_view %q_group__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> - pto.tstore ins(%0 : !pto.tile_buf) outs(%q_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>) - } - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_4.pto b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto deleted file mode 100644 index 9de52a73c..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_4.pto +++ /dev/null @@ -1,64 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_4(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c256 = arith.constant 256 : i64 - %c512 = arith.constant 512 : i64 - %c768 = arith.constant 768 : i64 - %c1024 = arith.constant 1024 : i64 - %c3072 = arith.constant 3072 : i64 - %c5120 = arith.constant 5120 : i64 - %c7168 = arith.constant 7168 : i64 - %c9216 = 
arith.constant 9216 : i64 - %c10240 = arith.constant 10240 : i64 - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_rot_bf16__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) - %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf - %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) - %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf - %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload 
ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) - %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf - %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) - %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - %q_group__rv_v2_pview = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tload ins(%q_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%q_lo__tile : !pto.tile_buf) - %q_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf - %3 = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%q_hi__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf - pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf - pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - %q_rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf - pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%q_rot_lo__tile : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf - pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) - %q_rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tadd ins(%1, %2 : 
!pto.tile_buf, !pto.tile_buf) outs(%q_rot_hi__tile : !pto.tile_buf) - %q_rot_lo_bf16__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf - pto.tcvt ins(%q_rot_lo__tile{rmode = #pto} : !pto.tile_buf) outs(%q_rot_lo_bf16__tile : !pto.tile_buf) - %q_rot_hi_bf16__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf - pto.tcvt ins(%q_rot_hi__tile{rmode = #pto} : !pto.tile_buf) outs(%q_rot_hi_bf16__tile : !pto.tile_buf) - %q_rot_bf16__ssa_v0_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> - pto.tstore ins(%q_rot_lo_bf16__tile : !pto.tile_buf) outs(%q_rot_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) - %q_rot_bf16__tile_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> - pto.tstore ins(%q_rot_hi_bf16__tile : !pto.tile_buf) outs(%q_rot_bf16__tile_pview : !pto.partition_tensor_view<8x64xbf16>) - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_5.pto b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto deleted file mode 100644 index 28ad1932e..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_5.pto +++ /dev/null @@ -1,30 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_5(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 - %c4128 = arith.constant 4128 : i64 - %c8 = arith.constant 8 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %ret0__out_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret1__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] 
{layout = #pto.layout}: !pto.tensor_view - %ret2__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %oi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf) - %li_flat__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf) - %li__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %mi_flat__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf) - %mi__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tstore ins(%oi__tile : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_6.pto b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto deleted file mode 100644 index 26e9555c4..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_6.pto +++ /dev/null @@ -1,18 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_6(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c16 = arith.constant 16 : index - %c128 = 
arith.constant 128 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c0 = arith.constant 0 : index - %q_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_rot_bf16__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_bf16_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %q_rot_bf16__ssa_v2_pview = pto.partition_view %q_rot_bf16__ssa_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xbf16> - pto.tload ins(%q_rot_bf16__ssa_v2_pview : !pto.partition_tensor_view<8x128xbf16>) outs(%q_bf16_tile__tile : !pto.tile_buf) - %q_padded__ssa_v0_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xbf16> - pto.tstore ins(%q_bf16_tile__tile : !pto.tile_buf) outs(%q_padded__ssa_v0_pview : !pto.partition_tensor_view<8x128xbf16>) - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_7.pto b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto deleted file mode 100644 index 282f797e5..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_7.pto +++ /dev/null @@ -1,30 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_7(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c16384 = arith.constant 16384 : i64 - %c524288 = arith.constant 524288 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout}: !pto.tensor_view - 
%q_padded__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf) - %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf - %q_padded__ssa_v1_pview = pto.partition_view %q_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%q_padded__ssa_v1_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf) - %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) - %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%k_tile__tile : !pto.tile_buf) outs(%k_tile__tile_Right : !pto.tile_buf) - %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%raw_scores_pad__tile : !pto.tile_buf) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>) - return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_8.pto b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto deleted file mode 100644 index f968b1627..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_8.pto 
+++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_8(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c2048 = arith.constant 2048 : i64 - %c4096 = arith.constant 4096 : i64 - %c8192 = arith.constant 8192 : i64 - %c8224 = arith.constant 8224 : i64 - %c9248 = arith.constant 9248 : i64 - %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 8.838835e-02 : f32 - %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret1__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret2__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf - %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf) - pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf - %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf) outs(%scores_padded__tile : !pto.tile_buf) - %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf, f32) 
outs(%scores__tile : !pto.tile_buf) - %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf - pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%cur_mi__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.texp ins(%t__tile : !pto.tile_buf) outs(%exp_scores__tile : !pto.tile_buf) - %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf - pto.tcvt ins(%exp_scores__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_bf16__tile : !pto.tile_buf) - %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_fp32__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf - pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%cur_li__tile : !pto.tile_buf) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%cur_li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%cur_mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> - pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x64xbf16>) - 
return - } -} diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_9.pto b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto deleted file mode 100644 index 0c16cfc61..000000000 --- a/test/samples/Qwen3Scope2/decode_attention_incore_9.pto +++ /dev/null @@ -1,18 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @decode_attention_incore_9(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c0 = arith.constant 0 : index - %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %exp_scores_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %exp_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %exp_scores_bf16__ssa_v0_pview = pto.partition_view %exp_scores_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> - pto.tload ins(%exp_scores_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) outs(%exp_tile__tile : !pto.tile_buf) - %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> - pto.tstore ins(%exp_tile__tile : !pto.tile_buf) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md new file mode 100644 index 000000000..34e3a51bf --- /dev/null +++ b/test/samples/Qwen3Tilelet/README.md @@ -0,0 +1,13 @@ +Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_tilelet.py`. 
+ +Scope: +- compile-regression inputs for `ptoas` +- A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS` + +Notes: +- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs: + `qwen3_decode_layer_incore_1`, `qwen3_decode_layer_incore_2`, + `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`, + `qwen3_decode_layer_incore_14`. +- This sample directory vendors only those direct `ptoas` regression inputs. +- No custom `golden.py` or `compare.py` is included here: these grouped mixed kernels depend on orchestration-managed peer buffers and loop-carried context, so per-kernel numerical validation is not a drop-in replacement for the full PyPTO runtime flow. diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto new file mode 100644 index 000000000..77b0b5c33 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto @@ -0,0 +1,116 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_1_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c4096 = arith.constant 4096 : i64 + %c0i = arith.constant 0 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + 
%inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view + %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_1_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 + %qwen3_decode_layer_incore_1_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg6, %c4 : index + %1 = arith.addi %0, %ob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c64 : index + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %5 = arith.muli %kb__idx_v0, %c128 : index + %wq_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wq_chunk__tile : !pto.tile_buf) + %t__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf + %t__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%t__tile_Left_mat : !pto.tile_buf) outs(%t__tile_Left : !pto.tile_buf) + pto.tfree_from_aiv {split = 0} + %wq_chunk__tile_Right = pto.alloc_tile addr = %c0i : 
!pto.tile_buf + pto.tmov ins(%wq_chunk__tile : !pto.tile_buf) outs(%wq_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%t__tile_Left, %wq_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } + } + return + } + func.func @qwen3_decode_layer_incore_1_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c4096 = arith.constant 4096 : i64 + %c4128 = arith.constant 4128 : i64 + %c5152 = arith.constant 5152 : i64 + %c6176 = arith.constant 6176 : i64 + %c8224 = arith.constant 8224 : i64 + %c8736 = arith.constant 8736 : i64 + %c10784 = arith.constant 10784 : i64 + %c11808 = arith.constant 11808 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view + %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_1_v2c_slot_buffer_import = 
pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aic} -> i32 + %qwen3_decode_layer_incore_1_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer_import : i32) + %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<4x1xf32> + pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf) + scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { + %5 = arith.muli %arg6, %c4 : index + %6 = arith.addi %5, %ob__ci_idx_v0 : index + %7 = arith.muli %6, %c1 : index + %8 = arith.addi %c0, %7 : index + %9 = arith.muli %8, %c64 : index + %q_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf + pto.tmuls ins(%q_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %10 = arith.muli %kb__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg5, %10], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf) + %x_chunk__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%x_chunk__tile : !pto.tile_buf) + %gamma__tile = pto.alloc_tile addr = 
%c8224 : !pto.tile_buf + %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %10], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c6176 : !pto.tile_buf + pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %normed__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf + pto.tcolexpandmul ins(%1, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c5152 : !pto.tile_buf + pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%2 : !pto.tile_buf) + %t__tile_nz = pto.alloc_tile addr = %c8736 : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%t__tile_nz : !pto.tile_buf) + pto.tpush_to_aic(%t__tile_nz : !pto.tile_buf) {split = 0} + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %3 = pto.alloc_tile addr = %c10784 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %q_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%q_acc__tile_mv : !pto.tile_buf) + } + %4 = pto.alloc_tile addr = %c11808 : !pto.tile_buf + pto.tcvt ins(%0{rmode = #pto} : !pto.tile_buf) outs(%4 : !pto.tile_buf) + %q_proj__co_l1_iter_v3_pview = pto.partition_view %q_proj__co_l0_iter_v3_view, offsets = [%arg5, %9], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> + pto.tstore ins(%4 : !pto.tile_buf) outs(%q_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto new file mode 100644 index 
000000000..636b81393 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto @@ -0,0 +1,108 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_10_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c4096 = arith.constant 4096 : i64 + %c0i = arith.constant 0 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_10_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 + %qwen3_decode_layer_incore_10_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %0 = arith.muli 
%arg5, %c8 : index + %1 = arith.addi %0, %ob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c64 : index + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %5 = arith.muli %kb__idx_v0, %c128 : index + %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf) + %a_chunk__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf + %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%a_chunk__tile_Left_mat : !pto.tile_buf) outs(%a_chunk__tile_Left : !pto.tile_buf) + pto.tfree_from_aiv {split = 0} + %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%w_chunk__tile : !pto.tile_buf) outs(%w_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } + } + return + } + func.func @qwen3_decode_layer_incore_10_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c4096 = arith.constant 4096 : i64 + %c5120 = arith.constant 5120 : i64 + %c7168 = arith.constant 7168 : i64 + %c8192 = arith.constant 8192 : i64 + %c10240 = arith.constant 10240 : i64 + %c11264 = arith.constant 11264 : i64 + %c9216 = arith.constant 9216 : i64 + %c16 = arith.constant 16 : index + %4 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %cst = 
arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c4, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view + %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%4, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_10_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aic} -> i32 + %qwen3_decode_layer_incore_10_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer_import : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %5 = arith.muli %arg5, %c8 : index + %6 = arith.addi %5, %ob__ci_idx_v0 : index + %7 = arith.muli %6, %c1 : index + %8 = arith.addi %c0, %7 : index + %9 = arith.muli %8, %c64 : index + %o_acc__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf + pto.tmuls ins(%o_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %10 = arith.muli %kb__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf + %attn_out__rv_v2_pview = pto.partition_view %attn_out__rv_v2_view, offsets = [%arg4, %10], sizes = [%c4, %c128] : !pto.tensor_view -> 
!pto.partition_tensor_view<4x128xf32> + pto.tload ins(%attn_out__rv_v2_pview : !pto.partition_tensor_view<4x128xf32>) outs(%t__tile : !pto.tile_buf) + %a_chunk__tile = pto.alloc_tile addr = %c7168 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%a_chunk__tile : !pto.tile_buf) + %a_chunk__tile_nz = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.tmov ins(%a_chunk__tile : !pto.tile_buf) outs(%a_chunk__tile_nz : !pto.tile_buf) + pto.tpush_to_aic(%a_chunk__tile_nz : !pto.tile_buf) {split = 0} + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %1 = pto.alloc_tile addr = %c10240 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %o_acc__tile_mv = pto.alloc_tile addr = %c4096 : !pto.tile_buf + pto.tmov ins(%1 : !pto.tile_buf) outs(%o_acc__tile_mv : !pto.tile_buf) + } + %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %9], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%2 : !pto.tile_buf) + %resid__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf + pto.tcvt ins(%2{rmode = #pto} : !pto.tile_buf) outs(%resid__tile : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c4096 : !pto.tile_buf + pto.tadd ins(%0, %resid__tile : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + %resid1_tile__co_l1_iter_v1_pview = pto.partition_view %resid1_tile__co_l0_iter_v1_view, offsets = [%c0, %9], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xf32> + pto.tstore ins(%3 : !pto.tile_buf) outs(%resid1_tile__co_l1_iter_v1_pview : !pto.partition_tensor_view<4x64xf32>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto 
b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto new file mode 100644 index 000000000..6eedee90d --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto @@ -0,0 +1,116 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_13_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c1024 = arith.constant 1024 : i64 + %c17408 = arith.constant 17408 : i64 + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c5120 = arith.constant 5120 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_13_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_13_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024} 
(c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xbf16> + pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf) + %wg__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf) + %wu__tile = pto.alloc_tile addr = %c17408 : !pto.tile_buf + %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf) + %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%post_chunk__tile : !pto.tile_buf) outs(%post_chunk__tile_Left : !pto.tile_buf) + %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wg__tile : !pto.tile_buf) outs(%wg__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wu__tile : !pto.tile_buf) outs(%wu__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : 
!pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} + } + return + } + func.func @qwen3_decode_layer_incore_13_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c8192 = arith.constant 8192 : i64 + %c9216 = arith.constant 9216 : i64 + %c11264 = arith.constant 11264 : i64 + %c12288 = arith.constant 12288 : i64 + %c10240 = arith.constant 10240 : i64 + %c13312 = arith.constant 13312 : i64 + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c5120 = arith.constant 5120 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %cst_1 = arith.constant 1.000000e+00 : f32 + %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_13_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", size = 8192, 
location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + %gate_acc__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf + %gate_acc__ssa_v0_pview = pto.partition_view %gate_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xf32> + pto.tload ins(%gate_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%gate_acc__tile : !pto.tile_buf) + %up_acc__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf + %up_acc__ssa_v0_pview = pto.partition_view %up_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xf32> + pto.tload ins(%up_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%up_acc__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.tmuls ins(%gate_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c9216 : !pto.tile_buf + pto.tmuls ins(%up_acc__tile, %cst : !pto.tile_buf, f32) outs(%1 : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %3 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %4 = pto.alloc_tile addr = %c12288 : !pto.tile_buf + pto.tadd ins(%1, %3 : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %gate_acc__tile_mv = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%gate_acc__tile_mv : !pto.tile_buf) + %up_acc__tile_mv = pto.alloc_tile addr = %c9216 : !pto.tile_buf + pto.tmov ins(%4 : !pto.tile_buf) outs(%up_acc__tile_mv : !pto.tile_buf) + } + 
%t__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf + pto.tneg ins(%0 : !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c10240 : !pto.tile_buf + pto.texp ins(%t__tile : !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c10240 : !pto.tile_buf + pto.tadds ins(%5, %cst_1 : !pto.tile_buf, f32) outs(%6 : !pto.tile_buf) + %sigmoid__tile = pto.alloc_tile addr = %c11264 : !pto.tile_buf + pto.trecip ins(%6 : !pto.tile_buf) outs(%sigmoid__tile : !pto.tile_buf) + %7 = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.tmul ins(%0, %sigmoid__tile : !pto.tile_buf, !pto.tile_buf) outs(%7 : !pto.tile_buf) + %mlp_chunk__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.tmul ins(%7, %1 : !pto.tile_buf, !pto.tile_buf) outs(%mlp_chunk__tile : !pto.tile_buf) + %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c13312 : !pto.tile_buf + pto.tcvt ins(%mlp_chunk__tile{rmode = #pto} : !pto.tile_buf) outs(%mlp_chunk_bf16__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> + pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<4x64xbf16>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto new file mode 100644 index 000000000..725a529d2 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto @@ -0,0 +1,73 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 + %c4 = arith.constant 4 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + 
%c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_14_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_14_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c128 : index + %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> + pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> + pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov 
ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf) outs(%w_down_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } + return + } + func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c16384 = arith.constant 16384 : i64 + %c4 = arith.constant 4 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : 
index + %4 = arith.muli %3, %c128 : index + %down_prev__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xf32> + pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<4x128xf32>) outs(%down_prev__tile : !pto.tile_buf) + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %down_next__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf + pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%down_next__tile : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xf32> + pto.tstore ins(%down_next__tile : !pto.tile_buf) outs(%5 : !pto.partition_tensor_view<4x128xf32>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto new file mode 100644 index 000000000..dc6456847 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto @@ -0,0 +1,148 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_2_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c4096 = arith.constant 4096 : i64 + %c20480 = arith.constant 20480 : i64 + %c0i = arith.constant 0 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c40 = arith.constant 40 : index + 
%c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view + %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_2_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 + %qwen3_decode_layer_incore_2_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %1 = arith.muli %arg8, %c8 : index + %2 = arith.addi %1, %ob__ci_idx_v0 : index + %3 = arith.muli %2, %c1 : index + %4 = arith.addi %c0, %3 : index + %5 = arith.muli %4, %c64 : index + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %6 = arith.muli %kb__idx_v0, %c128 : index + %wk_chunk__tile = 
pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wk_chunk__tile : !pto.tile_buf) + %wv_chunk__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf + %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wv_chunk__tile : !pto.tile_buf) + %normed_bf16__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf + %normed_bf16__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%normed_bf16__tile_Left_mat : !pto.tile_buf) outs(%normed_bf16__tile_Left : !pto.tile_buf) + pto.tfree_from_aiv {split = 0} + %wk_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wk_chunk__tile : !pto.tile_buf) outs(%wk_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%normed_bf16__tile_Left, %wk_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + %wv_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wv_chunk__tile : !pto.tile_buf) outs(%wv_chunk__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%normed_bf16__tile_Left, %wv_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} + } + } + return + } + func.func @qwen3_decode_layer_incore_2_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: index, %arg8: index) attributes {pto.kernel_kind = 
#pto.kernel_kind} { + %c4096 = arith.constant 4096 : i64 + %c4128 = arith.constant 4128 : i64 + %c5152 = arith.constant 5152 : i64 + %c6176 = arith.constant 6176 : i64 + %c7200 = arith.constant 7200 : i64 + %c9248 = arith.constant 9248 : i64 + %c9760 = arith.constant 9760 : i64 + %c11808 = arith.constant 11808 : i64 + %c12832 = arith.constant 12832 : i64 + %c13856 = arith.constant 13856 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view + %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_2_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aic} -> 
i32 + %qwen3_decode_layer_incore_2_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer_import : i32) + %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<4x1xf32> + pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %8 = arith.muli %arg8, %c8 : index + %9 = arith.addi %8, %ob__ci_idx_v0 : index + %10 = arith.muli %9, %c1 : index + %11 = arith.addi %c0, %10 : index + %12 = arith.muli %11, %c64 : index + %k_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf + %v_acc__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf + pto.tmuls ins(%k_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c5152 : !pto.tile_buf + pto.tmuls ins(%v_acc__tile, %cst : !pto.tile_buf, f32) outs(%1 : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %13 = arith.muli %kb__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg7, %13], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf) + %x_chunk__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) 
outs(%x_chunk__tile : !pto.tile_buf) + %gamma__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf + %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c7200 : !pto.tile_buf + pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %normed__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf + pto.tcolexpandmul ins(%2, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) + %normed_bf16__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf + pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%normed_bf16__tile : !pto.tile_buf) + %normed_bf16__tile_nz = pto.alloc_tile addr = %c9760 : !pto.tile_buf + pto.tmov ins(%normed_bf16__tile : !pto.tile_buf) outs(%normed_bf16__tile_nz : !pto.tile_buf) + pto.tpush_to_aic(%normed_bf16__tile_nz : !pto.tile_buf) {split = 0} + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %3 = pto.alloc_tile addr = %c11808 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %4 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %5 = pto.alloc_tile addr = %c12832 : !pto.tile_buf + pto.tadd ins(%1, %4 : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %k_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%k_acc__tile_mv : !pto.tile_buf) + %v_acc__tile_mv = pto.alloc_tile addr = %c5152 : !pto.tile_buf + pto.tmov ins(%5 : !pto.tile_buf) outs(%v_acc__tile_mv : !pto.tile_buf) + } + %6 = pto.alloc_tile addr = %c13856 : !pto.tile_buf + pto.tcvt ins(%0{rmode = #pto} : !pto.tile_buf) 
outs(%6 : !pto.tile_buf) + %k_proj__co_l1_iter_v3_pview = pto.partition_view %k_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> + pto.tstore ins(%6 : !pto.tile_buf) outs(%k_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>) + %7 = pto.alloc_tile addr = %c13856 : !pto.tile_buf + pto.tcvt ins(%1{rmode = #pto} : !pto.tile_buf) outs(%7 : !pto.tile_buf) + %v_proj__co_l1_iter_v3_pview = pto.partition_view %v_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> + pto.tstore ins(%7 : !pto.tile_buf) outs(%v_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>) + } + return + } +} diff --git a/test/samples/runop.sh b/test/samples/runop.sh index b8c02ff00..6be43d10e 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -19,7 +19,7 @@ PYTHON_BIN="${PYTHON_BIN:-}" PTOAS_OUT_DIR="${PTOAS_OUT_DIR:-}" PTOAS_ENABLE_INSERT_SYNC="${PTOAS_ENABLE_INSERT_SYNC:-1}" PTOAS_FLAGS="${PTOAS_FLAGS:-}" -PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3Scope2}" +PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3Tilelet}" ENABLE_BC=0 usage() { @@ -36,7 +36,7 @@ Env: PTOAS_OUT_DIR # where generated *.mlir/*.cpp go (optional; defaults to a temp dir) PTOAS_FLAGS # extra flags passed to ptoas (e.g. --enable-insert-sync) PTOAS_ENABLE_INSERT_SYNC # 1 to append --enable-insert-sync to PTOAS_FLAGS (default: 1) - PTO_PTO_DIRS # space-separated dirs to run .pto directly (default: Sync Qwen3Scope2) + PTO_PTO_DIRS # space-separated dirs to run .pto directly (default: Sync Qwen3Tilelet) Flags: --enablebc # enable: python -> .pto -> ptobc -> .pto -> ptoas @@ -153,10 +153,10 @@ process_one_dir() { if [[ "${ENABLE_BC}" == "1" ]]; then use_ptobc_roundtrip=1 fi - # Qwen3 scope2 kernels currently serve as direct ptoas compile-regression + # Qwen3 tilelet kernels currently serve as direct ptoas compile-regression # coverage. 
They require A5/level3 lowering, but are not expected to # roundtrip through ptobc yet. - if [[ "$A" == "Qwen3Scope2" ]]; then + if [[ "$A" == "Qwen3Tilelet" ]]; then use_ptobc_roundtrip=0 fi local -a ptoas_flags=() @@ -190,7 +190,7 @@ process_one_dir() { fi done fi - if [[ "$A" == "Qwen3Scope2" && $has_pto_arch_override -eq 0 ]]; then + if [[ "$A" == "Qwen3Tilelet" && $has_pto_arch_override -eq 0 ]]; then ptoas_flags+=(--pto-arch a5 --pto-level=level3) target_arch="a5" fi @@ -907,6 +907,13 @@ PY ptobc_file="${out_subdir}/${base}.ptobc" decoded_pto="${out_subdir}/${base}-roundtrip.pto" cpp="${out_subdir}/${base}.cpp" + if [[ "$A" == "Qwen3Tilelet" ]]; then + cpp="${out_subdir}/${base}-pto.cpp" + fi + if [[ "$A" == "Qwen3Tilelet" && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then + echo -e "${A}(${base}.pto)\tSKIP\trequires --pto-arch=a5" + continue + fi local sample_use_ptobc_roundtrip="$use_ptobc_roundtrip" # TODO(ptobc): decode of this regression currently fails with From bf764cfc5e5eb8fd9320b3ca53d0a795ddd0ae31 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 10:47:06 +0800 Subject: [PATCH 04/16] test: add Qwen3Tilelet board-validation goldens --- .github/workflows/ci.yml | 13 +- .gitignore | 4 + .../scripts/generate_testcase.py | 449 +++++++++++++----- test/samples/Qwen3Tilelet/README.md | 3 +- .../qwen3_decode_layer_incore_1/golden.py | 71 +++ .../qwen3_decode_layer_incore_10/golden.py | 68 +++ .../qwen3_decode_layer_incore_13/golden.py | 70 +++ .../qwen3_decode_layer_incore_14/golden.py | 60 +++ .../qwen3_decode_layer_incore_2/golden.py | 81 ++++ test/samples/validation_runtime.py | 37 ++ 10 files changed, 736 insertions(+), 120 deletions(-) create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py create mode 100644 
test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py create mode 100644 test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 01ab24d35..25bf71e2c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ on: skip_cases: description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)" type: string - default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,decode_attention_incore_0,decode_attention_incore_1,decode_attention_incore_2,decode_attention_incore_3,decode_attention_incore_4,decode_attention_incore_5,decode_attention_incore_6,decode_attention_incore_7,decode_attention_incore_8,decode_attention_incore_9,decode_attention_incore_10,decode_attention_incore_11,decode_attention_incore_12" + default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp" run_only_cases: description: "Comma/space separated testcase names to run (empty = run all)" type: string @@ -261,14 +261,7 @@ jobs: # Temporary CI gate: skip cases that still error/flap on the remote NPU. # Update this list as we fix the underlying issues. 
DEFAULT_SKIP_CASES: >- - mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp, - decode_attention_incore_0,decode_attention_incore_1, - decode_attention_incore_2,decode_attention_incore_3, - decode_attention_incore_4,decode_attention_incore_5, - decode_attention_incore_6,decode_attention_incore_7, - decode_attention_incore_8,decode_attention_incore_9, - decode_attention_incore_10,decode_attention_incore_11, - decode_attention_incore_12 + mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp steps: - name: Resolve validation parameters shell: bash @@ -300,7 +293,7 @@ jobs: # suite (RUN_ONLY_CASES is empty), skip the non-matching variant based # on SOC_VERSION to keep the remote validation portable. A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync" - A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5" + A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,qwen3_decode_layer_incore_1,qwen3_decode_layer_incore_2,qwen3_decode_layer_incore_10,qwen3_decode_layer_incore_13,qwen3_decode_layer_incore_14" sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')" is_a5=0 diff --git a/.gitignore b/.gitignore index 44c61b02a..093b87116 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,10 @@ dist/ /remote_npu_validation_results*.tsv /npu_validation/ test/samples/**/npu_validation/ +!test/samples/Qwen3Tilelet/npu_validation/ +!test/samples/Qwen3Tilelet/npu_validation/**/ +!test/samples/Qwen3Tilelet/npu_validation/**/golden.py +!test/samples/Qwen3Tilelet/npu_validation/**/compare.py /tmp_gen* # IDE/editor diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index ca802b567..f1e3aef77 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -85,6 +85,28 @@ "xor", }) +CASE_INT_SCALAR_DEFAULTS = { + "qwen3_decode_layer_incore_13": { + 
"v7": 64, + }, + "qwen3_decode_layer_incore_14": { + "v4": 1, + "v5": 64, + }, +} + +CASE_POINTER_COUNT_MINIMUMS = { + "qwen3_decode_layer_incore_13": { + "v2": 20480, + "v4": 131046528, + "v5": 131046528, + }, + "qwen3_decode_layer_incore_14": { + "v1": 16384, + "v3": 651264, + }, +} + def _parse_shape(text: str): match = re.search(r"Shape<(\d+)\s*,\s*(\d+)>", text) @@ -96,6 +118,114 @@ def _parse_shape(text: str): return 32, 32 +def _split_params_blob(params_blob: str): + params_blob = params_blob.strip() + if not params_blob: + return [] + params = [] + depth = 0 + start = 0 + for idx, ch in enumerate(params_blob): + if ch == "<": + depth += 1 + elif ch == ">": + depth = max(depth - 1, 0) + elif ch == "," and depth == 0: + params.append(params_blob[start:idx].strip()) + start = idx + 1 + last = params_blob[start:].strip() + if last: + params.append(last) + return params + + +def _find_matching_brace(text: str, open_brace_index: int) -> Optional[int]: + depth = 0 + for idx in range(open_brace_index, len(text)): + ch = text[idx] + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + return idx + return None + + +def _extract_aicore_functions(text: str): + pattern = re.compile( + r"(?P<global>__global__\s+)?AICORE\s+void\s+(?P<name>\w+)\s*\((?P<params>[^)]*)\)\s*\{", + re.S, + ) + functions = [] + for match in pattern.finditer(text): + brace_index = text.find("{", match.end("params")) + if brace_index < 0: + continue + end_index = _find_matching_brace(text, brace_index) + if end_index is None: + continue + params_blob = match.group("params").strip() + functions.append( + { + "name": match.group("name"), + "params_blob": params_blob, + "raw_params": _split_params_blob(params_blob), + "is_global": bool(match.group("global")), + "text": text[match.start():end_index + 1], + } + ) + return functions + + +def _describe_kernel_source(text: str): + functions = _extract_aicore_functions(text) + for func in functions: + if func["is_global"]: + return { + "kind": "global", 
+ "kernel_name": func["name"], + "raw_params": func["raw_params"], + "analysis_texts": [func["text"]], + "writer_texts": [func["text"]], + "call_text": func["text"], + } + + mixed_groups = {} + for func in functions: + name = func["name"] + for suffix in ("_aic", "_aiv"): + if not name.endswith(suffix): + continue + base = name[: -len(suffix)] + group = mixed_groups.setdefault(base, {}) + group[suffix[1:]] = func + break + + for base, group in mixed_groups.items(): + if "aic" in group and "aiv" in group: + params = group["aiv"]["raw_params"] or group["aic"]["raw_params"] + return { + "kind": "mixed", + "kernel_name": base, + "raw_params": params, + "analysis_texts": [group["aic"]["text"], group["aiv"]["text"]], + "writer_texts": [group["aiv"]["text"]], + "aic_name": group["aic"]["name"], + "aiv_name": group["aiv"]["name"], + "call_text": group["aiv"]["text"], + } + + return { + "kind": "fallback", + "kernel_name": "kernel", + "raw_params": [], + "analysis_texts": [text], + "writer_texts": [text], + "call_text": text, + } + + def _is_gm_pointer_param(param: str) -> bool: return "__gm__" in param and "*" in param @@ -136,6 +266,44 @@ def _strip_param_name(raw: str, name: str) -> str: return stripped.strip() +def _strip_enclosing_parens(expr: str) -> str: + expr = expr.strip() + while expr.startswith("(") and expr.endswith(")"): + depth = 0 + ok = True + for idx, ch in enumerate(expr): + if ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + if depth == 0 and idx != len(expr) - 1: + ok = False + break + if ok and depth == 0: + expr = expr[1:-1].strip() + else: + break + return expr + + +def _strip_simple_casts(expr: str) -> str: + cur = expr.strip() + for _ in range(8): + prev = cur + cur = _strip_enclosing_parens(cur) + match = re.match(r"^(?:reinterpret_cast|static_cast|const_cast|dynamic_cast)\s*<[^>]+>\s*\((.*)\)$", cur, re.S) + if match: + cur = match.group(1).strip() + continue + match = re.match(r"^\(\s*[^()]+\s*\)\s*(.+)$", cur, re.S) + if match: + cur = 
match.group(1).strip() + continue + if cur == prev: + break + return cur + + def _infer_void_gm_pointee_type(text: str, param_name: str) -> Optional[str]: # Common patterns in PTOAS-generated kernels: # __gm__ int16_t* v16 = (__gm__ int16_t*) v1; @@ -158,56 +326,88 @@ def _infer_void_gm_pointee_type(text: str, param_name: str) -> Optional[str]: return None -def _detect_output_pointer_param(text: str, pointer_param_names): - if not pointer_param_names: +def _ordered_unique(items): + seen = set() + out = [] + for item in items: + if item in seen: + continue + seen.add(item) + out.append(item) + return out + + +def _resolve_pointer_param_from_expr(expr: str, pointer_param_names, ptr_to_param, ptr_to_base) -> Optional[str]: + if not expr: + return None + cur = _strip_simple_casts(expr) + match = re.match(r"^(\w+)\s*\+", cur) + if match: + cur = match.group(1) + elif re.fullmatch(r"[A-Za-z_]\w*", cur): + cur = cur + else: return None + pointer_params = set(pointer_param_names) + seen = set() + for _ in range(12): + if cur in seen: + break + seen.add(cur) + if cur in pointer_params: + return cur + mapped = ptr_to_param.get(cur) + if mapped: + cur = mapped + continue + mapped = ptr_to_base.get(cur) + if mapped: + cur = mapped + continue + break + return None + + +def _detect_output_pointer_params(text: str, pointer_param_names): + if not pointer_param_names: + return [] + tstore_gts = re.findall(r"\bTSTORE\s*\(\s*(\w+)\s*,", text) if not tstore_gts: - return None + return [] - gt_to_ptr = {} - for m in re.finditer(r"\b(\w+)\s*=\s*[\w:<>]+\s*\(\s*(\w+)\s*[,)]", text): - gt_to_ptr[m.group(1)] = m.group(2) + gt_to_expr = {} + for match in re.finditer( + r"\bGlobalTensor<[^;\n]*>\s+(\w+)\s*=\s*GlobalTensor<[^;\n]*>\(([^,]+?)\s*,", + text, + ): + gt_to_expr.setdefault(match.group(1), match.group(2).strip()) + for match in re.finditer(r"\b(\w+)\s+(\w+)\s*=\s*\1\s*\(([^,]+?)\s*,", text): + gt_to_expr.setdefault(match.group(2), match.group(3).strip()) ptr_to_base = {} - for m in 
re.finditer(r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*=\s*(\w+)\s*\+", text): - ptr_to_base[m.group(1)] = m.group(2) + for match in re.finditer(r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*=\s*(\w+)\s*\+", text): + ptr_to_base[match.group(1)] = match.group(2) + for match in re.finditer(r"\b(\w+)\s*=\s*(\w+)\s*\+\s*[^;]+;", text): + ptr_to_base.setdefault(match.group(1), match.group(2)) ptr_to_param = {} - for m in re.finditer( + for match in re.finditer( r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*=\s*\(__gm__\s+[\w:<>]+\s*\*\)\s*(\w+)\b", text, ): - ptr_to_param[m.group(1)] = m.group(2) - - def resolve_param(ptr: Optional[str]) -> Optional[str]: - if not ptr: - return None - cur = ptr - seen = set() - for _ in range(8): - if cur in seen: - break - seen.add(cur) - if cur in pointer_param_names: - return cur - mapped = ptr_to_param.get(cur) - if mapped in pointer_param_names: - return mapped - cur = ptr_to_base.get(cur) - if cur is None: - break - return None + ptr_to_param[match.group(1)] = match.group(2) + for match in re.finditer(r"\b(\w+)\s*=\s*\(__gm__\s+[\w:<>]+\s*\*\)\s*(\w+)\b", text): + ptr_to_param.setdefault(match.group(1), match.group(2)) + outputs = [] for gt in tstore_gts: - ptr = gt_to_ptr.get(gt) - if not ptr: - continue - resolved = resolve_param(ptr) - if resolved: - return resolved - return None + expr = gt_to_expr.get(gt) + param = _resolve_pointer_param_from_expr(expr, pointer_param_names, ptr_to_param, ptr_to_base) + if param: + outputs.append(param) + return _ordered_unique(outputs) def _detect_set_ffts_pointer_params(text: str, pointer_param_names): @@ -300,24 +500,7 @@ def _parse_kernel_params(text: str): match = re.search(r"__global__\s+(?:\w+\s+)*void\s+\w+\s*\(([^)]*)\)", text, re.S) if not match: return [] - params_blob = match.group(1).strip() - if not params_blob: - return [] - params = [] - depth = 0 - start = 0 - for idx, ch in enumerate(params_blob): - if ch == "<": - depth += 1 - elif ch == ">": - depth = max(depth - 1, 0) - elif ch == "," and depth == 0: - 
params.append(params_blob[start:idx].strip()) - start = idx + 1 - last = params_blob[start:].strip() - if last: - params.append(last) - return params + return _split_params_blob(match.group(1)) def _parse_kernel_name(text: str) -> str: @@ -367,6 +550,15 @@ def _default_eps_for_cpp_type(cpp_type: str) -> float: return 0.0 +def _integer_scalar_default_value(testcase: str, name: str, host_type: str) -> Optional[int]: + override = CASE_INT_SCALAR_DEFAULTS.get(testcase, {}).get(name) + if override is not None: + return int(override) + if re.match(r"^(u?int)(8|16|32|64)_t$", host_type) or host_type in {"int", "unsigned", "size_t"}: + return 1 + return None + + def _derive_testcase_name(input_cpp: Path) -> str: name = input_cpp.stem if name.endswith("-pto"): @@ -670,7 +862,7 @@ def ev(node): return ev(parsed) -def _infer_int_var_maxima(kernel_text: str) -> dict: +def _infer_int_var_maxima(kernel_text: str, seed_env: Optional[dict] = None) -> dict: """ Infer max values for simple integer temporaries (e.g. v23) used in pointer arithmetic, by evaluating constant-ish assignments and simple for-loop ranges. @@ -715,7 +907,10 @@ def _infer_int_var_maxima(kernel_text: str) -> dict: step = m.group(4).strip() loops.append((ind, start, end, step)) - maxima: dict[str, Optional[int]] = {} + maxima: dict[str, Optional[int]] = { + k: (None if v is None else int(v)) + for k, v in (seed_env or {}).items() + } def set_max(name: str, value: int) -> bool: cur = maxima.get(name) @@ -760,7 +955,7 @@ def set_max(name: str, value: int) -> bool: return {k: (0 if v is None else int(v)) for k, v in maxima.items()} -def _infer_gm_pointer_elem_counts(kernel_text: str, pointer_param_names): +def _infer_gm_pointer_elem_counts(kernel_text: str, pointer_param_names, seed_int_env: Optional[dict] = None): """ Infer minimum element counts for each __gm__ pointer param from GlobalTensor shape/stride metadata found in PTOAS-generated kernels. 
@@ -774,7 +969,7 @@ def _infer_gm_pointer_elem_counts(kernel_text: str, pointer_param_names): pointer_params = set(pointer_param_names) - int_max = _infer_int_var_maxima(kernel_text) + int_max = _infer_int_var_maxima(kernel_text, seed_env=seed_int_env) pointer_like = set(pointer_param_names) for m in re.finditer(r"__gm__\s+[\w:<>]+\s*\*\s*(\w+)\s*(?:=[^;]+)?;", kernel_text): @@ -840,25 +1035,6 @@ def resolve_param_and_offset(ptr: str): break return None, None - def strip_enclosing_parens(expr: str) -> str: - expr = expr.strip() - while expr.startswith("(") and expr.endswith(")"): - depth = 0 - ok = True - for i, ch in enumerate(expr): - if ch == "(": - depth += 1 - elif ch == ")": - depth -= 1 - if depth == 0 and i != len(expr) - 1: - ok = False - break - if ok and depth == 0: - expr = expr[1:-1].strip() - else: - break - return expr - def resolve_param_and_offset_expr(ptr_expr: str): """ Resolve a pointer expression passed to GlobalTensor(...) back to a GM @@ -870,22 +1046,22 @@ def resolve_param_and_offset_expr(ptr_expr: str): reinterpret_cast<__gm__ float*>(v1 + expr) (__gm__ float*)(v1 + expr) """ - expr = strip_enclosing_parens(ptr_expr.strip()) + expr = _strip_enclosing_parens(ptr_expr.strip()) if not expr: return None, None m = re.match(r"^(?:reinterpret_cast|static_cast)<[^>]+>\((.*)\)$", expr) if m: - expr = strip_enclosing_parens(m.group(1).strip()) + expr = _strip_enclosing_parens(m.group(1).strip()) # C-style cast prefix: (__gm__ float*)expr / (float*)expr m = re.match(r"^\(\s*__gm__[^)]*\)\s*(.+)$", expr) if m: - expr = strip_enclosing_parens(m.group(1).strip()) + expr = _strip_enclosing_parens(m.group(1).strip()) else: m = re.match(r"^\(\s*[\w:<> ]+\*\s*\)\s*(.+)$", expr) if m: - expr = strip_enclosing_parens(m.group(1).strip()) + expr = _strip_enclosing_parens(m.group(1).strip()) m = re.match(r"^(\w+)\s*\+\s*(.+)$", expr) if m: @@ -990,6 +1166,7 @@ def generate_testcase( raw_kernel = input_cpp.read_text(encoding="utf-8") raw_kernel_for_analysis = 
raw_kernel + kernel_info = _describe_kernel_source(raw_kernel_for_analysis) # pto.tcmp / pto.tcmps produce packed predicate masks and leave parts of the # logical u8 tile undefined. This can make byte-wise compares flaky. has_packed_pred_mask = re.search(r"\bTCMPS?\s*\(", raw_kernel_for_analysis) is not None @@ -997,12 +1174,20 @@ def generate_testcase( has_dav_vec = "__DAV_VEC__" in raw_kernel has_intra_block_sync = "set_intra_block(" in raw_kernel or "wait_intra_block(" in raw_kernel + is_mixed_kernel = kernel_info["kind"] == "mixed" + if aicore_arch is None: + if is_mixed_kernel: + sv = (soc_version or "").lower() + if "950" in sv or "a5" in sv or "910b" in sv: + aicore_arch = "dav-c310" + else: + aicore_arch = "dav-c220" # Sectioned kernels contain `#if defined(__DAV_CUBE__)` / `__DAV_VEC__` # blocks. For inter-core-style mixed kernels (with intra-block sync), # align to PTO-ISA mix-kernel compile mode (`dav-c310`) so the # toolchain owns DAV macro definition. - if has_dav_cube and has_dav_vec and has_intra_block_sync: + elif has_dav_cube and has_dav_vec and has_intra_block_sync: sv = (soc_version or "").lower() if "950" in sv or "a5" in sv: aicore_arch = "dav-c310" @@ -1028,16 +1213,16 @@ def generate_testcase( # For mix-kernel arch (dav-c310/dav-c220), do not force-define macros. 
dav_defines = "" is_mix_arch = aicore_arch in {"dav-c310", "dav-c220"} - if not (is_mix_arch and has_dav_cube and has_dav_vec and has_intra_block_sync): + if not is_mix_arch: if has_dav_cube: dav_defines += " -D__DAV_CUBE__" if has_dav_vec: dav_defines += " -D__DAV_VEC__" - rows, cols = _parse_shape(raw_kernel_for_analysis) + rows, cols = _parse_shape(kernel_info["call_text"]) logical_elem_count = rows * cols - kernel_name = _parse_kernel_name(raw_kernel_for_analysis) - raw_params = _parse_kernel_params(raw_kernel_for_analysis) + kernel_name = kernel_info["kernel_name"] + raw_params = kernel_info["raw_params"] mrgsort_block_len = _infer_mrgsort_block_len(raw_kernel_for_analysis) if "TMRGSORT" in raw_kernel_for_analysis else None pointer_param_names = [_extract_cpp_name(p) for p in raw_params if _is_gm_pointer_param(p)] @@ -1055,13 +1240,17 @@ def generate_testcase( ffts_param_names = _detect_set_ffts_pointer_params(raw_kernel_for_analysis, pointer_param_names) non_ffts_pointer_param_names = [n for n in pointer_param_names if n not in ffts_param_names] - output_ptr = _detect_output_pointer_param(raw_kernel_for_analysis, non_ffts_pointer_param_names) - if output_ptr is None and non_ffts_pointer_param_names: - output_ptr = ( + output_param_names = [] + for writer_text in kernel_info["writer_texts"]: + output_param_names.extend(_detect_output_pointer_params(writer_text, non_ffts_pointer_param_names)) + output_param_names = _ordered_unique(output_param_names) + if not output_param_names and non_ffts_pointer_param_names: + output_param_names = [ non_ffts_pointer_param_names[0] if len(non_ffts_pointer_param_names) == 1 else non_ffts_pointer_param_names[-1] - ) + ] + output_param_name_set = set(output_param_names) params = [] for raw in raw_params: @@ -1080,7 +1269,7 @@ def generate_testcase( "role": ( "ffts" if name in ffts_param_names - else ("output" if name == output_ptr else "input") + else ("output" if name in output_param_name_set else "input") ), } ) @@ -1106,7 
+1295,20 @@ def generate_testcase( init_ptrs = list(data_ptrs) output_ptrs = [p for p in data_ptrs if p["role"] == "output"] - inferred_counts = _infer_gm_pointer_elem_counts(raw_kernel_for_analysis, pointer_param_names) + scalar_int_defaults = { + p["name"]: default_value + for p in params + if p["kind"] == "scalar" + for default_value in [_integer_scalar_default_value(testcase, p["name"], p["host_type"])] + if default_value is not None + } + inferred_counts = {} + for analysis_text in kernel_info["analysis_texts"]: + partial_counts = _infer_gm_pointer_elem_counts(analysis_text, pointer_param_names, seed_int_env=scalar_int_defaults) + for name, count in partial_counts.items(): + inferred_counts[name] = max(inferred_counts.get(name, 0), count) + for name, count in CASE_POINTER_COUNT_MINIMUMS.get(testcase, {}).items(): + inferred_counts[name] = max(inferred_counts.get(name, 0), int(count)) ptr_elem_counts = {} for p in data_ptrs: inferred = inferred_counts.get(p["name"]) @@ -1153,7 +1355,7 @@ def generate_testcase( if t == "bool": value = "true" elif re.match(r"^(u?int)(8|16|32|64)_t$", t) or t in {"int", "unsigned", "size_t"}: - value = "1" + value = str(_integer_scalar_default_value(testcase, p["name"], t) or 1) elif t in {"float"}: value = "1.0f" elif t in {"double"}: @@ -1429,22 +1631,51 @@ def generate_testcase( kernel_call_args_host = ", ".join(kernel_call_args_host) raw_params_host = [_rewrite_host_unsupported_types(p) for p in raw_params] launch_block_count = _infer_launch_block_count(raw_kernel_for_analysis, testcase) - launch_cpp = ( - INCLUDE_REPLACEMENT - + "\n" - "#if defined(__CCE_AICORE__)\n" - f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n" - "#else\n" - f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n" - "#endif\n\n" - f"void {launch_name}({launch_fn_params}) {{\n" - "#if defined(__CCE_AICORE__)\n" - f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n" - "#else\n" - f" 
{kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n" - "#endif\n" - f"}}\n" - ) + if is_mixed_kernel: + wrapper_call_args = ", ".join([p["name"] for p in params]) + launch_cpp = ( + INCLUDE_REPLACEMENT + + "\n" + "#if defined(__CCE_AICORE__)\n" + f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params)});\n" + f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params)});\n" + f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n" + f" {kernel_info['aic_name']}({wrapper_call_args});\n" + f" {kernel_info['aiv_name']}({wrapper_call_args});\n" + "}\n" + "#else\n" + f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params_host)});\n" + f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params_host)});\n" + f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)}) {{\n" + f" {kernel_info['aic_name']}({wrapper_call_args});\n" + f" {kernel_info['aiv_name']}({wrapper_call_args});\n" + "}\n" + "#endif\n\n" + f"void {launch_name}({launch_fn_params}) {{\n" + "#if defined(__CCE_AICORE__)\n" + f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n" + "#else\n" + f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n" + "#endif\n" + f"}}\n" + ) + else: + launch_cpp = ( + INCLUDE_REPLACEMENT + + "\n" + "#if defined(__CCE_AICORE__)\n" + f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n" + "#else\n" + f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n" + "#endif\n\n" + f"void {launch_name}({launch_fn_params}) {{\n" + "#if defined(__CCE_AICORE__)\n" + f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n" + "#else\n" + f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n" + "#endif\n" + f"}}\n" + ) (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8") # pto-isa selects instruction implementations based on MEMORY_BASE 
vs diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md index 34e3a51bf..9132ac959 100644 --- a/test/samples/Qwen3Tilelet/README.md +++ b/test/samples/Qwen3Tilelet/README.md @@ -10,4 +10,5 @@ Notes: `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`, `qwen3_decode_layer_incore_14`. - This sample directory vendors only those direct `ptoas` regression inputs. -- No custom `golden.py` or `compare.py` is included here: these grouped mixed kernels depend on orchestration-managed peer buffers and loop-carried context, so per-kernel numerical validation is not a drop-in replacement for the full PyPTO runtime flow. +- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation. +- Custom `golden.py` assets cover these 5 cases as standalone mixed-kernel regression tests on A5. diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py new file mode 100644 index 000000000..5e8eeadbe --- /dev/null +++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py @@ -0,0 +1,71 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray: + if positive: + return generator.uniform(0.5, 1.5, size=count).astype(np.float32) + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + meta = load_case_meta() + generator = rng() + b0, ob = load_int32_assignments()[:2] + + buffers = { + "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05), + "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True), + "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True), + "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]), + "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.05), + } + + inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1) + output = np.zeros_like(buffers["v4"]) + + for ob_ci in range(4): + q0 = (ob * 4 + ob_ci) * 64 + acc = np.zeros((4, 64), dtype=np.float32) + for kb in range(40): + k0 = kb * 128 + x_chunk = bf16_to_float32( + load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) + ) + gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) + w_chunk = bf16_to_float32( + load_strided_2d(buffers["v5"], offset=k0 * 5120 + q0, rows=128, cols=64, row_stride=5120) + ) + acc += (x_chunk * inv_rms * gamma) @ w_chunk + output = store_strided_2d(output, float32_to_bf16(acc), offset=b0 * 5120 + q0, row_stride=5120) + + write_buffers(meta, buffers) + write_golden(meta, {"v4": output}) + + +if __name__ == "__main__": + main() diff --git 
a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py new file mode 100644 index 000000000..19bcac622 --- /dev/null +++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py @@ -0,0 +1,68 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + meta = load_case_meta() + generator = rng() + b0, ob = load_int32_assignments()[:2] + + buffers = { + "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.05), + "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.05), + "v3": np.zeros(meta.elem_counts["v3"], dtype=meta.np_types["v3"]), + "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.05), + } + + output = np.zeros_like(buffers["v3"]) + + for ob_ci in range(8): + o0 = (ob * 8 + ob_ci) * 
64 + acc = np.zeros((4, 64), dtype=np.float32) + for kb in range(40): + k0 = kb * 128 + attn_chunk = load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) + attn_chunk = bf16_to_float32(float32_to_bf16(attn_chunk)) + w_chunk = bf16_to_float32( + load_strided_2d(buffers["v4"], offset=k0 * 5120 + o0, rows=128, cols=64, row_stride=5120) + ) + acc += attn_chunk @ w_chunk + resid = bf16_to_float32( + load_strided_2d(buffers["v2"], offset=b0 * 5120 + o0, rows=4, cols=64, row_stride=5120) + ) + output = store_strided_2d(output, acc + resid, offset=o0, row_stride=5120) + + write_buffers(meta, buffers) + write_golden(meta, {"v3": output}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py new file mode 100644 index 000000000..2fe6818ae --- /dev/null +++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py @@ -0,0 +1,70 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + meta = load_case_meta() + generator = rng() + o0 = load_int32_assignments()[0] + + buffers = { + "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01), + "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01), + "v3": make_fp32(generator, meta.elem_counts["v3"], scale=0.01), + "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.01), + "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.01), + "v6": np.zeros(meta.elem_counts["v6"], dtype=meta.np_types["v6"]), + } + + gate_acc = np.zeros((4, 64), dtype=np.float32) + up_acc = np.zeros((4, 64), dtype=np.float32) + + for kb in range(40): + k0 = kb * 128 + post_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=k0, rows=4, cols=128, row_stride=5120)) + w_gate = bf16_to_float32( + load_strided_2d(buffers["v4"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600) + ) + w_up = bf16_to_float32( + load_strided_2d(buffers["v5"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600) + ) + gate_acc += post_chunk @ w_gate + up_acc += post_chunk @ w_up + + sigmoid = np.reciprocal(1.0 + np.exp(-gate_acc)) + mlp_chunk = gate_acc * sigmoid * up_acc + output = float32_to_bf16(mlp_chunk) + + write_buffers(meta, buffers) + write_golden(meta, {"v6": output}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py 
b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py new file mode 100644 index 000000000..d913d746e --- /dev/null +++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py @@ -0,0 +1,60 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + meta = load_case_meta() + generator = rng() + dob, o0 = load_int32_assignments()[:2] + + buffers = { + "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01), + "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01), + "v3": make_bf16(generator, meta.elem_counts["v3"], scale=0.01), + } + + output = np.array(buffers["v1"], copy=True) + mlp_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=0, rows=4, cols=64, row_stride=64)) + + for dob_ci in range(4): + d0 = (dob * 4 + dob_ci) * 128 + down_prev = 
load_strided_2d(output, offset=d0, rows=4, cols=128, row_stride=5120).astype(np.float32) + w_down = bf16_to_float32( + load_strided_2d(buffers["v3"], offset=o0 * 5120 + d0, rows=64, cols=128, row_stride=5120) + ) + output = store_strided_2d(output, down_prev + mlp_chunk @ w_down, offset=d0, row_stride=5120) + + write_buffers(meta, buffers) + write_golden(meta, {"v1": output}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py new file mode 100644 index 000000000..59a46b188 --- /dev/null +++ b/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py @@ -0,0 +1,81 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray: + if positive: + return generator.uniform(0.5, 1.5, size=count).astype(np.float32) + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + meta = load_case_meta() + generator = rng() + b0, ob = load_int32_assignments()[:2] + + buffers = { + "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05), + "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True), + "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True), + "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]), + "v5": np.zeros(meta.elem_counts["v5"], dtype=meta.np_types["v5"]), + "v6": make_bf16(generator, meta.elem_counts["v6"], scale=0.05), + "v7": make_bf16(generator, meta.elem_counts["v7"], scale=0.05), + } + + inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1) + k_proj = np.zeros_like(buffers["v4"]) + v_proj = np.zeros_like(buffers["v5"]) + + for ob_ci in range(8): + kv0 = (ob * 8 + ob_ci) * 64 + k_acc = np.zeros((4, 64), dtype=np.float32) + v_acc = np.zeros((4, 64), dtype=np.float32) + for kb in range(40): + k0 = kb * 128 + x_chunk = bf16_to_float32( + load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) + ) + gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) + normed = x_chunk * inv_rms * gamma + wk_chunk = bf16_to_float32( + load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024) + ) + wv_chunk = bf16_to_float32( + 
load_strided_2d(buffers["v7"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024) + ) + k_acc += normed @ wk_chunk + v_acc += normed @ wv_chunk + k_proj = store_strided_2d(k_proj, float32_to_bf16(k_acc), offset=b0 * 1024 + kv0, row_stride=1024) + v_proj = store_strided_2d(v_proj, float32_to_bf16(v_acc), offset=b0 * 1024 + kv0, row_stride=1024) + + write_buffers(meta, buffers) + write_golden(meta, {"v4": k_proj, "v5": v_proj}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/validation_runtime.py b/test/samples/validation_runtime.py index fdeedfd05..247020a8d 100644 --- a/test/samples/validation_runtime.py +++ b/test/samples/validation_runtime.py @@ -89,6 +89,43 @@ def is_a5_soc() -> bool: return '950' in sv or 'a5' in sv or '910_95' in sv +def bf16_to_float32(values: np.ndarray) -> np.ndarray: + values_u16 = np.asarray(values, dtype=np.uint16) + return (values_u16.astype(np.uint32) << 16).view(np.float32) + + +def float32_to_bf16(values: np.ndarray) -> np.ndarray: + values_f32 = np.asarray(values, dtype=np.float32) + bits = values_f32.view(np.uint32) + round_bias = np.uint32(0x7FFF) + ((bits >> 16) & np.uint32(1)) + return ((bits + round_bias) >> 16).astype(np.uint16) + + +def load_strided_2d(buffer, *, offset: int, rows: int, cols: int, row_stride: int) -> np.ndarray: + flat = np.asarray(buffer).reshape(-1) + tile = np.empty((rows, cols), dtype=flat.dtype) + for row in range(rows): + start = offset + row * row_stride + stop = start + cols + if stop > flat.size: + raise ValueError(f'strided load out of bounds: [{start}:{stop}] > {flat.size}') + tile[row, :] = flat[start:stop] + return tile + + +def store_strided_2d(buffer, tile, *, offset: int, row_stride: int): + flat = np.asarray(buffer).reshape(-1) + tile_arr = np.asarray(tile) + rows, cols = tile_arr.shape + for row in range(rows): + start = offset + row * row_stride + stop = start + cols + if stop > flat.size: + raise ValueError(f'strided store out of bounds: [{start}:{stop}] > 
{flat.size}') + flat[start:stop] = tile_arr[row] + return flat + + def float_values(generator, count: int, *, style: str) -> np.ndarray: if style == 'signed': values = generator.uniform(-3.0, 3.0, size=count).astype(np.float32) From 5fad1616b399f86ed0264a1deb51d8aa5e01307a Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 11:04:16 +0800 Subject: [PATCH 05/16] test: align Qwen3Tilelet golden layout --- .gitignore | 4 ---- test/samples/Qwen3Tilelet/README.md | 2 +- .../golden.py => qwen3_decode_layer_incore_10_golden.py} | 0 .../golden.py => qwen3_decode_layer_incore_13_golden.py} | 0 .../golden.py => qwen3_decode_layer_incore_14_golden.py} | 0 .../golden.py => qwen3_decode_layer_incore_1_golden.py} | 0 .../golden.py => qwen3_decode_layer_incore_2_golden.py} | 0 7 files changed, 1 insertion(+), 5 deletions(-) rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_10/golden.py => qwen3_decode_layer_incore_10_golden.py} (100%) rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_13/golden.py => qwen3_decode_layer_incore_13_golden.py} (100%) rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_14/golden.py => qwen3_decode_layer_incore_14_golden.py} (100%) rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_1/golden.py => qwen3_decode_layer_incore_1_golden.py} (100%) rename test/samples/Qwen3Tilelet/{npu_validation/qwen3_decode_layer_incore_2/golden.py => qwen3_decode_layer_incore_2_golden.py} (100%) diff --git a/.gitignore b/.gitignore index 093b87116..44c61b02a 100644 --- a/.gitignore +++ b/.gitignore @@ -64,10 +64,6 @@ dist/ /remote_npu_validation_results*.tsv /npu_validation/ test/samples/**/npu_validation/ -!test/samples/Qwen3Tilelet/npu_validation/ -!test/samples/Qwen3Tilelet/npu_validation/**/ -!test/samples/Qwen3Tilelet/npu_validation/**/golden.py -!test/samples/Qwen3Tilelet/npu_validation/**/compare.py /tmp_gen* # IDE/editor diff --git 
a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md index 9132ac959..3298e0b97 100644 --- a/test/samples/Qwen3Tilelet/README.md +++ b/test/samples/Qwen3Tilelet/README.md @@ -11,4 +11,4 @@ Notes: `qwen3_decode_layer_incore_14`. - This sample directory vendors only those direct `ptoas` regression inputs. - `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation. -- Custom `golden.py` assets cover these 5 cases as standalone mixed-kernel regression tests on A5. +- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `_golden.py`. diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py similarity index 100% rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_10/golden.py rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py similarity index 100% rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_13/golden.py rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py similarity index 100% rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_14/golden.py rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py similarity index 100% rename from 
test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_1/golden.py rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py diff --git a/test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py similarity index 100% rename from test/samples/Qwen3Tilelet/npu_validation/qwen3_decode_layer_incore_2/golden.py rename to test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py From ddd9941f83b6f4bb436715360c5dc1dd1e7be12c Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 11:09:38 +0800 Subject: [PATCH 06/16] chore: add missing PR386 license headers --- .github/scripts/compute_ptoas_version.py | 7 +++++++ test/samples/validation_runtime.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/.github/scripts/compute_ptoas_version.py b/.github/scripts/compute_ptoas_version.py index 85205b6b8..92f4bd627 100644 --- a/.github/scripts/compute_ptoas_version.py +++ b/.github/scripts/compute_ptoas_version.py @@ -1,4 +1,11 @@ #!/usr/bin/env python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
import argparse import pathlib diff --git a/test/samples/validation_runtime.py b/test/samples/validation_runtime.py index 247020a8d..b97f8a861 100644 --- a/test/samples/validation_runtime.py +++ b/test/samples/validation_runtime.py @@ -1,4 +1,12 @@ #!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + import os import re import sys From 7fe8bcf543bbd2ac35d80e8b6cbbf2d152179c4a Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 15:11:28 +0800 Subject: [PATCH 07/16] test: model bf16 requantize in Qwen3 goldens --- .../Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py | 7 ++++++- .../Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py index 5e8eeadbe..3a7d64a75 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py @@ -32,6 +32,10 @@ def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: return float32_to_bf16(make_fp32(generator, count, scale=scale)) +def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray: + return bf16_to_float32(float32_to_bf16(values)) + + def main(): meta = load_case_meta() generator = rng() @@ -57,10 
+61,11 @@ def main(): load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) ) gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) + normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma) w_chunk = bf16_to_float32( load_strided_2d(buffers["v5"], offset=k0 * 5120 + q0, rows=128, cols=64, row_stride=5120) ) - acc += (x_chunk * inv_rms * gamma) @ w_chunk + acc += normed @ w_chunk output = store_strided_2d(output, float32_to_bf16(acc), offset=b0 * 5120 + q0, row_stride=5120) write_buffers(meta, buffers) diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py index 59a46b188..347ca7c0a 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py @@ -32,6 +32,10 @@ def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: return float32_to_bf16(make_fp32(generator, count, scale=scale)) +def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray: + return bf16_to_float32(float32_to_bf16(values)) + + def main(): meta = load_case_meta() generator = rng() @@ -61,7 +65,7 @@ def main(): load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) ) gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) - normed = x_chunk * inv_rms * gamma + normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma) wk_chunk = bf16_to_float32( load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024) ) From 1183a9298643801024de61f190482399238f398f Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 16:38:16 +0800 Subject: [PATCH 08/16] test: regenerate qwen3 tilelet PTO inputs with M16 --- test/samples/Qwen3Tilelet/README.md | 12 +- .../qwen3_decode_layer_incore_0.pto | 23 ++ 
.../qwen3_decode_layer_incore_1.pto | 121 ++--------- .../qwen3_decode_layer_incore_10.pto | 122 ++--------- .../qwen3_decode_layer_incore_10_golden.py | 68 ------ .../qwen3_decode_layer_incore_11.pto | 118 +++++++++++ .../qwen3_decode_layer_incore_12.pto | 31 +++ .../qwen3_decode_layer_incore_13.pto | 121 ++--------- .../qwen3_decode_layer_incore_13_golden.py | 70 ------- .../qwen3_decode_layer_incore_14.pto | 128 +++++++----- .../qwen3_decode_layer_incore_14_golden.py | 60 ------ .../qwen3_decode_layer_incore_15.pto | 47 +++++ .../qwen3_decode_layer_incore_16.pto | 49 +++++ .../qwen3_decode_layer_incore_17.pto | 104 ++++++++++ .../qwen3_decode_layer_incore_18.pto | 75 +++++++ .../qwen3_decode_layer_incore_19.pto | 36 ++++ .../qwen3_decode_layer_incore_1_golden.py | 76 ------- .../qwen3_decode_layer_incore_2.pto | 196 ++++++------------ .../qwen3_decode_layer_incore_2_golden.py | 85 -------- .../qwen3_decode_layer_incore_3.pto | 45 ++++ .../qwen3_decode_layer_incore_4.pto | 46 ++++ .../qwen3_decode_layer_incore_5.pto | 46 ++++ .../qwen3_decode_layer_incore_6.pto | 88 ++++++++ .../qwen3_decode_layer_incore_7.pto | 92 ++++++++ .../qwen3_decode_layer_incore_8.pto | 30 +++ .../qwen3_decode_layer_incore_9.pto | 49 +++++ 26 files changed, 1063 insertions(+), 875 deletions(-) create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto create mode 100644 
test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md index 3298e0b97..4f78ed37f 100644 --- a/test/samples/Qwen3Tilelet/README.md +++ b/test/samples/Qwen3Tilelet/README.md @@ -1,14 +1,10 @@ Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_tilelet.py`. Scope: -- compile-regression inputs for `ptoas` +- direct `ptoas` compile-regression inputs - A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS` Notes: -- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs: - `qwen3_decode_layer_incore_1`, `qwen3_decode_layer_incore_2`, - `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`, - `qwen3_decode_layer_incore_14`. -- This sample directory vendors only those direct `ptoas` regression inputs. 
-- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation. -- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `_golden.py`. +- The current tilelet lowering emits 20 kernel fragments (`aiv`, `aic`, and mixed-kernel `.pto` files). This directory vendors those emitted `.pto` inputs directly, flattened into one sample directory for `runop.sh`. +- These files are regenerated from the tilelet example with `BATCH_TILE=16` / M=16 lowering. +- The directory is compile-regression focused; stale custom NPU-validation goldens for the old M=4 split are intentionally dropped here. diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto new file mode 100644 index 000000000..856f60659 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto @@ -0,0 +1,23 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_0(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %attn_out__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %zero_q__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%zero_q__tile : !pto.tile_buf) + %zero_attn__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + pto.tcvt ins(%zero_q__tile{rmode = #pto} 
: !pto.tile_buf) outs(%zero_attn__tile : !pto.tile_buf) + %q_proj__iter_v1_pview = pto.partition_view %q_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%zero_q__tile : !pto.tile_buf) outs(%q_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) + %attn_out__iter_v1_pview = pto.partition_view %attn_out__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%zero_attn__tile : !pto.tile_buf) outs(%attn_out__iter_v1_pview : !pto.partition_tensor_view<16x64xbf16>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto index 77b0b5c33..2d0902b60 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto @@ -1,116 +1,23 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_1_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c4096 = arith.constant 4096 : i64 + func.func @qwen3_decode_layer_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c0i = arith.constant 0 : i64 - %c16 = arith.constant 16 : index - %c5120 = arith.constant 5120 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c40 = arith.constant 40 : index - %c128 = arith.constant 128 : index - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: 
!pto.tensor_view - %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view - %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_1_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 - %qwen3_decode_layer_incore_1_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer : i32) - scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { - %0 = arith.muli %arg6, %c4 : index - %1 = arith.addi %0, %ob__ci_idx_v0 : index - %2 = arith.muli %1, %c1 : index - %3 = arith.addi %c0, %2 : index - %4 = arith.muli %3, %c64 : index - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %5 = arith.muli %kb__idx_v0, %c128 : index - %wq_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wq_chunk__tile : !pto.tile_buf) - %t__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf - %t__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%t__tile_Left_mat : !pto.tile_buf) outs(%t__tile_Left : !pto.tile_buf) - pto.tfree_from_aiv {split = 0} - %wq_chunk__tile_Right = 
pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%wq_chunk__tile : !pto.tile_buf) outs(%wq_chunk__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%t__tile_Left, %wq_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} - } - } - return - } - func.func @qwen3_decode_layer_incore_1_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c4096 = arith.constant 4096 : i64 - %c4128 = arith.constant 4128 : i64 - %c5152 = arith.constant 5152 : i64 - %c6176 = arith.constant 6176 : i64 - %c8224 = arith.constant 8224 : i64 - %c8736 = arith.constant 8736 : i64 - %c10784 = arith.constant 10784 : i64 - %c11808 = arith.constant 11808 : i64 %c16 = arith.constant 16 : index - %c5120 = arith.constant 5120 : index + %c1024 = arith.constant 1024 : index %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index + %cst = arith.constant 0.000000e+00 : f32 + %c64 = arith.constant 64 : index %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %cst = arith.constant 0.000000e+00 : f32 - %c40 = arith.constant 40 : index - %c128 = arith.constant 128 : index - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view - %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides 
= [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_1_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aic} -> i32 - %qwen3_decode_layer_incore_1_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer_import : i32) - %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<4x1xf32> - pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf) - scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { - %5 = arith.muli %arg6, %c4 : index - %6 = arith.addi %5, %ob__ci_idx_v0 : index - %7 = arith.muli %6, %c1 : index - %8 = arith.addi %c0, %7 : index - %9 = arith.muli %8, %c64 : index - %q_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf - %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf - pto.tmuls ins(%q_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %10 = arith.muli %kb__idx_v0, %c128 : index - %t__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf - %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg5, %10], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xbf16> - pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf) - %x_chunk__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf - pto.tcvt 
ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%x_chunk__tile : !pto.tile_buf) - %gamma__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf - %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %10], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> - pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c6176 : !pto.tile_buf - pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - %normed__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf - pto.tcolexpandmul ins(%1, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c5152 : !pto.tile_buf - pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%2 : !pto.tile_buf) - %t__tile_nz = pto.alloc_tile addr = %c8736 : !pto.tile_buf - pto.tmov ins(%2 : !pto.tile_buf) outs(%t__tile_nz : !pto.tile_buf) - pto.tpush_to_aic(%t__tile_nz : !pto.tile_buf) {split = 0} - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %3 = pto.alloc_tile addr = %c10784 : !pto.tile_buf - pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %q_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf - pto.tmov ins(%3 : !pto.tile_buf) outs(%q_acc__tile_mv : !pto.tile_buf) - } - %4 = pto.alloc_tile addr = %c11808 : !pto.tile_buf - pto.tcvt ins(%0{rmode = #pto} : !pto.tile_buf) outs(%4 : !pto.tile_buf) - %q_proj__co_l1_iter_v3_pview = pto.partition_view %q_proj__co_l0_iter_v3_view, offsets = [%arg5, %9], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> - pto.tstore ins(%4 : !pto.tile_buf) outs(%q_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>) - } + %k_proj__iter_v1_view = pto.make_tensor_view %arg0, shape = 
[%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %zero_k__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%zero_k__tile : !pto.tile_buf) + %zero_v__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%zero_v__tile : !pto.tile_buf) + %k_proj__iter_v1_pview = pto.partition_view %k_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%zero_k__tile : !pto.tile_buf) outs(%k_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) + %v_proj__iter_v1_pview = pto.partition_view %v_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%zero_v__tile : !pto.tile_buf) outs(%v_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) return } } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto index 636b81393..bc49f96e4 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto @@ -1,108 +1,30 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_10_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c4096 = arith.constant 4096 : i64 + func.func @qwen3_decode_layer_incore_10(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 %c16 = arith.constant 16 : index - %c5120 = arith.constant 5120 : index + %c64 = arith.constant 64 : index 
%c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index + %c524288 = arith.constant 524288 : index + %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c64 = arith.constant 64 : index - %c40 = arith.constant 40 : index - %c128 = arith.constant 128 : index - %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_10_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 - %qwen3_decode_layer_incore_10_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer : i32) - scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { - %0 = arith.muli %arg5, %c8 : index - %1 = arith.addi %0, %ob__ci_idx_v0 : index - %2 = arith.muli %1, %c1 : index - %3 = arith.addi %c0, %2 : index - %4 = arith.muli %3, %c64 : index - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %5 = arith.muli %kb__idx_v0, %c128 : index - %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : 
!pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf) - %a_chunk__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf - %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%a_chunk__tile_Left_mat : !pto.tile_buf) outs(%a_chunk__tile_Left : !pto.tile_buf) - pto.tfree_from_aiv {split = 0} - %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%w_chunk__tile : !pto.tile_buf) outs(%w_chunk__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} - } - } - return - } - func.func @qwen3_decode_layer_incore_10_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c4096 = arith.constant 4096 : i64 - %c5120 = arith.constant 5120 : i64 - %c7168 = arith.constant 7168 : i64 - %c8192 = arith.constant 8192 : i64 - %c10240 = arith.constant 10240 : i64 - %c11264 = arith.constant 11264 : i64 - %c9216 = arith.constant 9216 : i64 - %c16 = arith.constant 16 : index - %4 = arith.constant 5120 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c64 = arith.constant 64 : index - %cst = arith.constant 0.000000e+00 : f32 - %c40 = arith.constant 40 : index - %c128 = arith.constant 128 : index - %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view - %resid1_tile__co_l0_iter_v1_view = 
pto.make_tensor_view %arg2, shape = [%c4, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view - %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%4, %4], strides = [%4, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_10_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aic} -> i32 - %qwen3_decode_layer_incore_10_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer_import : i32) - scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { - %5 = arith.muli %arg5, %c8 : index - %6 = arith.addi %5, %ob__ci_idx_v0 : index - %7 = arith.muli %6, %c1 : index - %8 = arith.addi %c0, %7 : index - %9 = arith.muli %8, %c64 : index - %o_acc__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf - pto.tmuls ins(%o_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %10 = arith.muli %kb__idx_v0, %c128 : index - %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf - %attn_out__rv_v2_pview = pto.partition_view %attn_out__rv_v2_view, offsets = [%arg4, %10], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xf32> - pto.tload ins(%attn_out__rv_v2_pview : !pto.partition_tensor_view<4x128xf32>) outs(%t__tile : !pto.tile_buf) - %a_chunk__tile = pto.alloc_tile addr = %c7168 : !pto.tile_buf - pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%a_chunk__tile : !pto.tile_buf) - %a_chunk__tile_nz = pto.alloc_tile addr = %c8192 : !pto.tile_buf - pto.tmov ins(%a_chunk__tile : !pto.tile_buf) outs(%a_chunk__tile_nz 
: !pto.tile_buf) - pto.tpush_to_aic(%a_chunk__tile_nz : !pto.tile_buf) {split = 0} - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %1 = pto.alloc_tile addr = %c10240 : !pto.tile_buf - pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %o_acc__tile_mv = pto.alloc_tile addr = %c4096 : !pto.tile_buf - pto.tmov ins(%1 : !pto.tile_buf) outs(%o_acc__tile_mv : !pto.tile_buf) - } - %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf - %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %9], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> - pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%2 : !pto.tile_buf) - %resid__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf - pto.tcvt ins(%2{rmode = #pto} : !pto.tile_buf) outs(%resid__tile : !pto.tile_buf) - %3 = pto.alloc_tile addr = %c4096 : !pto.tile_buf - pto.tadd ins(%0, %resid__tile : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) - %resid1_tile__co_l1_iter_v1_pview = pto.partition_view %resid1_tile__co_l0_iter_v1_view, offsets = [%c0, %9], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xf32> - pto.tstore ins(%3 : !pto.tile_buf) outs(%resid1_tile__co_l1_iter_v1_pview : !pto.partition_tensor_view<4x64xf32>) - } + %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, 
%c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> + pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%v_tile__tile : !pto.tile_buf) outs(%v_tile__tile_Right : !pto.tile_buf) + %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%oi_tmp_pad__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>) return } } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py deleted file mode 100644 index 19bcac622..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import numpy as np - -from validation_runtime import ( - bf16_to_float32, - float32_to_bf16, - load_case_meta, - load_int32_assignments, - load_strided_2d, - rng, - store_strided_2d, - write_buffers, - write_golden, -) - - -def make_fp32(generator, count: int, *, scale: float = 0.05) -> np.ndarray: - return generator.uniform(-scale, scale, size=count).astype(np.float32) - - -def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: - return float32_to_bf16(make_fp32(generator, count, scale=scale)) - - -def main(): - meta = load_case_meta() - generator = rng() - b0, ob = load_int32_assignments()[:2] - - buffers = { - "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.05), - "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.05), - "v3": np.zeros(meta.elem_counts["v3"], dtype=meta.np_types["v3"]), - "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.05), - } - - output = np.zeros_like(buffers["v3"]) - - for ob_ci in range(8): - o0 = (ob * 8 + ob_ci) * 64 - acc = np.zeros((4, 64), dtype=np.float32) - for kb in range(40): - k0 = kb * 128 - attn_chunk = load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) - attn_chunk = bf16_to_float32(float32_to_bf16(attn_chunk)) - w_chunk = bf16_to_float32( - load_strided_2d(buffers["v4"], offset=k0 * 5120 + o0, rows=128, cols=64, row_stride=5120) - ) - acc += attn_chunk @ w_chunk - resid = bf16_to_float32( - load_strided_2d(buffers["v2"], offset=b0 * 5120 + o0, rows=4, cols=64, row_stride=5120) - ) - output = store_strided_2d(output, acc + resid, offset=o0, row_stride=5120) - - write_buffers(meta, buffers) - write_golden(meta, {"v3": output}) - - -if __name__ 
== "__main__": - main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto new file mode 100644 index 000000000..9a8a29a01 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto @@ -0,0 +1,118 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_11(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c32 = arith.constant 32 : i64 + %c64 = arith.constant 64 : i64 + %c96 = arith.constant 96 : i64 + %c128 = arith.constant 128 : i64 + %c4224 = arith.constant 4224 : i64 + %c8320 = arith.constant 8320 : i64 + %c12416 = arith.constant 12416 : i64 + %c12448 = arith.constant 12448 : i64 + %c12480 = arith.constant 12480 : i64 + %c12512 = arith.constant 12512 : i64 + %c12544 = arith.constant 12544 : i64 + %c12576 = arith.constant 12576 : i64 + %c12608 = arith.constant 12608 : i64 + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp_pad__ssa_v1_view = 
pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c12416 : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c12448 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : 
!pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf + %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %arg9, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c12416 : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c12448 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c12480 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c12480 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c12480 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c12512 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf + 
%alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c12512 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c12512 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c12480 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c12512 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c12608 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c12608 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c12608 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + pto.tadd 
ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto new file mode 100644 index 000000000..a9c4f9bee --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto @@ -0,0 +1,31 @@ +module 
attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_12(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c32 = arith.constant 32 : i64 + %c4128 = arith.constant 4128 : i64 + %c1 = arith.constant 1 : index + %c8192 = arith.constant 8192 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) + %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf, !pto.tile_buf) outs(%ctx__tile : !pto.tile_buf) + %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %ctx_flat_bf16__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf + pto.tcvt ins(%ctx_flat__tile{rmode = #pto} : !pto.tile_buf) outs(%ctx_flat_bf16__tile : !pto.tile_buf) + %0 = arith.muli %arg3, %c128 : 
index + %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view -> !pto.partition_tensor_view<1x1024xbf16> + pto.tstore ins(%ctx_flat_bf16__tile : !pto.tile_buf) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xbf16>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto index 6eedee90d..8b38aaf7e 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto @@ -1,116 +1,21 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_13_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @qwen3_decode_layer_incore_13(%arg0: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { %c0i = arith.constant 0 : i64 - %c1024 = arith.constant 1024 : i64 - %c17408 = arith.constant 17408 : i64 - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index %c1 = arith.constant 1 : index - %c5120 = arith.constant 5120 : index - %c25600 = arith.constant 25600 : index - %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c128 = arith.constant 128 : index - %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = 
[%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_13_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_13_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %1 = arith.muli %kb__idx_v0, %c128 : index - %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xbf16> - pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf) - %wg__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf) - %wu__tile = pto.alloc_tile addr = %c17408 : !pto.tile_buf - %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf) - %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%post_chunk__tile : 
!pto.tile_buf) outs(%post_chunk__tile_Left : !pto.tile_buf) - %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%wg__tile : !pto.tile_buf) outs(%wg__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} - %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%wu__tile : !pto.tile_buf) outs(%wu__tile_Right : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %resid1_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + scf.for %ob__idx_v0 = %c0 to %c128 step %c1 { + %0 = arith.muli %ob__idx_v0, %c64 : index + %zero_resid1__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%zero_resid1__tile : !pto.tile_buf) + %resid1_tile__iter_v1_pview = pto.partition_view %resid1_tile__ssa_v0_view, offsets = [%c0, %0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%zero_resid1__tile : !pto.tile_buf) outs(%resid1_tile__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) } return } - func.func @qwen3_decode_layer_incore_13_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c8192 = arith.constant 8192 : i64 - %c9216 = arith.constant 9216 : i64 - %c11264 = arith.constant 11264 : i64 - %c12288 = arith.constant 12288 : i64 - %c10240 = arith.constant 
10240 : i64 - %c13312 = arith.constant 13312 : i64 - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c5120 = arith.constant 5120 : index - %c25600 = arith.constant 25600 : index - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %c40 = arith.constant 40 : index - %cst_1 = arith.constant 1.000000e+00 : f32 - %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_13_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", size = 8192, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) - %gate_acc__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf - %gate_acc__ssa_v0_pview = pto.partition_view %gate_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xf32> - pto.tload ins(%gate_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%gate_acc__tile : !pto.tile_buf) - 
%up_acc__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf - %up_acc__ssa_v0_pview = pto.partition_view %up_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xf32> - pto.tload ins(%up_acc__ssa_v0_pview : !pto.partition_tensor_view<4x64xf32>) outs(%up_acc__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf - pto.tmuls ins(%gate_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c9216 : !pto.tile_buf - pto.tmuls ins(%up_acc__tile, %cst : !pto.tile_buf, f32) outs(%1 : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %2 = pto.alloc_tile addr = %c11264 : !pto.tile_buf - pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %3 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %4 = pto.alloc_tile addr = %c12288 : !pto.tile_buf - pto.tadd ins(%1, %3 : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %gate_acc__tile_mv = pto.alloc_tile addr = %c8192 : !pto.tile_buf - pto.tmov ins(%2 : !pto.tile_buf) outs(%gate_acc__tile_mv : !pto.tile_buf) - %up_acc__tile_mv = pto.alloc_tile addr = %c9216 : !pto.tile_buf - pto.tmov ins(%4 : !pto.tile_buf) outs(%up_acc__tile_mv : !pto.tile_buf) - } - %t__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf - pto.tneg ins(%0 : !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %5 = pto.alloc_tile addr = %c10240 : !pto.tile_buf - pto.texp ins(%t__tile : !pto.tile_buf) outs(%5 : !pto.tile_buf) - %6 = pto.alloc_tile addr = %c10240 : !pto.tile_buf - pto.tadds ins(%5, %cst_1 : !pto.tile_buf, f32) outs(%6 : !pto.tile_buf) - %sigmoid__tile = pto.alloc_tile addr = %c11264 : !pto.tile_buf - pto.trecip ins(%6 : !pto.tile_buf) outs(%sigmoid__tile : !pto.tile_buf) - %7 = pto.alloc_tile addr = %c8192 : !pto.tile_buf - pto.tmul ins(%0, 
%sigmoid__tile : !pto.tile_buf, !pto.tile_buf) outs(%7 : !pto.tile_buf) - %mlp_chunk__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf - pto.tmul ins(%7, %1 : !pto.tile_buf, !pto.tile_buf) outs(%mlp_chunk__tile : !pto.tile_buf) - %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c13312 : !pto.tile_buf - pto.tcvt ins(%mlp_chunk__tile{rmode = #pto} : !pto.tile_buf) outs(%mlp_chunk_bf16__tile : !pto.tile_buf) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> - pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<4x64xbf16>) - return - } } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py deleted file mode 100644 index 2fe6818ae..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -import numpy as np - -from validation_runtime import ( - bf16_to_float32, - float32_to_bf16, - load_case_meta, - load_int32_assignments, - load_strided_2d, - rng, - write_buffers, - write_golden, -) - - -def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray: - return generator.uniform(-scale, scale, size=count).astype(np.float32) - - -def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray: - return float32_to_bf16(make_fp32(generator, count, scale=scale)) - - -def main(): - meta = load_case_meta() - generator = rng() - o0 = load_int32_assignments()[0] - - buffers = { - "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01), - "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01), - "v3": make_fp32(generator, meta.elem_counts["v3"], scale=0.01), - "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.01), - "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.01), - "v6": np.zeros(meta.elem_counts["v6"], dtype=meta.np_types["v6"]), - } - - gate_acc = np.zeros((4, 64), dtype=np.float32) - up_acc = np.zeros((4, 64), dtype=np.float32) - - for kb in range(40): - k0 = kb * 128 - post_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=k0, rows=4, cols=128, row_stride=5120)) - w_gate = bf16_to_float32( - load_strided_2d(buffers["v4"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600) - ) - w_up = bf16_to_float32( - load_strided_2d(buffers["v5"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600) - ) - gate_acc += post_chunk @ w_gate - up_acc += post_chunk @ w_up - - sigmoid = np.reciprocal(1.0 + np.exp(-gate_acc)) - mlp_chunk = gate_acc * sigmoid * up_acc - output = float32_to_bf16(mlp_chunk) - - write_buffers(meta, buffers) - write_golden(meta, {"v6": output}) - - -if __name__ == "__main__": - main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto index 725a529d2..0a0172824 100644 --- 
a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto @@ -1,72 +1,90 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c0i = arith.constant 0 : i64 - %c16384 = arith.constant 16384 : i64 - %c4 = arith.constant 4 : index - %c5120 = arith.constant 5120 : index + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c25600 = arith.constant 25600 : index %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: 
!pto.tensor_view + %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view %qwen3_decode_layer_incore_14_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_14_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { - %0 = arith.muli %arg3, %c4 : index - %1 = arith.addi %0, %dob__ci_idx_v0 : index + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %0 = arith.muli %arg5, %c8 : index + %1 = arith.addi %0, %ob__ci_idx_v0 : index %2 = arith.muli %1, %c1 : index %3 = arith.addi %c0, %2 : index - %4 = arith.muli %3, %c128 : index - %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> - pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf) - %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf - %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> - pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<4x64xbf16>) outs(%lhs_mat : !pto.tile_buf) - %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) - %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - 
pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf) outs(%w_down_chunk__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + %4 = arith.muli %3, %c64 : index + scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { + %5 = arith.muli %kb__idx_v0, %c128 : index + %a_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %attn_out__rv_v5_pview = pto.partition_view %attn_out__rv_v5_view, offsets = [%arg4, %5], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%attn_out__rv_v5_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%a_chunk__tile : !pto.tile_buf) + %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf) + %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%a_chunk__tile : !pto.tile_buf) outs(%a_chunk__tile_Left : !pto.tile_buf) + %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%w_chunk__tile : !pto.tile_buf) outs(%w_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } } return } - func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c16384 = arith.constant 16384 : i64 - %c4 = arith.constant 4 : index - %c5120 = arith.constant 
5120 : index + func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c32768 = arith.constant 32768 : i64 + %c40960 = arith.constant 40960 : i64 + %c45056 = arith.constant 45056 : i64 + %c36864 = arith.constant 36864 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c25600 = arith.constant 25600 : index %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c4, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c4, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 2048} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { - %0 = arith.muli %arg3, %c4 : index - %1 = arith.addi %0, %dob__ci_idx_v0 : index - %2 = arith.muli %1, %c1 : index - %3 = arith.addi %c0, %2 : index - %4 = arith.muli %3, %c128 : index - %down_prev__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf - %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xf32> - pto.tload 
ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<4x128xf32>) outs(%down_prev__tile : !pto.tile_buf) - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %down_next__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf - pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%down_next__tile : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xf32> - pto.tstore ins(%down_next__tile : !pto.tile_buf) outs(%5 : !pto.partition_tensor_view<4x128xf32>) + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 32768, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %2 = arith.muli %arg5, %c8 : index + %3 = arith.addi %2, %ob__ci_idx_v0 : index + %4 = arith.muli %3, %c1 : index + %5 = arith.addi %c0, %4 : index + %6 = arith.muli %5, %c64 : index + %o_acc__tile = pto.alloc_tile 
addr = %c32768 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%o_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %0 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tadd ins(%o_acc__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %o_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmov ins(%0 : !pto.tile_buf) outs(%o_acc__tile_mv : !pto.tile_buf) + } + %t__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %6], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%t__tile : !pto.tile_buf) + %resid__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%resid__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tadd ins(%o_acc__tile, %resid__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %resid1_tile__co_l1_iter_v4_pview = pto.partition_view %resid1_tile__co_l0_iter_v4_view, offsets = [%c0, %6], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%1 : !pto.tile_buf) outs(%resid1_tile__co_l1_iter_v4_pview : !pto.partition_tensor_view<16x64xf32>) } return } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py deleted file mode 100644 index d913d746e..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import numpy as np - -from validation_runtime import ( - bf16_to_float32, - float32_to_bf16, - load_case_meta, - load_int32_assignments, - load_strided_2d, - rng, - store_strided_2d, - write_buffers, - write_golden, -) - - -def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray: - return generator.uniform(-scale, scale, size=count).astype(np.float32) - - -def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray: - return float32_to_bf16(make_fp32(generator, count, scale=scale)) - - -def main(): - meta = load_case_meta() - generator = rng() - dob, o0 = load_int32_assignments()[:2] - - buffers = { - "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01), - "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01), - "v3": make_bf16(generator, meta.elem_counts["v3"], scale=0.01), - } - - output = np.array(buffers["v1"], copy=True) - mlp_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=0, rows=4, cols=64, row_stride=64)) - - for dob_ci in range(4): - d0 = (dob * 4 + dob_ci) * 128 - down_prev = load_strided_2d(output, offset=d0, rows=4, cols=128, row_stride=5120).astype(np.float32) - w_down = bf16_to_float32( - load_strided_2d(buffers["v3"], offset=o0 * 5120 + d0, rows=64, cols=128, row_stride=5120) - ) - output = store_strided_2d(output, down_prev + mlp_chunk @ w_down, offset=d0, row_stride=5120) - - write_buffers(meta, 
buffers) - write_golden(meta, {"v1": output}) - - -if __name__ == "__main__": - main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto new file mode 100644 index 000000000..a45c9a509 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto @@ -0,0 +1,47 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_15(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8256 = arith.constant 8256 : i64 + %c16448 = arith.constant 16448 : i64 + %c16512 = arith.constant 16512 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %5 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %cst_1 = arith.constant 1.220703e-04 : f32 + %cst_2 = arith.constant 1.000000e-06 : f32 + %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout}: !pto.tensor_view + %sq_sum__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%sq_sum__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %5 step %c1 { + %6 = arith.muli %kb__idx_v0, %c128 : index + %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %6], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + 
pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %tmp_tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf + pto.trowsum ins(%t__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16448 : !pto.tile_buf + %2 = pto.alloc_tile addr = %c16512 : !pto.tile_buf + pto.tadd ins(%sq_sum__tile, %1 : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %sq_sum__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%sq_sum__tile_mv : !pto.tile_buf) + } + %3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmuls ins(%sq_sum__tile, %cst_1 : !pto.tile_buf, f32) outs(%3 : !pto.tile_buf) + %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tadds ins(%3, %cst_2 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) + %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.trsqrt ins(%4 : !pto.tile_buf) outs(%inv_rms__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + pto.tstore ins(%inv_rms__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<1x16xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto new file mode 100644 index 000000000..f9fa660d1 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto @@ -0,0 +1,49 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_16(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8256 = arith.constant 8256 : i64 + %c8768 = arith.constant 8768 : i64 + %c16 = arith.constant 16 : 
index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %2 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %cst = arith.constant 0.000000e+00 : f32 + %down_proj_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_norm_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_rms_weight__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg4, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %inv_rms__ssa_v0_pview = pto.partition_view %inv_rms__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + pto.tload ins(%inv_rms__ssa_v0_pview : !pto.partition_tensor_view<1x16xf32>) outs(%inv_rms__tile : !pto.tile_buf) + scf.for %zi__idx_v0 = %c0 to %2 step %c1 { + %3 = arith.muli %zi__idx_v0, %c128 : index + %down_zero_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%down_zero_chunk__tile : !pto.tile_buf) + %down_proj_tile__iter_v1_pview = pto.partition_view %down_proj_tile__ssa_v0_view, offsets = [%c0, %3], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%down_zero_chunk__tile : !pto.tile_buf) outs(%down_proj_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) + } + scf.for %kb__idx_v0 = %c0 to %2 step %c1 { + %4 = arith.muli %kb__idx_v0, %c128 : index + %x_chunk__tile = pto.alloc_tile 
addr = %c64 : !pto.tile_buf + %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf) + %gamma__tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf + %post_rms_weight__ssa_v0_pview = pto.partition_view %post_rms_weight__ssa_v0_view, offsets = [%c0, %4], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%post_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %0 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.trowexpandmul ins(%x_chunk__tile, %t__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %normed__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tcolexpandmul ins(%0, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c8768 : !pto.tile_buf + pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%1 : !pto.tile_buf) + %post_norm_tile__iter_v1_pview = pto.partition_view %post_norm_tile__ssa_v0_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tstore ins(%1 : !pto.tile_buf) outs(%post_norm_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto new file mode 100644 index 000000000..ae6570c56 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto @@ -0,0 +1,104 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_17_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, 
%arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c20480 = arith.constant 20480 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c25600 = arith.constant 25600 : index + %c64 = arith.constant 64 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_17_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_17_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf) + %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, 
offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf) + %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf + %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf) + %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%post_chunk__tile : !pto.tile_buf) outs(%post_chunk__tile_Left : !pto.tile_buf) + %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wg__tile : !pto.tile_buf) outs(%wg__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wu__tile : !pto.tile_buf) outs(%wu__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} + } + return + } + func.func @qwen3_decode_layer_incore_17_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c32768 = arith.constant 32768 : i64 + %c36864 = arith.constant 36864 : i64 + %c45056 = arith.constant 45056 : i64 + %c49152 = arith.constant 49152 : i64 + %c40960 = arith.constant 40960 : i64 + %c53248 = arith.constant 53248 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + 
%c25600 = arith.constant 25600 : index + %c64 = arith.constant 64 : index + %c0_i32 = arith.constant 0 : i32 + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %cst_1 = arith.constant 1.000000e+00 : f32 + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_17_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", size = 32768, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%gate_acc__tile : !pto.tile_buf) + %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%up_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %0 = pto.alloc_tile addr = %c45056 : !pto.tile_buf + pto.tadd ins(%gate_acc__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %1 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %2 = pto.alloc_tile addr = %c49152 : !pto.tile_buf + pto.tadd ins(%up_acc__tile, %1 : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %gate_acc__tile_mv = 
pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmov ins(%0 : !pto.tile_buf) outs(%gate_acc__tile_mv : !pto.tile_buf) + %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%up_acc__tile_mv : !pto.tile_buf) + } + %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tneg ins(%gate_acc__tile : !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.texp ins(%t__tile : !pto.tile_buf) outs(%3 : !pto.tile_buf) + %4 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tadds ins(%3, %cst_1 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) + %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf + pto.trecip ins(%4 : !pto.tile_buf) outs(%sigmoid__tile : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmul ins(%gate_acc__tile, %sigmoid__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmul ins(%5, %up_acc__tile : !pto.tile_buf, !pto.tile_buf) outs(%mlp_chunk__tile : !pto.tile_buf) + %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c53248 : !pto.tile_buf + pto.tcvt ins(%mlp_chunk__tile{rmode = #pto} : !pto.tile_buf) outs(%mlp_chunk_bf16__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto new file mode 100644 index 000000000..3228a9f80 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto @@ -0,0 +1,75 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_18_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, 
%arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_18_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_18_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c128 : index + %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> + pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %mlp_chunk_bf16__ssa_v0_pview = 
pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf) outs(%w_down_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } + return + } + func.func @qwen3_decode_layer_incore_18_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c65536 = arith.constant 65536 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_18_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", size = 65536, location = #pto.address_space, auto = 
false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c128 : index + %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf + %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf) + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf + pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%down_next__tile : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%down_next__tile : !pto.tile_buf) outs(%5 : !pto.partition_tensor_view<16x128xf32>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto new file mode 100644 index 000000000..776c7aed2 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto @@ -0,0 +1,36 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_19(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c8192 = arith.constant 8192 : i64 + %c16384 = arith.constant 16384 : i64 
+ %c16 = arith.constant 16 : index + %2 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view + %out__co_l0_iter_v3_view = pto.make_tensor_view %arg1, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view + scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { + %3 = arith.muli %arg4, %c4 : index + %4 = arith.addi %3, %ob__ci_idx_v0 : index + %5 = arith.muli %4, %c1 : index + %6 = arith.addi %c0, %5 : index + %7 = arith.muli %6, %c128 : index + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %down_proj_tile__rv_v5_pview = pto.partition_view %down_proj_tile__rv_v5_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%down_proj_tile__rv_v5_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf + %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%0 : !pto.tile_buf) + %down_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tadd ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%down_acc__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16384 : !pto.tile_buf + pto.tcvt ins(%down_acc__tile{rmode = #pto} : !pto.tile_buf) outs(%1 : !pto.tile_buf) + %out__co_l1_iter_v3_pview = pto.partition_view %out__co_l0_iter_v3_view, offsets = [%arg3, %7], sizes = [%c16, 
%c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tstore ins(%1 : !pto.tile_buf) outs(%out__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x128xbf16>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py deleted file mode 100644 index 3a7d64a75..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -import numpy as np - -from validation_runtime import ( - bf16_to_float32, - float32_to_bf16, - load_case_meta, - load_int32_assignments, - load_strided_2d, - rng, - store_strided_2d, - write_buffers, - write_golden, -) - - -def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray: - if positive: - return generator.uniform(0.5, 1.5, size=count).astype(np.float32) - return generator.uniform(-scale, scale, size=count).astype(np.float32) - - -def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: - return float32_to_bf16(make_fp32(generator, count, scale=scale)) - - -def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray: - return bf16_to_float32(float32_to_bf16(values)) - - -def main(): - meta = load_case_meta() - generator = rng() - b0, ob = load_int32_assignments()[:2] - - buffers = { - "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05), - "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True), - "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True), - "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]), - "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.05), - } - - inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1) - output = np.zeros_like(buffers["v4"]) - - for ob_ci in range(4): - q0 = (ob * 4 + ob_ci) * 64 - acc = np.zeros((4, 64), dtype=np.float32) - for kb in range(40): - k0 = kb * 128 - x_chunk = bf16_to_float32( - load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) - ) - gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) - normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma) - w_chunk = bf16_to_float32( - load_strided_2d(buffers["v5"], offset=k0 * 5120 + q0, rows=128, cols=64, row_stride=5120) - ) - acc += normed @ w_chunk - output = store_strided_2d(output, float32_to_bf16(acc), offset=b0 * 5120 + q0, 
row_stride=5120) - - write_buffers(meta, buffers) - write_golden(meta, {"v4": output}) - - -if __name__ == "__main__": - main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto index dc6456847..9fbf4425d 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto @@ -1,147 +1,67 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_2_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c4096 = arith.constant 4096 : i64 - %c20480 = arith.constant 20480 : i64 + func.func @qwen3_decode_layer_incore_2(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c4160 = arith.constant 4160 : i64 + %c12352 = arith.constant 12352 : i64 + %c20544 = arith.constant 20544 : i64 + %c20608 = arith.constant 20608 : i64 + %c20672 = arith.constant 20672 : i64 %c16 = arith.constant 16 : index - %c5120 = arith.constant 5120 : index + %c8192 = arith.constant 8192 : index %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c1024 = arith.constant 1024 : index + %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c64 = arith.constant 64 : index - %c40 = arith.constant 40 : index - %c128 = arith.constant 128 : index - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %inv_rms_tile__ssa_v0_view = pto.make_tensor_view 
%arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view - %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_2_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 - %qwen3_decode_layer_incore_2_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer : i32) - scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { - %1 = arith.muli %arg8, %c8 : index - %2 = arith.addi %1, %ob__ci_idx_v0 : index - %3 = arith.muli %2, %c1 : index - %4 = arith.addi %c0, %3 : index - %5 = arith.muli %4, %c64 : index - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %6 = arith.muli %kb__idx_v0, %c128 : index - %wk_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wk_chunk__tile : !pto.tile_buf) - %wv_chunk__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf 
- %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wv_chunk__tile : !pto.tile_buf) - %normed_bf16__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf - %normed_bf16__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%normed_bf16__tile_Left_mat : !pto.tile_buf) outs(%normed_bf16__tile_Left : !pto.tile_buf) - pto.tfree_from_aiv {split = 0} - %wk_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%wk_chunk__tile : !pto.tile_buf) outs(%wk_chunk__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%normed_bf16__tile_Left, %wk_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} - %wv_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%wv_chunk__tile : !pto.tile_buf) outs(%wv_chunk__tile_Right : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%normed_bf16__tile_Left, %wv_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} - } + %10 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %cst_1 = arith.constant 1.220703e-04 : f32 + %cst_2 = arith.constant 1.000000e-06 : f32 + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %normed_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %partial_sq__tile = 
pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%partial_sq__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %10 step %c1 { + %11 = arith.muli %kb__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %11], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf) + %x_chunk__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%x_chunk__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c4160 : !pto.tile_buf + pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %tmp_tile = pto.alloc_tile addr = %c12352 : !pto.tile_buf + %1 = pto.alloc_tile addr = %c20544 : !pto.tile_buf + pto.trowsum ins(%0, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c20544 : !pto.tile_buf + %3 = pto.alloc_tile addr = %c20608 : !pto.tile_buf + pto.tadd ins(%partial_sq__tile, %2 : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + %partial_sq__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%partial_sq__tile_mv : !pto.tile_buf) } - return - } - func.func @qwen3_decode_layer_incore_2_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c4096 = arith.constant 4096 : i64 - %c4128 = arith.constant 4128 : i64 - %c5152 = arith.constant 5152 : i64 - %c6176 = arith.constant 6176 : i64 - %c7200 = arith.constant 7200 : i64 - %c9248 = arith.constant 9248 : i64 - %c9760 = arith.constant 9760 : i64 - %c11808 = arith.constant 11808 : i64 - %c12832 = 
arith.constant 12832 : i64 - %c13856 = arith.constant 13856 : i64 - %c16 = arith.constant 16 : index - %c5120 = arith.constant 5120 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c64 = arith.constant 64 : index - %cst = arith.constant 0.000000e+00 : f32 - %c40 = arith.constant 40 : index - %c128 = arith.constant 128 : index - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view - %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c4, %c1], strides = [%c1, %c4] {layout = #pto.layout}: !pto.tensor_view - %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_2_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aic} -> i32 - %qwen3_decode_layer_incore_2_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", size = 4096, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer : i32, 
v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer_import : i32) - %inv_rms_tile__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c4, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<4x1xf32> - pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<4x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf) - scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { - %8 = arith.muli %arg8, %c8 : index - %9 = arith.addi %8, %ob__ci_idx_v0 : index - %10 = arith.muli %9, %c1 : index - %11 = arith.addi %c0, %10 : index - %12 = arith.muli %11, %c64 : index - %k_acc__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf - %v_acc__tile = pto.alloc_tile addr = %c5152 : !pto.tile_buf - %0 = pto.alloc_tile addr = %c4128 : !pto.tile_buf - pto.tmuls ins(%k_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c5152 : !pto.tile_buf - pto.tmuls ins(%v_acc__tile, %cst : !pto.tile_buf, f32) outs(%1 : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { - %13 = arith.muli %kb__idx_v0, %c128 : index - %t__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf - %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg7, %13], sizes = [%c4, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<4x128xbf16> - pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<4x128xbf16>) outs(%t__tile : !pto.tile_buf) - %x_chunk__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf - pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%x_chunk__tile : !pto.tile_buf) - %gamma__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf - %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> - pto.tload ins(%input_rms_weight__ssa_v0_pview : 
!pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c7200 : !pto.tile_buf - pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) - %normed__tile = pto.alloc_tile addr = %c7200 : !pto.tile_buf - pto.tcolexpandmul ins(%2, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) - %normed_bf16__tile = pto.alloc_tile addr = %c6176 : !pto.tile_buf - pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%normed_bf16__tile : !pto.tile_buf) - %normed_bf16__tile_nz = pto.alloc_tile addr = %c9760 : !pto.tile_buf - pto.tmov ins(%normed_bf16__tile : !pto.tile_buf) outs(%normed_bf16__tile_nz : !pto.tile_buf) - pto.tpush_to_aic(%normed_bf16__tile_nz : !pto.tile_buf) {split = 0} - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %3 = pto.alloc_tile addr = %c11808 : !pto.tile_buf - pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %4 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %5 = pto.alloc_tile addr = %c12832 : !pto.tile_buf - pto.tadd ins(%1, %4 : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %k_acc__tile_mv = pto.alloc_tile addr = %c4128 : !pto.tile_buf - pto.tmov ins(%3 : !pto.tile_buf) outs(%k_acc__tile_mv : !pto.tile_buf) - %v_acc__tile_mv = pto.alloc_tile addr = %c5152 : !pto.tile_buf - pto.tmov ins(%5 : !pto.tile_buf) outs(%v_acc__tile_mv : !pto.tile_buf) - } - %6 = pto.alloc_tile addr = %c13856 : !pto.tile_buf - pto.tcvt ins(%0{rmode = #pto} : !pto.tile_buf) outs(%6 : !pto.tile_buf) - %k_proj__co_l1_iter_v3_pview = pto.partition_view %k_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> - pto.tstore ins(%6 : !pto.tile_buf) outs(%k_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>) - %7 = 
pto.alloc_tile addr = %c13856 : !pto.tile_buf - pto.tcvt ins(%1{rmode = #pto} : !pto.tile_buf) outs(%7 : !pto.tile_buf) - %v_proj__co_l1_iter_v3_pview = pto.partition_view %v_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c4, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<4x64xbf16> - pto.tstore ins(%7 : !pto.tile_buf) outs(%v_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<4x64xbf16>) + %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmuls ins(%partial_sq__tile, %cst_1 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tadds ins(%4, %cst_2 : !pto.tile_buf, f32) outs(%5 : !pto.tile_buf) + %variance__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + scf.for %12 = %c0 to %10 step %c1 { + %13 = arith.muli %12, %c128 : index + %6 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %14 = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %13], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%14 : !pto.partition_tensor_view<16x128xbf16>) outs(%6 : !pto.tile_buf) + %7 = pto.alloc_tile addr = %c4160 : !pto.tile_buf + pto.tcvt ins(%6{rmode = #pto} : !pto.tile_buf) outs(%7 : !pto.tile_buf) + %gamma__tile = pto.alloc_tile addr = %c20672 : !pto.tile_buf + %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) + %8 = pto.alloc_tile addr = %c4160 : !pto.tile_buf + pto.trowexpandmul ins(%7, %variance__tile : !pto.tile_buf, !pto.tile_buf) outs(%8 : !pto.tile_buf) + %normed__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf + pto.tcolexpandmul ins(%8, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) + %9 = pto.alloc_tile addr = %c64 : !pto.tile_buf 
+ pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%9 : !pto.tile_buf) + %normed_tile__iter_v1_pview = pto.partition_view %normed_tile__ssa_v0_view, offsets = [%c0, %13], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tstore ins(%9 : !pto.tile_buf) outs(%normed_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>) } return } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py deleted file mode 100644 index 347ca7c0a..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -import numpy as np - -from validation_runtime import ( - bf16_to_float32, - float32_to_bf16, - load_case_meta, - load_int32_assignments, - load_strided_2d, - rng, - store_strided_2d, - write_buffers, - write_golden, -) - - -def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray: - if positive: - return generator.uniform(0.5, 1.5, size=count).astype(np.float32) - return generator.uniform(-scale, scale, size=count).astype(np.float32) - - -def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: - return float32_to_bf16(make_fp32(generator, count, scale=scale)) - - -def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray: - return bf16_to_float32(float32_to_bf16(values)) - - -def main(): - meta = load_case_meta() - generator = rng() - b0, ob = load_int32_assignments()[:2] - - buffers = { - "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05), - "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True), - "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True), - "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]), - "v5": np.zeros(meta.elem_counts["v5"], dtype=meta.np_types["v5"]), - "v6": make_bf16(generator, meta.elem_counts["v6"], scale=0.05), - "v7": make_bf16(generator, meta.elem_counts["v7"], scale=0.05), - } - - inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(4, 1) - k_proj = np.zeros_like(buffers["v4"]) - v_proj = np.zeros_like(buffers["v5"]) - - for ob_ci in range(8): - kv0 = (ob * 8 + ob_ci) * 64 - k_acc = np.zeros((4, 64), dtype=np.float32) - v_acc = np.zeros((4, 64), dtype=np.float32) - for kb in range(40): - k0 = kb * 128 - x_chunk = bf16_to_float32( - load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=4, cols=128, row_stride=5120) - ) - gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) - normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma) - wk_chunk = 
bf16_to_float32( - load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024) - ) - wv_chunk = bf16_to_float32( - load_strided_2d(buffers["v7"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024) - ) - k_acc += normed @ wk_chunk - v_acc += normed @ wv_chunk - k_proj = store_strided_2d(k_proj, float32_to_bf16(k_acc), offset=b0 * 1024 + kv0, row_stride=1024) - v_proj = store_strided_2d(v_proj, float32_to_bf16(v_acc), offset=b0 * 1024 + kv0, row_stride=1024) - - write_buffers(meta, buffers) - write_golden(meta, {"v4": k_proj, "v5": v_proj}) - - -if __name__ == "__main__": - main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto new file mode 100644 index 000000000..f8bccc6ac --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto @@ -0,0 +1,45 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_3(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %wq__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, 
%c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf) + %tile_b__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b__tile : !pto.tile_buf) + %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) + %tile_b__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_b__tile : !pto.tile_buf) outs(%tile_b__tile_Right : !pto.tile_buf) + %q_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%tile_a__tile_Left, %tile_b__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%q_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) + %tile_b_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %3 = pto.partition_view %wq__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b_i__tile : !pto.tile_buf) + %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) + %tile_b_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov 
ins(%tile_b_i__tile : !pto.tile_buf) outs(%tile_b_i__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_b_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + } + %q_proj__iter_v6_pview = pto.partition_view %q_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%q_acc__tile : !pto.tile_buf) outs(%q_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto new file mode 100644 index 000000000..9a2756f1c --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto @@ -0,0 +1,46 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_4(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c1024 = arith.constant 1024 : index + %c1 = arith.constant 1 : index + %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %k_proj__iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %normed_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %wk__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, 
%c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf) + %tile_wk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk__tile : !pto.tile_buf) + %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) + %tile_wk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_wk__tile : !pto.tile_buf) outs(%tile_wk__tile_Right : !pto.tile_buf) + %k_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%tile_a__tile_Left, %tile_wk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%k_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) + %tile_wk_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %3 = pto.partition_view %wk__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk_i__tile : !pto.tile_buf) + %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) + %tile_wk_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + 
pto.tmov ins(%tile_wk_i__tile : !pto.tile_buf) outs(%tile_wk_i__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wk_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + } + %k_proj__iter_v6_pview = pto.partition_view %k_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%k_acc__tile : !pto.tile_buf) outs(%k_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto new file mode 100644 index 000000000..db88c9a68 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto @@ -0,0 +1,46 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_5(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wv__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes 
= [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf) + %tile_wv__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv__tile : !pto.tile_buf) + %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) + %tile_wv__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_wv__tile : !pto.tile_buf) outs(%tile_wv__tile_Right : !pto.tile_buf) + %v_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%tile_a__tile_Left, %tile_wv__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%v_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) + %tile_wv_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %3 = pto.partition_view %wv__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv_i__tile : !pto.tile_buf) + %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) + %tile_wv_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf 
+ pto.tmov ins(%tile_wv_i__tile : !pto.tile_buf) outs(%tile_wv_i__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wv_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + } + %v_proj__iter_v6_pview = pto.partition_view %v_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%v_acc__tile : !pto.tile_buf) outs(%v_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto new file mode 100644 index 000000000..4443956bc --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto @@ -0,0 +1,88 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_6(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: index, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c768 = arith.constant 768 : i64 + %c1024 = arith.constant 1024 : i64 + %c1280 = arith.constant 1280 : i64 + %c1536 = arith.constant 1536 : i64 + %c1792 = arith.constant 1792 : i64 + %c2048 = arith.constant 2048 : i64 + %c2176 = arith.constant 2176 : i64 + %c2688 = arith.constant 2688 : i64 + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c524288 = arith.constant 524288 : index + %c128 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %7 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c4096 = arith.constant 4096 : index + %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = 
#pto.layout}: !pto.tensor_view + %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_cache__iter_v1_view = pto.make_tensor_view %arg6, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__rv_v5_view = pto.make_tensor_view %arg7, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) + %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) + %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf + %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) 
outs(%sin_hi__tile : !pto.tile_buf) + %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf + %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) + scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { + %8 = arith.muli %ki__idx_v0, %c128 : index + %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + %k_proj__rv_v5_pview = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %8], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%k_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%k_lo__tile : !pto.tile_buf) + %k_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf + %10 = arith.addi %8, %c64 : index + %9 = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %10], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%9 : !pto.partition_tensor_view<1x64xf32>) outs(%k_hi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf + pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %rot_lo__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%rot_lo__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c1280 : !pto.tile_buf + pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %rot_hi__tile = pto.alloc_tile addr 
= %c1024 : !pto.tile_buf + pto.tadd ins(%1, %2 : !pto.tile_buf, !pto.tile_buf) outs(%rot_hi__tile : !pto.tile_buf) + %11 = arith.muli %arg8, %c8 : index + %12 = arith.muli %11, %c4096 : index + %13 = arith.muli %ki__idx_v0, %c4096 : index + %14 = arith.addi %12, %13 : index + %15 = arith.addi %14, %arg9 : index + %3 = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%rot_lo__tile{rmode = #pto} : !pto.tile_buf) outs(%3 : !pto.tile_buf) + %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%3 : !pto.tile_buf) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x64xbf16>) + %4 = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%rot_hi__tile{rmode = #pto} : !pto.tile_buf) outs(%4 : !pto.tile_buf) + %k_cache__tile_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c64], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%4 : !pto.tile_buf) outs(%k_cache__tile_pview : !pto.partition_tensor_view<1x64xbf16>) + %5 = pto.alloc_tile addr = %c2176 : !pto.tile_buf + %17 = arith.muli %ki__idx_v0, %c128 : index + %v_proj__rv_v5_pview = pto.partition_view %v_proj__rv_v5_view, offsets = [%arg8, %17], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%v_proj__rv_v5_pview : !pto.partition_tensor_view<1x128xf32>) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c2688 : !pto.tile_buf + pto.tcvt ins(%5{rmode = #pto} : !pto.tile_buf) outs(%6 : !pto.tile_buf) + %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> + pto.tstore ins(%6 : !pto.tile_buf) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) + } + return + } +} diff --git 
a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto new file mode 100644 index 000000000..2f80eb162 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto @@ -0,0 +1,92 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_7(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: index, %arg10: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c768 = arith.constant 768 : i64 + %c1024 = arith.constant 1024 : i64 + %c1280 = arith.constant 1280 : i64 + %c1536 = arith.constant 1536 : i64 + %c1792 = arith.constant 1792 : i64 + %c2048 = arith.constant 2048 : i64 + %c2176 = arith.constant 2176 : i64 + %c2304 = arith.constant 2304 : i64 + %c6400 = arith.constant 6400 : i64 + %c6432 = arith.constant 6432 : i64 + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + %c8192 = arith.constant 8192 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_padded__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] 
{layout = #pto.layout}: !pto.tensor_view + %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) + %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) + %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf + %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) + %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf + %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) + scf.for %qi__idx_v0 = %c0 to %c8 step %c1 { + %5 = 
arith.addi %arg10, %qi__idx_v0 : index + %6 = arith.muli %5, %c128 : index + %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + %q_proj__rv_v5_pview = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %6], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%q_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%q_lo__tile : !pto.tile_buf) + %q_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf + %8 = arith.addi %6, %c64 : index + %7 = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %8], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%7 : !pto.partition_tensor_view<1x64xf32>) outs(%q_hi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf + pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %rot_lo_bf16__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%1{rmode = #pto} : !pto.tile_buf) outs(%rot_lo_bf16__tile : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c1280 : !pto.tile_buf + pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + %4 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tadd ins(%2, %3 : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %rot_hi_bf16__tile = pto.alloc_tile addr = %c2176 : !pto.tile_buf + pto.tcvt ins(%4{rmode = #pto} : !pto.tile_buf) outs(%rot_hi_bf16__tile : !pto.tile_buf) + 
%q_padded__iter_v1_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%rot_lo_bf16__tile : !pto.tile_buf) outs(%q_padded__iter_v1_pview : !pto.partition_tensor_view<1x64xbf16>) + %q_padded__tile_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c64], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%rot_hi_bf16__tile : !pto.tile_buf) outs(%q_padded__tile_pview : !pto.partition_tensor_view<1x64xbf16>) + } + %oi__tile = pto.alloc_tile addr = %c2304 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf) + %li_flat__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf + %mi_flat__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tstore ins(%oi__tile : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto 
b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto new file mode 100644 index 000000000..53988ea99 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto @@ -0,0 +1,30 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_8(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 + %c524288 = arith.constant 524288 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout}: !pto.tensor_view + %q_padded__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %q_padded__rv_v2_pview = pto.partition_view %q_padded__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%q_padded__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %k_tile__tile_Right = pto.alloc_tile addr = %c0i 
: !pto.tile_buf + pto.tmov ins(%k_tile__tile : !pto.tile_buf) outs(%k_tile__tile_Right : !pto.tile_buf) + %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%raw_scores_pad__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto new file mode 100644 index 000000000..eb677daf6 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto @@ -0,0 +1,49 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_9(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c2048 = arith.constant 2048 : i64 + %c4096 = arith.constant 4096 : i64 + %c8192 = arith.constant 8192 : i64 + %c8224 = arith.constant 8224 : i64 + %c9248 = arith.constant 9248 : i64 + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 8.838835e-02 : f32 + %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg3, shape = [%c8, 
%c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf + %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf) + pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf + %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf) outs(%scores_padded__tile : !pto.tile_buf) + %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf, f32) outs(%scores__tile : !pto.tile_buf) + %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%cur_mi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.texp ins(%t__tile : !pto.tile_buf) outs(%exp_scores__tile : !pto.tile_buf) + %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf + pto.tcvt ins(%exp_scores__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_bf16__tile : !pto.tile_buf) + %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_fp32__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf + pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf, !pto.tile_buf) 
outs(%cur_li__tile : !pto.tile_buf) + %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> + pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + return + } +} From fa55d253f0ff7899ba4f44eda6512dd8ad6453cf Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 16:51:50 +0800 Subject: [PATCH 09/16] test: restore qwen3 tilelet M16 goldens --- test/samples/Qwen3Tilelet/README.md | 12 +- .../qwen3_decode_layer_incore_0.pto | 23 --- .../qwen3_decode_layer_incore_1.pto | 123 +++++++++-- .../qwen3_decode_layer_incore_10.pto | 118 +++++++++-- .../qwen3_decode_layer_incore_10_golden.py | 69 +++++++ .../qwen3_decode_layer_incore_11.pto | 118 ----------- .../qwen3_decode_layer_incore_12.pto | 31 --- .../qwen3_decode_layer_incore_13.pto | 119 +++++++++-- .../qwen3_decode_layer_incore_13_golden.py | 73 +++++++ .../qwen3_decode_layer_incore_14.pto | 126 +++++------- .../qwen3_decode_layer_incore_14_golden.py | 61 ++++++ .../qwen3_decode_layer_incore_15.pto | 47 ----- .../qwen3_decode_layer_incore_16.pto | 49 ----- .../qwen3_decode_layer_incore_17.pto | 104 ---------- .../qwen3_decode_layer_incore_18.pto | 75 ------- .../qwen3_decode_layer_incore_19.pto | 36 ---- .../qwen3_decode_layer_incore_1_golden.py | 77 +++++++ 
.../qwen3_decode_layer_incore_2.pto | 194 ++++++++++++------ .../qwen3_decode_layer_incore_2_golden.py | 86 ++++++++ .../qwen3_decode_layer_incore_3.pto | 45 ---- .../qwen3_decode_layer_incore_4.pto | 46 ----- .../qwen3_decode_layer_incore_5.pto | 46 ----- .../qwen3_decode_layer_incore_6.pto | 88 -------- .../qwen3_decode_layer_incore_7.pto | 92 --------- .../qwen3_decode_layer_incore_8.pto | 30 --- .../qwen3_decode_layer_incore_9.pto | 49 ----- 26 files changed, 877 insertions(+), 1060 deletions(-) delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto delete mode 100644 
test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto delete mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md index 4f78ed37f..010e75623 100644 --- a/test/samples/Qwen3Tilelet/README.md +++ b/test/samples/Qwen3Tilelet/README.md @@ -1,10 +1,14 @@ Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_tilelet.py`. Scope: -- direct `ptoas` compile-regression inputs +- compile-regression inputs for `ptoas` - A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS` Notes: -- The current tilelet lowering emits 20 kernel fragments (`aiv`, `aic`, and mixed-kernel `.pto` files). This directory vendors those emitted `.pto` inputs directly, flattened into one sample directory for `runop.sh`. -- These files are regenerated from the tilelet example with `BATCH_TILE=16` / M=16 lowering. -- The directory is compile-regression focused; stale custom NPU-validation goldens for the old M=4 split are intentionally dropped here. +- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs: + `qwen3_decode_layer_incore_1`, `qwen3_decode_layer_incore_2`, + `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`, + `qwen3_decode_layer_incore_14`. +- This sample directory vendors only those direct `ptoas` regression inputs, regenerated from the tilelet source with `BATCH_TILE=16`. +- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation. +- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `_golden.py`. 
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto deleted file mode 100644 index 856f60659..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto +++ /dev/null @@ -1,23 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_0(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %cst = arith.constant 0.000000e+00 : f32 - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %attn_out__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %zero_q__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%zero_q__tile : !pto.tile_buf) - %zero_attn__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - pto.tcvt ins(%zero_q__tile{rmode = #pto} : !pto.tile_buf) outs(%zero_attn__tile : !pto.tile_buf) - %q_proj__iter_v1_pview = pto.partition_view %q_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%zero_q__tile : !pto.tile_buf) outs(%q_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) - %attn_out__iter_v1_pview = pto.partition_view %attn_out__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> - pto.tstore ins(%zero_attn__tile : !pto.tile_buf) outs(%attn_out__iter_v1_pview : !pto.partition_tensor_view<16x64xbf16>) - return - } -} diff --git 
a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto index 2d0902b60..591063f0d 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1.pto @@ -1,23 +1,116 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @qwen3_decode_layer_incore_1_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c16384 = arith.constant 16384 : i64 %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 %c16 = arith.constant 16 : index - %c1024 = arith.constant 1024 : index + %c5120 = arith.constant 5120 : index %c1 = arith.constant 1 : index - %cst = arith.constant 0.000000e+00 : f32 - %c64 = arith.constant 64 : index %c0 = arith.constant 0 : index - %k_proj__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %zero_k__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%zero_k__tile : !pto.tile_buf) - %zero_v__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%zero_v__tile : !pto.tile_buf) - %k_proj__iter_v1_pview = pto.partition_view %k_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%zero_k__tile : !pto.tile_buf) outs(%k_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) - %v_proj__iter_v1_pview = pto.partition_view %v_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] 
: !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%zero_v__tile : !pto.tile_buf) outs(%v_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_1_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 + %qwen3_decode_layer_incore_1_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg6, %c4 : index + %1 = arith.addi %0, %ob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c64 : index + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %5 = arith.muli %kb__idx_v0, %c128 : index + %wq_chunk__tile = 
pto.alloc_tile addr = %c16384 : !pto.tile_buf + %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wq_chunk__tile : !pto.tile_buf) + %t__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf + %t__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%t__tile_Left_mat : !pto.tile_buf) outs(%t__tile_Left : !pto.tile_buf) + pto.tfree_from_aiv {split = 0} + %wq_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wq_chunk__tile : !pto.tile_buf) outs(%wq_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%t__tile_Left, %wq_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } + } + return + } + func.func @qwen3_decode_layer_incore_1_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: index, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c16384 = arith.constant 16384 : i64 + %c16448 = arith.constant 16448 : i64 + %c20544 = arith.constant 20544 : i64 + %c24640 = arith.constant 24640 : i64 + %c32832 = arith.constant 32832 : i64 + %c33344 = arith.constant 33344 : i64 + %c41536 = arith.constant 41536 : i64 + %c45632 = arith.constant 45632 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + 
%input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %q_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %wq__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_1_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_1_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_1_aic} -> i32 + %qwen3_decode_layer_incore_1_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_1_c2v_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_1_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_1_v2c_slot_buffer_import : i32) + %inv_rms_tile__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf) + scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { + %5 = arith.muli %arg6, %c4 : index + %6 = arith.addi %5, %ob__ci_idx_v0 : index + %7 = arith.muli %6, %c1 : index + %8 = arith.addi %c0, %7 : index + %9 = arith.muli %8, %c64 : index + %q_acc__tile = pto.alloc_tile addr = %c16448 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf + pto.tmuls ins(%q_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + scf.for 
%kb__idx_v0 = %c0 to %c40 step %c1 { + %10 = arith.muli %kb__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c20544 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg5, %10], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf) + %x_chunk__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%x_chunk__tile : !pto.tile_buf) + %gamma__tile = pto.alloc_tile addr = %c32832 : !pto.tile_buf + %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %10], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c24640 : !pto.tile_buf + pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %normed__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf + pto.tcolexpandmul ins(%1, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c20544 : !pto.tile_buf + pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%2 : !pto.tile_buf) + %t__tile_nz = pto.alloc_tile addr = %c33344 : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%t__tile_nz : !pto.tile_buf) + pto.tpush_to_aic(%t__tile_nz : !pto.tile_buf) {split = 0} + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %3 = pto.alloc_tile addr = %c41536 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %q_acc__tile_mv = pto.alloc_tile addr = %c16448 : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) 
outs(%q_acc__tile_mv : !pto.tile_buf) + } + %4 = pto.alloc_tile addr = %c45632 : !pto.tile_buf + pto.tcvt ins(%0{rmode = #pto} : !pto.tile_buf) outs(%4 : !pto.tile_buf) + %q_proj__co_l1_iter_v3_pview = pto.partition_view %q_proj__co_l0_iter_v3_view, offsets = [%arg5, %9], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%4 : !pto.tile_buf) outs(%q_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x64xbf16>) + } return } } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto index bc49f96e4..9e7c5e51a 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10.pto @@ -1,30 +1,106 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_10(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @qwen3_decode_layer_incore_10_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c16384 = arith.constant 16384 : i64 %c0i = arith.constant 0 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + 
%wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_10_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 + %qwen3_decode_layer_incore_10_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_10_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %0 = arith.muli %arg5, %c8 : index + %1 = arith.addi %0, %ob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c64 : index + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %5 = arith.muli %kb__idx_v0, %c128 : index + %w_chunk__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf) + %a_chunk__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf + %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%a_chunk__tile_Left_mat : !pto.tile_buf) outs(%a_chunk__tile_Left : !pto.tile_buf) + pto.tfree_from_aiv {split = 0} + %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%w_chunk__tile : !pto.tile_buf) outs(%w_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) 
outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } + } + return + } + func.func @qwen3_decode_layer_incore_10_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c16384 = arith.constant 16384 : i64 + %c20480 = arith.constant 20480 : i64 + %c28672 = arith.constant 28672 : i64 + %c32768 = arith.constant 32768 : i64 + %c40960 = arith.constant 40960 : i64 + %c45056 = arith.constant 45056 : i64 + %c36864 = arith.constant 36864 : i64 %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index + %c5120 = arith.constant 5120 : index %c1 = arith.constant 1 : index - %c524288 = arith.constant 524288 : index - %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index - %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> - pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf) - %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf - %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> - pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) - %lhs_mat_Left = pto.alloc_tile addr = %c0i : 
!pto.tile_buf - pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) - %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%v_tile__tile : !pto.tile_buf) outs(%v_tile__tile_Right : !pto.tile_buf) - %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%oi_tmp_pad__tile : !pto.tile_buf) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>) + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %attn_out__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_10_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_10_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_10_aic} -> i32 + %qwen3_decode_layer_incore_10_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_10_c2v_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = 
%qwen3_decode_layer_incore_10_c2v_slot_buffer : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_10_v2c_slot_buffer_import : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %4 = arith.muli %arg5, %c8 : index + %5 = arith.addi %4, %ob__ci_idx_v0 : index + %6 = arith.muli %5, %c1 : index + %7 = arith.addi %c0, %6 : index + %8 = arith.muli %7, %c64 : index + %o_acc__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c16384 : !pto.tile_buf + pto.tmuls ins(%o_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %9 = arith.muli %kb__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf + %attn_out__rv_v2_pview = pto.partition_view %attn_out__rv_v2_view, offsets = [%arg4, %9], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%attn_out__rv_v2_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf) + %a_chunk__tile = pto.alloc_tile addr = %c28672 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%a_chunk__tile : !pto.tile_buf) + %a_chunk__tile_nz = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmov ins(%a_chunk__tile : !pto.tile_buf) outs(%a_chunk__tile_nz : !pto.tile_buf) + pto.tpush_to_aic(%a_chunk__tile_nz : !pto.tile_buf) {split = 0} + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %1 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %o_acc__tile_mv = pto.alloc_tile addr = %c16384 : !pto.tile_buf + pto.tmov ins(%1 : !pto.tile_buf) outs(%o_acc__tile_mv : !pto.tile_buf) + } + %2 = pto.alloc_tile addr = %c45056 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %8], sizes = [%c16, %c64] : !pto.tensor_view -> 
!pto.partition_tensor_view<16x64xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%2 : !pto.tile_buf) + %resid__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.tcvt ins(%2{rmode = #pto} : !pto.tile_buf) outs(%resid__tile : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16384 : !pto.tile_buf + pto.tadd ins(%0, %resid__tile : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + %resid1_tile__co_l1_iter_v1_pview = pto.partition_view %resid1_tile__co_l0_iter_v1_view, offsets = [%c0, %8], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%3 : !pto.tile_buf) outs(%resid1_tile__co_l1_iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) + } return } } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py new file mode 100644 index 000000000..c286795b8 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_10_golden.py @@ -0,0 +1,69 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + rows = 16 + meta = load_case_meta() + generator = rng() + b0, ob = load_int32_assignments()[:2] + + buffers = { + "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.05), + "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.05), + "v3": np.zeros(meta.elem_counts["v3"], dtype=meta.np_types["v3"]), + "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.05), + } + + output = np.zeros_like(buffers["v3"]) + + for ob_ci in range(8): + o0 = (ob * 8 + ob_ci) * 64 + acc = np.zeros((rows, 64), dtype=np.float32) + for kb in range(40): + k0 = kb * 128 + attn_chunk = load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=rows, cols=128, row_stride=5120) + attn_chunk = bf16_to_float32(float32_to_bf16(attn_chunk)) + w_chunk = bf16_to_float32( + load_strided_2d(buffers["v4"], offset=k0 * 5120 + o0, rows=128, cols=64, row_stride=5120) + ) + acc += attn_chunk @ w_chunk + resid = bf16_to_float32( + load_strided_2d(buffers["v2"], offset=b0 * 5120 + o0, rows=rows, cols=64, row_stride=5120) + ) + output = store_strided_2d(output, acc + resid, offset=o0, row_stride=5120) + + write_buffers(meta, buffers) + write_golden(meta, {"v3": output}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto deleted file mode 100644 index 9a8a29a01..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto +++ 
/dev/null @@ -1,118 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_11(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c32 = arith.constant 32 : i64 - %c64 = arith.constant 64 : i64 - %c96 = arith.constant 96 : i64 - %c128 = arith.constant 128 : i64 - %c4224 = arith.constant 4224 : i64 - %c8320 = arith.constant 8320 : i64 - %c12416 = arith.constant 12416 : i64 - %c12448 = arith.constant 12448 : i64 - %c12480 = arith.constant 12480 : i64 - %c12512 = arith.constant 12512 : i64 - %c12544 = arith.constant 12544 : i64 - %c12576 = arith.constant 12576 : i64 - %c12608 = arith.constant 12608 : i64 - %c8 = arith.constant 8 : index - %c1 = arith.constant 1 : index - %7 = arith.constant 128 : index - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view - %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret1__out_view = pto.make_tensor_view %arg7, shape 
= [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view - %li__phi_v5 = pto.alloc_tile addr = %c12416 : !pto.tile_buf - %mi__phi_v5 = pto.alloc_tile addr = %c12448 : !pto.tile_buf - %oi__phi_v5 = pto.alloc_tile addr = %c8320 : !pto.tile_buf - %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf) - %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) - %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf - %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) - %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf - %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf) - %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf - %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tload ins(%oi__iter_v1_pview : 
!pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) - %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf - %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) - %8 = arith.cmpi eq, %arg9, %c0 : index - scf.if %8 { - %oi__ssa_v3 = pto.alloc_tile addr = %c8320 : !pto.tile_buf - %li__ssa_v3 = pto.alloc_tile addr = %c12416 : !pto.tile_buf - %mi__ssa_v3 = pto.alloc_tile addr = %c12448 : !pto.tile_buf - pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) - pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) - pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) - } else { - %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf - %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf - %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c12480 : !pto.tile_buf - pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) - %mi_new__tile = pto.alloc_tile addr = %c12480 : !pto.tile_buf - %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf - %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c12480 : !pto.tile_buf - %t__row_major_tmp_v5 = pto.alloc_tile addr = %c12512 : !pto.tile_buf - pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf - %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c12512 : !pto.tile_buf - %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c12512 : !pto.tile_buf - pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) - %alpha__tile = pto.alloc_tile addr 
= %c12512 : !pto.tile_buf - %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf - %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c12480 : !pto.tile_buf - %t__row_major_tmp_v10 = pto.alloc_tile addr = %c12544 : !pto.tile_buf - pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c12544 : !pto.tile_buf - %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c12544 : !pto.tile_buf - %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c12544 : !pto.tile_buf - pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) - %beta__tile = pto.alloc_tile addr = %c12544 : !pto.tile_buf - %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c12512 : !pto.tile_buf - %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf - %t__row_major_tmp_v15 = pto.alloc_tile addr = %c12576 : !pto.tile_buf - pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c12576 : !pto.tile_buf - %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c12544 : !pto.tile_buf - %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf - %t__row_major_tmp_v18 = pto.alloc_tile addr = %c12608 : !pto.tile_buf - pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c12608 : !pto.tile_buf - %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c12576 : !pto.tile_buf - %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c12608 : !pto.tile_buf - %li__row_major_tmp_v21 = pto.alloc_tile addr = %c12576 : !pto.tile_buf - pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) - %3 = pto.alloc_tile addr = %c12576 : !pto.tile_buf - %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf - pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, 
!pto.tile_buf) outs(%4 : !pto.tile_buf) - %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf - pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) - %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf - pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) - %mi__ssa_v4 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) - pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) - pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) - } - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto deleted file mode 100644 index a9c4f9bee..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto +++ /dev/null @@ -1,31 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_12(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c32 = arith.constant 32 : i64 - %c4128 = arith.constant 4128 : 
i64 - %c1 = arith.constant 1 : index - %c8192 = arith.constant 8192 : index - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) - %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) - %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf, !pto.tile_buf) outs(%ctx__tile : !pto.tile_buf) - %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf - %ctx_flat_bf16__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf - pto.tcvt ins(%ctx_flat__tile{rmode = #pto} : !pto.tile_buf) outs(%ctx_flat_bf16__tile : !pto.tile_buf) - %0 = arith.muli %arg3, %c128 : index - %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view -> !pto.partition_tensor_view<1x1024xbf16> - pto.tstore ins(%ctx_flat_bf16__tile : !pto.tile_buf) outs(%attn_row__iter_v1_pview : 
!pto.partition_tensor_view<1x1024xbf16>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto index 8b38aaf7e..a93fb36df 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13.pto @@ -1,21 +1,116 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_13(%arg0: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @qwen3_decode_layer_incore_13_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c20480 = arith.constant 20480 : i64 %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index + %c64 = arith.constant 64 : index %c1 = arith.constant 1 : index + %c5120 = arith.constant 5120 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c64 = arith.constant 64 : index - %cst = arith.constant 0.000000e+00 : f32 - %resid1_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - scf.for %ob__idx_v0 = %c0 to %c128 step %c1 { - %0 = arith.muli %ob__idx_v0, %c64 : index - %zero_resid1__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%zero_resid1__tile : !pto.tile_buf) - %resid1_tile__iter_v1_pview = pto.partition_view %resid1_tile__ssa_v0_view, offsets = [%c0, %0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%zero_resid1__tile : !pto.tile_buf) outs(%resid1_tile__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index 
+ %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_13_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_13_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf) + %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_gate__ssa_v0_pview : 
!pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf) + %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf + %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg6], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf) + %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%post_chunk__tile : !pto.tile_buf) outs(%post_chunk__tile_Left : !pto.tile_buf) + %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wg__tile : !pto.tile_buf) outs(%wg__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wu__tile : !pto.tile_buf) outs(%wu__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} } return } + func.func @qwen3_decode_layer_incore_13_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c32768 = arith.constant 32768 : i64 + %c36864 = arith.constant 36864 : i64 + %c45056 = arith.constant 45056 : i64 + %c49152 = arith.constant 49152 : i64 + %c40960 = arith.constant 40960 : i64 + %c53248 = arith.constant 53248 : i64 + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c5120 = arith.constant 5120 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = 
arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %cst_1 = arith.constant 1.000000e+00 : f32 + %gate_acc__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %up_acc__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c5120, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg5, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_13_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_13_c2v_slot_buffer", size = 32768, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_13_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf + %gate_acc__ssa_v0_pview = pto.partition_view %gate_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tload ins(%gate_acc__ssa_v0_pview : !pto.partition_tensor_view<16x64xf32>) outs(%gate_acc__tile : !pto.tile_buf) + %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf + %up_acc__ssa_v0_pview = pto.partition_view %up_acc__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tload ins(%up_acc__ssa_v0_pview : 
!pto.partition_tensor_view<16x64xf32>) outs(%up_acc__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmuls ins(%gate_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.tmuls ins(%up_acc__tile, %cst : !pto.tile_buf, f32) outs(%1 : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %2 = pto.alloc_tile addr = %c45056 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %3 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %4 = pto.alloc_tile addr = %c49152 : !pto.tile_buf + pto.tadd ins(%1, %3 : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %gate_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%gate_acc__tile_mv : !pto.tile_buf) + %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.tmov ins(%4 : !pto.tile_buf) outs(%up_acc__tile_mv : !pto.tile_buf) + } + %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tneg ins(%0 : !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.texp ins(%t__tile : !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tadds ins(%5, %cst_1 : !pto.tile_buf, f32) outs(%6 : !pto.tile_buf) + %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf + pto.trecip ins(%6 : !pto.tile_buf) outs(%sigmoid__tile : !pto.tile_buf) + %7 = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmul ins(%0, %sigmoid__tile : !pto.tile_buf, !pto.tile_buf) outs(%7 : !pto.tile_buf) + %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmul ins(%7, %1 : !pto.tile_buf, !pto.tile_buf) outs(%mlp_chunk__tile : !pto.tile_buf) + %mlp_chunk_bf16__tile = pto.alloc_tile 
addr = %c53248 : !pto.tile_buf + pto.tcvt ins(%mlp_chunk__tile{rmode = #pto} : !pto.tile_buf) outs(%mlp_chunk_bf16__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>) + return + } } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py new file mode 100644 index 000000000..61c671b01 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_13_golden.py @@ -0,0 +1,73 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + rows = 16 + meta = load_case_meta() + generator = rng() + o0 = load_int32_assignments()[0] + + buffers = { + "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01), + "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01), + "v3": make_fp32(generator, meta.elem_counts["v3"], scale=0.01), + "v4": make_bf16(generator, meta.elem_counts["v4"], scale=0.01), + "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.01), + "v6": np.zeros(meta.elem_counts["v6"], dtype=meta.np_types["v6"]), + } + + gate_acc = np.zeros((rows, 64), dtype=np.float32) + up_acc = np.zeros((rows, 64), dtype=np.float32) + + for kb in range(40): + k0 = kb * 128 + post_chunk = bf16_to_float32( + load_strided_2d(buffers["v2"], offset=k0, rows=rows, cols=128, row_stride=5120) + ) + w_gate = bf16_to_float32( + load_strided_2d(buffers["v4"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600) + ) + w_up = bf16_to_float32( + load_strided_2d(buffers["v5"], offset=k0 * 25600 + o0, rows=128, cols=64, row_stride=25600) + ) + gate_acc += post_chunk @ w_gate + up_acc += post_chunk @ w_up + + sigmoid = np.reciprocal(1.0 + np.exp(-gate_acc)) + mlp_chunk = gate_acc * sigmoid * up_acc + output = float32_to_bf16(mlp_chunk) + + write_buffers(meta, buffers) + write_golden(meta, {"v6": output}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto index 
0a0172824..5c06dda13 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14.pto @@ -1,90 +1,74 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @qwen3_decode_layer_incore_14_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 + %c16384 = arith.constant 16384 : i64 %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index + %c5120 = arith.constant 5120 : index %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c25600 = arith.constant 25600 : index %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] 
{layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view %qwen3_decode_layer_incore_14_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_14_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { - %0 = arith.muli %arg5, %c8 : index - %1 = arith.addi %0, %ob__ci_idx_v0 : index + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index %2 = arith.muli %1, %c1 : index %3 = arith.addi %c0, %2 : index - %4 = arith.muli %3, %c64 : index - scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { - %5 = arith.muli %kb__idx_v0, %c128 : index - %a_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %attn_out__rv_v5_pview = pto.partition_view %attn_out__rv_v5_view, offsets = [%arg4, %5], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%attn_out__rv_v5_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%a_chunk__tile : !pto.tile_buf) - %w_chunk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %wo__ssa_v0_pview = pto.partition_view %wo__ssa_v0_view, offsets = [%5, %4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wo__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%w_chunk__tile : !pto.tile_buf) - %a_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%a_chunk__tile : !pto.tile_buf) 
outs(%a_chunk__tile_Left : !pto.tile_buf) - %w_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%w_chunk__tile : !pto.tile_buf) outs(%w_chunk__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%a_chunk__tile_Left, %w_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} - } + %4 = arith.muli %3, %c128 : index + %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> + pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf) outs(%w_down_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} } return } - func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c32768 = arith.constant 32768 : i64 - 
%c40960 = arith.constant 40960 : i64 - %c45056 = arith.constant 45056 : i64 - %c36864 = arith.constant 36864 : i64 + func.func @qwen3_decode_layer_incore_14_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c65536 = arith.constant 65536 : i64 %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index + %c5120 = arith.constant 5120 : index %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c25600 = arith.constant 25600 : index %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c64 = arith.constant 64 : index - %cst = arith.constant 0.000000e+00 : f32 - %attn_out__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %resid1_tile__co_l0_iter_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %wo__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 32768, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { - %2 = arith.muli %arg5, %c8 : index - %3 = arith.addi %2, %ob__ci_idx_v0 : index - %4 = arith.muli %3, %c1 : index - %5 = arith.addi %c0, %4 : index - %6 = arith.muli %5, %c64 : index - %o_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf - pto.texpands ins(%cst : 
f32) outs(%o_acc__tile : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %0 = pto.alloc_tile addr = %c40960 : !pto.tile_buf - pto.tadd ins(%o_acc__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %o_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf - pto.tmov ins(%0 : !pto.tile_buf) outs(%o_acc__tile_mv : !pto.tile_buf) - } - %t__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf - %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg4, %6], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> - pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%t__tile : !pto.tile_buf) - %resid__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf - pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%resid__tile : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c32768 : !pto.tile_buf - pto.tadd ins(%o_acc__tile, %resid__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - %resid1_tile__co_l1_iter_v4_pview = pto.partition_view %resid1_tile__co_l0_iter_v4_view, offsets = [%c0, %6], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%1 : !pto.tile_buf) outs(%resid1_tile__co_l1_iter_v4_pview : !pto.partition_tensor_view<16x64xf32>) + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + 
%qwen3_decode_layer_incore_14_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_14_c2v_slot_buffer", size = 65536, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_14_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c128 : index + %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf + %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf) + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf + pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%down_next__tile : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%down_next__tile : !pto.tile_buf) outs(%5 : !pto.partition_tensor_view<16x128xf32>) } return } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py new file mode 100644 index 000000000..9994d6990 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_14_golden.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.01) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def main(): + rows = 16 + meta = load_case_meta() + generator = rng() + dob, o0 = load_int32_assignments()[:2] + + buffers = { + "v1": make_fp32(generator, meta.elem_counts["v1"], scale=0.01), + "v2": make_bf16(generator, meta.elem_counts["v2"], scale=0.01), + "v3": make_bf16(generator, meta.elem_counts["v3"], scale=0.01), + } + + output = np.array(buffers["v1"], copy=True) + mlp_chunk = bf16_to_float32(load_strided_2d(buffers["v2"], offset=0, rows=rows, cols=64, row_stride=64)) + + for dob_ci in range(4): + d0 = (dob * 4 + dob_ci) * 128 + down_prev = load_strided_2d(output, offset=d0, rows=rows, cols=128, row_stride=5120).astype(np.float32) + w_down = bf16_to_float32( + load_strided_2d(buffers["v3"], offset=o0 * 5120 + d0, rows=64, cols=128, row_stride=5120) + ) + output = store_strided_2d(output, down_prev + mlp_chunk @ w_down, offset=d0, row_stride=5120) + + 
write_buffers(meta, buffers) + write_golden(meta, {"v1": output}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto deleted file mode 100644 index a45c9a509..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto +++ /dev/null @@ -1,47 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_15(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c64 = arith.constant 64 : i64 - %c8256 = arith.constant 8256 : i64 - %c16448 = arith.constant 16448 : i64 - %c16512 = arith.constant 16512 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %5 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %cst_1 = arith.constant 1.220703e-04 : f32 - %cst_2 = arith.constant 1.000000e-06 : f32 - %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout}: !pto.tensor_view - %sq_sum__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%sq_sum__tile : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %5 step %c1 { - %6 = arith.muli %kb__idx_v0, %c128 : index - %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf - %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %6], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = 
%c64 : !pto.tile_buf - pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %tmp_tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf - %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf - pto.trowsum ins(%t__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c16448 : !pto.tile_buf - %2 = pto.alloc_tile addr = %c16512 : !pto.tile_buf - pto.tadd ins(%sq_sum__tile, %1 : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) - %sq_sum__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%2 : !pto.tile_buf) outs(%sq_sum__tile_mv : !pto.tile_buf) - } - %3 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmuls ins(%sq_sum__tile, %cst_1 : !pto.tile_buf, f32) outs(%3 : !pto.tile_buf) - %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tadds ins(%3, %cst_2 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) - %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.trsqrt ins(%4 : !pto.tile_buf) outs(%inv_rms__tile : !pto.tile_buf) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> - pto.tstore ins(%inv_rms__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<1x16xf32>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto deleted file mode 100644 index f9fa660d1..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_16(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c64 = arith.constant 64 : i64 - %c8256 = arith.constant 8256 : i64 - %c8768 = arith.constant 8768 : i64 - 
%c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %2 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %cst = arith.constant 0.000000e+00 : f32 - %down_proj_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %inv_rms__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout}: !pto.tensor_view - %post_norm_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %post_rms_weight__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg4, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %inv_rms__ssa_v0_pview = pto.partition_view %inv_rms__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> - pto.tload ins(%inv_rms__ssa_v0_pview : !pto.partition_tensor_view<1x16xf32>) outs(%inv_rms__tile : !pto.tile_buf) - scf.for %zi__idx_v0 = %c0 to %2 step %c1 { - %3 = arith.muli %zi__idx_v0, %c128 : index - %down_zero_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%down_zero_chunk__tile : !pto.tile_buf) - %down_proj_tile__iter_v1_pview = pto.partition_view %down_proj_tile__ssa_v0_view, offsets = [%c0, %3], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tstore ins(%down_zero_chunk__tile : !pto.tile_buf) outs(%down_proj_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) - } - scf.for %kb__idx_v0 = %c0 to %2 step %c1 { - %4 = arith.muli %kb__idx_v0, %c128 : index - 
%x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf - %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf) - %gamma__tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf - %post_rms_weight__ssa_v0_pview = pto.partition_view %post_rms_weight__ssa_v0_view, offsets = [%c0, %4], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> - pto.tload ins(%post_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %0 = pto.alloc_tile addr = %c64 : !pto.tile_buf - pto.trowexpandmul ins(%x_chunk__tile, %t__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - %normed__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf - pto.tcolexpandmul ins(%0, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c8768 : !pto.tile_buf - pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%1 : !pto.tile_buf) - %post_norm_tile__iter_v1_pview = pto.partition_view %post_norm_tile__ssa_v0_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tstore ins(%1 : !pto.tile_buf) outs(%post_norm_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>) - } - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto deleted file mode 100644 index ae6570c56..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto +++ /dev/null @@ -1,104 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_17_aic(%arg0: !pto.ptr, %arg1: 
!pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 - %c20480 = arith.constant 20480 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c25600 = arith.constant 25600 : index - %c64 = arith.constant 64 : index - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_17_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_17_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { - %1 = arith.muli %kb__idx_v0, %c128 : index - %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf) - %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %w_gate__ssa_v0_pview = 
pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf) - %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf - %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf) - %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%post_chunk__tile : !pto.tile_buf) outs(%post_chunk__tile_Left : !pto.tile_buf) - %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%wg__tile : !pto.tile_buf) outs(%wg__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} - %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%wu__tile : !pto.tile_buf) outs(%wu__tile_Right : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} - } - return - } - func.func @qwen3_decode_layer_incore_17_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c32768 = arith.constant 32768 : i64 - %c36864 = arith.constant 36864 : i64 - %c45056 = arith.constant 45056 : i64 - %c49152 = arith.constant 49152 : i64 - %c40960 = arith.constant 40960 : i64 - %c53248 = arith.constant 53248 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : 
index - %c1 = arith.constant 1 : index - %c25600 = arith.constant 25600 : index - %c64 = arith.constant 64 : index - %c0_i32 = arith.constant 0 : i32 - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %cst_1 = arith.constant 1.000000e+00 : f32 - %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_17_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", size = 32768, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) - %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%gate_acc__tile : !pto.tile_buf) - %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%up_acc__tile : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %0 = pto.alloc_tile addr = %c45056 : !pto.tile_buf - pto.tadd ins(%gate_acc__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %1 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %2 = pto.alloc_tile addr = %c49152 : !pto.tile_buf - pto.tadd ins(%up_acc__tile, %1 : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) - pto.tfree_from_aic {split 
= 0} - %gate_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf - pto.tmov ins(%0 : !pto.tile_buf) outs(%gate_acc__tile_mv : !pto.tile_buf) - %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf - pto.tmov ins(%2 : !pto.tile_buf) outs(%up_acc__tile_mv : !pto.tile_buf) - } - %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf - pto.tneg ins(%gate_acc__tile : !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %3 = pto.alloc_tile addr = %c40960 : !pto.tile_buf - pto.texp ins(%t__tile : !pto.tile_buf) outs(%3 : !pto.tile_buf) - %4 = pto.alloc_tile addr = %c40960 : !pto.tile_buf - pto.tadds ins(%3, %cst_1 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) - %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf - pto.trecip ins(%4 : !pto.tile_buf) outs(%sigmoid__tile : !pto.tile_buf) - %5 = pto.alloc_tile addr = %c32768 : !pto.tile_buf - pto.tmul ins(%gate_acc__tile, %sigmoid__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) - %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf - pto.tmul ins(%5, %up_acc__tile : !pto.tile_buf, !pto.tile_buf) outs(%mlp_chunk__tile : !pto.tile_buf) - %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c53248 : !pto.tile_buf - pto.tcvt ins(%mlp_chunk__tile{rmode = #pto} : !pto.tile_buf) outs(%mlp_chunk_bf16__tile : !pto.tile_buf) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> - pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto deleted file mode 100644 index 3228a9f80..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto +++ /dev/null @@ -1,75 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func 
@qwen3_decode_layer_incore_18_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c16384 = arith.constant 16384 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c25600 = arith.constant 25600 : index - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_18_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_18_aiv} -> i32 - pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { - %0 = arith.muli %arg3, %c4 : index - %1 = arith.addi %0, %dob__ci_idx_v0 : index - %2 = arith.muli %1, %c1 : index - %3 = arith.addi %c0, %2 : index - %4 = arith.muli %3, %c128 : index - %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xbf16> - pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf) - %lhs_mat = pto.alloc_tile 
addr = %c16384 : !pto.tile_buf - %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> - pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) - %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) - %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf) outs(%w_down_chunk__tile_Right : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} - } - return - } - func.func @qwen3_decode_layer_incore_18_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c65536 = arith.constant 65536 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c25600 = arith.constant 25600 : index - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %qwen3_decode_layer_incore_18_c2v_slot_buffer = pto.reserve_buffer {name = 
"qwen3_decode_layer_incore_18_c2v_slot_buffer", size = 65536, location = #pto.address_space, auto = false, base = 0} -> i32 - pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) - scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { - %0 = arith.muli %arg3, %c4 : index - %1 = arith.addi %0, %dob__ci_idx_v0 : index - %2 = arith.muli %1, %c1 : index - %3 = arith.addi %c0, %2 : index - %4 = arith.muli %3, %c128 : index - %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf - %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf) - %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf - pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%down_next__tile : !pto.tile_buf) - pto.tfree_from_aic {split = 0} - %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tstore ins(%down_next__tile : !pto.tile_buf) outs(%5 : !pto.partition_tensor_view<16x128xf32>) - } - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto deleted file mode 100644 index 776c7aed2..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto +++ /dev/null @@ -1,36 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_19(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - 
%c0i = arith.constant 0 : i64 - %c8192 = arith.constant 8192 : i64 - %c16384 = arith.constant 16384 : i64 - %c16 = arith.constant 16 : index - %2 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %down_proj_tile__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view - %out__co_l0_iter_v3_view = pto.make_tensor_view %arg1, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view - %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view - scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { - %3 = arith.muli %arg4, %c4 : index - %4 = arith.addi %3, %ob__ci_idx_v0 : index - %5 = arith.muli %4, %c1 : index - %6 = arith.addi %c0, %5 : index - %7 = arith.muli %6, %c128 : index - %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %down_proj_tile__rv_v5_pview = pto.partition_view %down_proj_tile__rv_v5_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tload ins(%down_proj_tile__rv_v5_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf - %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> - pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%0 : !pto.tile_buf) - %down_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tadd ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%down_acc__tile : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c16384 : !pto.tile_buf - pto.tcvt ins(%down_acc__tile{rmode = #pto} : !pto.tile_buf) outs(%1 : !pto.tile_buf) - 
%out__co_l1_iter_v3_pview = pto.partition_view %out__co_l0_iter_v3_view, offsets = [%arg3, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tstore ins(%1 : !pto.tile_buf) outs(%out__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x128xbf16>) - } - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py new file mode 100644 index 000000000..0952f032b --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_1_golden.py @@ -0,0 +1,77 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray: + if positive: + return generator.uniform(0.5, 1.5, size=count).astype(np.float32) + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray: + return bf16_to_float32(float32_to_bf16(values)) + + +def main(): + rows = 16 + meta = load_case_meta() + generator = rng() + b0, ob = load_int32_assignments()[:2] + + buffers = { + "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05), + "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True), + "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True), + "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]), + "v5": make_bf16(generator, meta.elem_counts["v5"], scale=0.05), + } + + inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(rows, 1) + output = np.zeros_like(buffers["v4"]) + + for ob_ci in range(4): + q0 = (ob * 4 + ob_ci) * 64 + acc = np.zeros((rows, 64), dtype=np.float32) + for kb in range(40): + k0 = kb * 128 + x_chunk = bf16_to_float32( + load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=rows, cols=128, row_stride=5120) + ) + gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) + normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * gamma) + w_chunk = bf16_to_float32( + load_strided_2d(buffers["v5"], offset=k0 * 5120 + q0, rows=128, cols=64, row_stride=5120) + ) + acc += normed @ w_chunk + output = store_strided_2d(output, float32_to_bf16(acc), offset=b0 * 
5120 + q0, row_stride=5120) + + write_buffers(meta, buffers) + write_golden(meta, {"v4": output}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto index 9fbf4425d..795b5dee7 100644 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2.pto @@ -1,67 +1,145 @@ module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_2(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @qwen3_decode_layer_incore_2_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c16384 = arith.constant 16384 : i64 + %c32768 = arith.constant 32768 : i64 %c0i = arith.constant 0 : i64 - %c64 = arith.constant 64 : i64 - %c4160 = arith.constant 4160 : i64 - %c12352 = arith.constant 12352 : i64 - %c20544 = arith.constant 20544 : i64 - %c20608 = arith.constant 20608 : i64 - %c20672 = arith.constant 20672 : i64 %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index + %c5120 = arith.constant 5120 : index %c1 = arith.constant 1 : index - %cst = arith.constant 0.000000e+00 : f32 + %c1024 = arith.constant 1024 : index %c0 = arith.constant 0 : index - %10 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %cst_1 = arith.constant 1.220703e-04 : f32 - %cst_2 = arith.constant 1.000000e-06 : f32 - %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %normed_tile__ssa_v0_view = pto.make_tensor_view 
%arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %partial_sq__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%partial_sq__tile : !pto.tile_buf) - scf.for %kb__idx_v0 = %c0 to %10 step %c1 { - %11 = arith.muli %kb__idx_v0, %c128 : index - %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf - %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %11], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf) - %x_chunk__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf - pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%x_chunk__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c4160 : !pto.tile_buf - pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - %tmp_tile = pto.alloc_tile addr = %c12352 : !pto.tile_buf - %1 = pto.alloc_tile addr = %c20544 : !pto.tile_buf - pto.trowsum ins(%0, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c20544 : !pto.tile_buf - %3 = pto.alloc_tile addr = %c20608 : !pto.tile_buf - pto.tadd ins(%partial_sq__tile, %2 : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) - %partial_sq__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%3 : !pto.tile_buf) outs(%partial_sq__tile_mv : !pto.tile_buf) + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + 
%inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_2_v2c_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 + %qwen3_decode_layer_incore_2_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer_import : i32, v2c_consumer_buf = %qwen3_decode_layer_incore_2_v2c_slot_buffer : i32) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %1 = arith.muli %arg8, %c8 : index + %2 = arith.addi %1, %ob__ci_idx_v0 : index + %3 = arith.muli %2, %c1 : index + %4 = arith.addi %c0, %3 : index + %5 = arith.muli %4, %c64 : index + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %6 = arith.muli %kb__idx_v0, %c128 : index + %wk_chunk__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wk_chunk__tile : !pto.tile_buf) + 
%wv_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf + %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%6, %5], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wv_chunk__tile : !pto.tile_buf) + %normed_bf16__tile_Left_mat = pto.tpop_from_aiv {split = 0} -> !pto.tile_buf + %normed_bf16__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%normed_bf16__tile_Left_mat : !pto.tile_buf) outs(%normed_bf16__tile_Left : !pto.tile_buf) + pto.tfree_from_aiv {split = 0} + %wk_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wk_chunk__tile : !pto.tile_buf) outs(%wk_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%normed_bf16__tile_Left, %wk_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + %wv_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wv_chunk__tile : !pto.tile_buf) outs(%wv_chunk__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%normed_bf16__tile_Left, %wv_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} + } + } + return } - %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmuls ins(%partial_sq__tile, %cst_1 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) - %5 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tadds ins(%4, %cst_2 : !pto.tile_buf, f32) outs(%5 : !pto.tile_buf) - %variance__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - scf.for %12 = %c0 to %10 step %c1 { - %13 = arith.muli %12, %c128 : index - %6 = pto.alloc_tile addr = %c64 : !pto.tile_buf - %14 = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg3, %13], sizes = [%c16, %c128] : 
!pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%14 : !pto.partition_tensor_view<16x128xbf16>) outs(%6 : !pto.tile_buf) - %7 = pto.alloc_tile addr = %c4160 : !pto.tile_buf - pto.tcvt ins(%6{rmode = #pto} : !pto.tile_buf) outs(%7 : !pto.tile_buf) - %gamma__tile = pto.alloc_tile addr = %c20672 : !pto.tile_buf - %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> - pto.tload ins(%input_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) - %8 = pto.alloc_tile addr = %c4160 : !pto.tile_buf - pto.trowexpandmul ins(%7, %variance__tile : !pto.tile_buf, !pto.tile_buf) outs(%8 : !pto.tile_buf) - %normed__tile = pto.alloc_tile addr = %c4160 : !pto.tile_buf - pto.tcolexpandmul ins(%8, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) - %9 = pto.alloc_tile addr = %c64 : !pto.tile_buf - pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%9 : !pto.tile_buf) - %normed_tile__iter_v1_pview = pto.partition_view %normed_tile__ssa_v0_view, offsets = [%c0, %13], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tstore ins(%9 : !pto.tile_buf) outs(%normed_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>) + func.func @qwen3_decode_layer_incore_2_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: index, %arg8: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c16384 = arith.constant 16384 : i64 + %c16448 = arith.constant 16448 : i64 + %c20544 = arith.constant 20544 : i64 + %c24640 = arith.constant 24640 : i64 + %c28736 = arith.constant 28736 : i64 + %c36928 = arith.constant 36928 : i64 + %c37440 = arith.constant 37440 : i64 + %c45632 = arith.constant 45632 : i64 + %c49728 = arith.constant 49728 : 
i64 + %c53824 = arith.constant 53824 : i64 + %c16 = arith.constant 16 : index + %c5120 = arith.constant 5120 : index + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %c40 = arith.constant 40 : index + %c128 = arith.constant 128 : index + %hidden_states__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %input_rms_weight__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c5120], strides = [%c5120, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %k_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__co_l0_iter_v3_view = pto.make_tensor_view %arg4, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wk__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wv__ssa_v0_view = pto.make_tensor_view %arg6, shape = [%c5120, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_2_v2c_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_2_v2c_slot_buffer", peer_func = @qwen3_decode_layer_incore_2_aic} -> i32 + %qwen3_decode_layer_incore_2_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_2_c2v_slot_buffer", size = 16384, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_2_c2v_slot_buffer : i32, v2c_consumer_buf = 
%qwen3_decode_layer_incore_2_v2c_slot_buffer_import : i32) + %inv_rms_tile__tile = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %inv_rms_tile__ssa_v0_pview = pto.partition_view %inv_rms_tile__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%inv_rms_tile__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%inv_rms_tile__tile : !pto.tile_buf) + scf.for %ob__ci_idx_v0 = %c0 to %c8 step %c1 { + %8 = arith.muli %arg8, %c8 : index + %9 = arith.addi %8, %ob__ci_idx_v0 : index + %10 = arith.muli %9, %c1 : index + %11 = arith.addi %c0, %10 : index + %12 = arith.muli %11, %c64 : index + %k_acc__tile = pto.alloc_tile addr = %c16448 : !pto.tile_buf + %v_acc__tile = pto.alloc_tile addr = %c20544 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf + pto.tmuls ins(%k_acc__tile, %cst : !pto.tile_buf, f32) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c20544 : !pto.tile_buf + pto.tmuls ins(%v_acc__tile, %cst : !pto.tile_buf, f32) outs(%1 : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c40 step %c1 { + %13 = arith.muli %kb__idx_v0, %c128 : index + %t__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf + %hidden_states__ssa_v0_pview = pto.partition_view %hidden_states__ssa_v0_view, offsets = [%arg7, %13], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%hidden_states__ssa_v0_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%t__tile : !pto.tile_buf) + %x_chunk__tile = pto.alloc_tile addr = %c28736 : !pto.tile_buf + pto.tcvt ins(%t__tile{rmode = #pto} : !pto.tile_buf) outs(%x_chunk__tile : !pto.tile_buf) + %gamma__tile = pto.alloc_tile addr = %c36928 : !pto.tile_buf + %input_rms_weight__ssa_v0_pview = pto.partition_view %input_rms_weight__ssa_v0_view, offsets = [%c0, %13], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%input_rms_weight__ssa_v0_pview : 
!pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c28736 : !pto.tile_buf + pto.trowexpandmul ins(%x_chunk__tile, %inv_rms_tile__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %normed__tile = pto.alloc_tile addr = %c28736 : !pto.tile_buf + pto.tcolexpandmul ins(%2, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) + %normed_bf16__tile = pto.alloc_tile addr = %c24640 : !pto.tile_buf + pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%normed_bf16__tile : !pto.tile_buf) + %normed_bf16__tile_nz = pto.alloc_tile addr = %c37440 : !pto.tile_buf + pto.tmov ins(%normed_bf16__tile : !pto.tile_buf) outs(%normed_bf16__tile_nz : !pto.tile_buf) + pto.tpush_to_aic(%normed_bf16__tile_nz : !pto.tile_buf) {split = 0} + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %3 = pto.alloc_tile addr = %c45632 : !pto.tile_buf + pto.tadd ins(%0, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %4 = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %5 = pto.alloc_tile addr = %c49728 : !pto.tile_buf + pto.tadd ins(%1, %4 : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %k_acc__tile_mv = pto.alloc_tile addr = %c16448 : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%k_acc__tile_mv : !pto.tile_buf) + %v_acc__tile_mv = pto.alloc_tile addr = %c20544 : !pto.tile_buf + pto.tmov ins(%5 : !pto.tile_buf) outs(%v_acc__tile_mv : !pto.tile_buf) + } + %6 = pto.alloc_tile addr = %c53824 : !pto.tile_buf + pto.tcvt ins(%0{rmode = #pto} : !pto.tile_buf) outs(%6 : !pto.tile_buf) + %k_proj__co_l1_iter_v3_pview = pto.partition_view %k_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%6 : !pto.tile_buf) outs(%k_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x64xbf16>) + %7 = 
pto.alloc_tile addr = %c53824 : !pto.tile_buf + pto.tcvt ins(%1{rmode = #pto} : !pto.tile_buf) outs(%7 : !pto.tile_buf) + %v_proj__co_l1_iter_v3_pview = pto.partition_view %v_proj__co_l0_iter_v3_view, offsets = [%arg7, %12], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%7 : !pto.tile_buf) outs(%v_proj__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x64xbf16>) } return } diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py new file mode 100644 index 000000000..a7ffaa1e0 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_2_golden.py @@ -0,0 +1,86 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + bf16_to_float32, + float32_to_bf16, + load_case_meta, + load_int32_assignments, + load_strided_2d, + rng, + store_strided_2d, + write_buffers, + write_golden, +) + + +def make_fp32(generator, count: int, *, scale: float = 0.05, positive: bool = False) -> np.ndarray: + if positive: + return generator.uniform(0.5, 1.5, size=count).astype(np.float32) + return generator.uniform(-scale, scale, size=count).astype(np.float32) + + +def make_bf16(generator, count: int, *, scale: float = 0.05) -> np.ndarray: + return float32_to_bf16(make_fp32(generator, count, scale=scale)) + + +def round_fp32_to_bf16_fp32(values: np.ndarray) -> np.ndarray: + return bf16_to_float32(float32_to_bf16(values)) + + +def main(): + rows = 16 + meta = load_case_meta() + generator = rng() + b0, ob = load_int32_assignments()[:2] + + buffers = { + "v1": make_bf16(generator, meta.elem_counts["v1"], scale=0.05), + "v2": make_fp32(generator, meta.elem_counts["v2"], positive=True), + "v3": make_fp32(generator, meta.elem_counts["v3"], positive=True), + "v4": np.zeros(meta.elem_counts["v4"], dtype=meta.np_types["v4"]), + "v5": np.zeros(meta.elem_counts["v5"], dtype=meta.np_types["v5"]), + "v6": make_bf16(generator, meta.elem_counts["v6"], scale=0.05), + "v7": make_bf16(generator, meta.elem_counts["v7"], scale=0.05), + } + + inv_rms = np.asarray(buffers["v3"], dtype=np.float32).reshape(rows, 1) + k_proj = np.zeros_like(buffers["v4"]) + v_proj = np.zeros_like(buffers["v5"]) + + for ob_ci in range(8): + kv0 = (ob * 8 + ob_ci) * 64 + k_acc = np.zeros((rows, 64), dtype=np.float32) + v_acc = np.zeros((rows, 64), dtype=np.float32) + for kb in range(40): + k0 = kb * 128 + x_chunk = bf16_to_float32( + load_strided_2d(buffers["v1"], offset=b0 * 5120 + k0, rows=rows, cols=128, row_stride=5120) + ) + gamma = load_strided_2d(buffers["v2"], offset=k0, rows=1, cols=128, row_stride=5120).astype(np.float32) + normed = round_fp32_to_bf16_fp32(x_chunk * inv_rms * 
gamma) + wk_chunk = bf16_to_float32( + load_strided_2d(buffers["v6"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024) + ) + wv_chunk = bf16_to_float32( + load_strided_2d(buffers["v7"], offset=k0 * 1024 + kv0, rows=128, cols=64, row_stride=1024) + ) + k_acc += normed @ wk_chunk + v_acc += normed @ wv_chunk + k_proj = store_strided_2d(k_proj, float32_to_bf16(k_acc), offset=b0 * 1024 + kv0, row_stride=1024) + v_proj = store_strided_2d(v_proj, float32_to_bf16(v_acc), offset=b0 * 1024 + kv0, row_stride=1024) + + write_buffers(meta, buffers) + write_golden(meta, {"v4": k_proj, "v5": v_proj}) + + +if __name__ == "__main__": + main() diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto deleted file mode 100644 index f8bccc6ac..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto +++ /dev/null @@ -1,45 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_3(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c64 = arith.constant 64 : index - %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %wq__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = 
[%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf) - %tile_b__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b__tile : !pto.tile_buf) - %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) - %tile_b__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_b__tile : !pto.tile_buf) outs(%tile_b__tile_Right : !pto.tile_buf) - %q_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%tile_a__tile_Left, %tile_b__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%q_acc__tile : !pto.tile_buf) - scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { - %1 = arith.muli %kb__idx_v0, %c128 : index - %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) - %tile_b_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %3 = pto.partition_view %wq__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b_i__tile : !pto.tile_buf) - %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) - %tile_b_i__tile_Right = pto.alloc_tile addr = %c0i : 
!pto.tile_buf - pto.tmov ins(%tile_b_i__tile : !pto.tile_buf) outs(%tile_b_i__tile_Right : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_b_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - } - %q_proj__iter_v6_pview = pto.partition_view %q_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%q_acc__tile : !pto.tile_buf) outs(%q_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto deleted file mode 100644 index 9a2756f1c..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto +++ /dev/null @@ -1,46 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_4(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 - %c16 = arith.constant 16 : index - %c1024 = arith.constant 1024 : index - %c1 = arith.constant 1 : index - %c8192 = arith.constant 8192 : index - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c64 = arith.constant 64 : index - %k_proj__iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %normed_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %wk__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = 
[%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf) - %tile_wk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk__tile : !pto.tile_buf) - %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) - %tile_wk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_wk__tile : !pto.tile_buf) outs(%tile_wk__tile_Right : !pto.tile_buf) - %k_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%tile_a__tile_Left, %tile_wk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%k_acc__tile : !pto.tile_buf) - scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { - %1 = arith.muli %kb__idx_v0, %c128 : index - %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) - %tile_wk_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %3 = pto.partition_view %wk__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk_i__tile : !pto.tile_buf) - %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) - %tile_wk_i__tile_Right = pto.alloc_tile addr = 
%c0i : !pto.tile_buf - pto.tmov ins(%tile_wk_i__tile : !pto.tile_buf) outs(%tile_wk_i__tile_Right : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wk_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - } - %k_proj__iter_v6_pview = pto.partition_view %k_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%k_acc__tile : !pto.tile_buf) outs(%k_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto deleted file mode 100644 index db88c9a68..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto +++ /dev/null @@ -1,46 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_5(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c4096 = arith.constant 4096 : i64 - %c16 = arith.constant 16 : index - %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c64 = arith.constant 64 : index - %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %wv__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view - %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, 
offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf) - %tile_wv__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv__tile : !pto.tile_buf) - %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) - %tile_wv__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_wv__tile : !pto.tile_buf) outs(%tile_wv__tile_Right : !pto.tile_buf) - %v_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%tile_a__tile_Left, %tile_wv__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%v_acc__tile : !pto.tile_buf) - scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { - %1 = arith.muli %kb__idx_v0, %c128 : index - %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) - %tile_wv_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %3 = pto.partition_view %wv__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv_i__tile : !pto.tile_buf) - %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) - %tile_wv_i__tile_Right = pto.alloc_tile 
addr = %c0i : !pto.tile_buf - pto.tmov ins(%tile_wv_i__tile : !pto.tile_buf) outs(%tile_wv_i__tile_Right : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wv_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - } - %v_proj__iter_v6_pview = pto.partition_view %v_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%v_acc__tile : !pto.tile_buf) outs(%v_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto deleted file mode 100644 index 4443956bc..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto +++ /dev/null @@ -1,88 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_6(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: index, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c256 = arith.constant 256 : i64 - %c512 = arith.constant 512 : i64 - %c768 = arith.constant 768 : i64 - %c1024 = arith.constant 1024 : i64 - %c1280 = arith.constant 1280 : i64 - %c1536 = arith.constant 1536 : i64 - %c1792 = arith.constant 1792 : i64 - %c2048 = arith.constant 2048 : i64 - %c2176 = arith.constant 2176 : i64 - %c2688 = arith.constant 2688 : i64 - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c524288 = arith.constant 524288 : index - %c128 = arith.constant 128 : index - %c16 = arith.constant 16 : index - %7 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c4096 = arith.constant 4096 : index - %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], 
strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view - %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_cache__iter_v1_view = pto.make_tensor_view %arg6, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %v_proj__rv_v5_view = pto.make_tensor_view %arg7, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view - %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) - %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf - %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) - %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf - %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%sin_hi__ssa_v0_pview : 
!pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) - %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf - %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) - scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { - %8 = arith.muli %ki__idx_v0, %c128 : index - %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - %k_proj__rv_v5_pview = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %8], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%k_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%k_lo__tile : !pto.tile_buf) - %k_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf - %10 = arith.addi %8, %c64 : index - %9 = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %10], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%9 : !pto.partition_tensor_view<1x64xf32>) outs(%k_hi__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf - pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf - pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - %rot_lo__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf - pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%rot_lo__tile : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c1280 : !pto.tile_buf - pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) 
- %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tadd ins(%1, %2 : !pto.tile_buf, !pto.tile_buf) outs(%rot_hi__tile : !pto.tile_buf) - %11 = arith.muli %arg8, %c8 : index - %12 = arith.muli %11, %c4096 : index - %13 = arith.muli %ki__idx_v0, %c4096 : index - %14 = arith.addi %12, %13 : index - %15 = arith.addi %14, %arg9 : index - %3 = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tcvt ins(%rot_lo__tile{rmode = #pto} : !pto.tile_buf) outs(%3 : !pto.tile_buf) - %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> - pto.tstore ins(%3 : !pto.tile_buf) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x64xbf16>) - %4 = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tcvt ins(%rot_hi__tile{rmode = #pto} : !pto.tile_buf) outs(%4 : !pto.tile_buf) - %k_cache__tile_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c64], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> - pto.tstore ins(%4 : !pto.tile_buf) outs(%k_cache__tile_pview : !pto.partition_tensor_view<1x64xbf16>) - %5 = pto.alloc_tile addr = %c2176 : !pto.tile_buf - %17 = arith.muli %ki__idx_v0, %c128 : index - %v_proj__rv_v5_pview = pto.partition_view %v_proj__rv_v5_view, offsets = [%arg8, %17], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> - pto.tload ins(%v_proj__rv_v5_pview : !pto.partition_tensor_view<1x128xf32>) outs(%5 : !pto.tile_buf) - %6 = pto.alloc_tile addr = %c2688 : !pto.tile_buf - pto.tcvt ins(%5{rmode = #pto} : !pto.tile_buf) outs(%6 : !pto.tile_buf) - %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> - pto.tstore ins(%6 : !pto.tile_buf) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) - } - return - } -} diff --git 
a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto deleted file mode 100644 index 2f80eb162..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto +++ /dev/null @@ -1,92 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_7(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: index, %arg10: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c256 = arith.constant 256 : i64 - %c512 = arith.constant 512 : i64 - %c768 = arith.constant 768 : i64 - %c1024 = arith.constant 1024 : i64 - %c1280 = arith.constant 1280 : i64 - %c1536 = arith.constant 1536 : i64 - %c1792 = arith.constant 1792 : i64 - %c2048 = arith.constant 2048 : i64 - %c2176 = arith.constant 2176 : i64 - %c2304 = arith.constant 2304 : i64 - %c6400 = arith.constant 6400 : i64 - %c6432 = arith.constant 6432 : i64 - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index - %c8192 = arith.constant 8192 : index - %c8 = arith.constant 8 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_padded__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %q_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view - %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] 
{layout = #pto.layout}: !pto.tensor_view - %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) - %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf - %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) - %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf - %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) - %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf - %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) - scf.for %qi__idx_v0 = %c0 to %c8 step %c1 { - %5 = 
arith.addi %arg10, %qi__idx_v0 : index - %6 = arith.muli %5, %c128 : index - %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf - %q_proj__rv_v5_pview = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %6], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%q_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%q_lo__tile : !pto.tile_buf) - %q_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf - %8 = arith.addi %6, %c64 : index - %7 = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %8], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> - pto.tload ins(%7 : !pto.partition_tensor_view<1x64xf32>) outs(%q_hi__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf - pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf - pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) - %1 = pto.alloc_tile addr = %c1536 : !pto.tile_buf - pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) - %rot_lo_bf16__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tcvt ins(%1{rmode = #pto} : !pto.tile_buf) outs(%rot_lo_bf16__tile : !pto.tile_buf) - %2 = pto.alloc_tile addr = %c1280 : !pto.tile_buf - pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) - %3 = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) - %4 = pto.alloc_tile addr = %c1024 : !pto.tile_buf - pto.tadd ins(%2, %3 : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) - %rot_hi_bf16__tile = pto.alloc_tile addr = %c2176 : !pto.tile_buf - pto.tcvt ins(%4{rmode = #pto} : !pto.tile_buf) outs(%rot_hi_bf16__tile : !pto.tile_buf) - 
%q_padded__iter_v1_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> - pto.tstore ins(%rot_lo_bf16__tile : !pto.tile_buf) outs(%q_padded__iter_v1_pview : !pto.partition_tensor_view<1x64xbf16>) - %q_padded__tile_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c64], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> - pto.tstore ins(%rot_hi_bf16__tile : !pto.tile_buf) outs(%q_padded__tile_pview : !pto.partition_tensor_view<1x64xbf16>) - } - %oi__tile = pto.alloc_tile addr = %c2304 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf) - %li_flat__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf) - %li__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf - %mi_flat__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf - pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf) - %mi__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> - pto.tstore ins(%oi__tile : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto 
b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto deleted file mode 100644 index 53988ea99..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto +++ /dev/null @@ -1,30 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_8(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c16384 = arith.constant 16384 : i64 - %c524288 = arith.constant 524288 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout}: !pto.tensor_view - %q_padded__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> - pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf) - %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf - %q_padded__rv_v2_pview = pto.partition_view %q_padded__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> - pto.tload ins(%q_padded__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf) - %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) - %k_tile__tile_Right = pto.alloc_tile addr = 
%c0i : !pto.tile_buf - pto.tmov ins(%k_tile__tile : !pto.tile_buf) outs(%k_tile__tile_Right : !pto.tile_buf) - %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf - pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%raw_scores_pad__tile : !pto.tile_buf) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> - pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>) - return - } -} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto deleted file mode 100644 index eb677daf6..000000000 --- a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5"} { - func.func @qwen3_decode_layer_incore_9(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0i = arith.constant 0 : i64 - %c2048 = arith.constant 2048 : i64 - %c4096 = arith.constant 4096 : i64 - %c8192 = arith.constant 8192 : i64 - %c8224 = arith.constant 8224 : i64 - %c9248 = arith.constant 9248 : i64 - %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 8.838835e-02 : f32 - %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view - %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %ret1__out_view = pto.make_tensor_view %arg3, shape 
= [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view - %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf - %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> - pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf) - pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf - %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf) outs(%scores_padded__tile : !pto.tile_buf) - %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf, f32) outs(%scores__tile : !pto.tile_buf) - %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf - pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%cur_mi__tile : !pto.tile_buf) - %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) - %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.texp ins(%t__tile : !pto.tile_buf) outs(%exp_scores__tile : !pto.tile_buf) - %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf - pto.tcvt ins(%exp_scores__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_bf16__tile : !pto.tile_buf) - %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf - pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_fp32__tile : !pto.tile_buf) - %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf - %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf - pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf, 
!pto.tile_buf) outs(%cur_li__tile : !pto.tile_buf) - %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> - pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) - %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%cur_li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) - %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> - pto.tstore ins(%cur_mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) - return - } -} From c5eff1ff9fb1197b961c34c85c752b843579c422 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Wed, 8 Apr 2026 17:12:49 +0800 Subject: [PATCH 10/16] test: emit mixed wrappers in kernel TU --- .../scripts/generate_testcase.py | 88 +++++++++---------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index f1e3aef77..bf5dedf79 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -226,6 +226,24 @@ def _describe_kernel_source(text: str): } +def _append_mixed_kernel_wrapper( + kernel_text: str, + kernel_name: str, + raw_params: list[str], + aic_name: str, + aiv_name: str, +) -> str: + wrapper_call_args = ", ".join(_extract_cpp_name(param) for param in raw_params) + wrapper = ( + "\n\n" + f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n" + f" {aic_name}({wrapper_call_args});\n" + f" {aiv_name}({wrapper_call_args});\n" + "}\n" + ) + return kernel_text.rstrip() + wrapper + + 
def _is_gm_pointer_param(param: str) -> bool: return "__gm__" in param and "*" in param @@ -1613,6 +1631,15 @@ def generate_testcase( logical_elem_count=logical_elem_count, ) + if is_mixed_kernel: + kernel_text_out = _append_mixed_kernel_wrapper( + kernel_text_out, + kernel_name, + raw_params, + kernel_info["aic_name"], + kernel_info["aiv_name"], + ) + kernel_out = output_dir / f"{testcase}_kernel.cpp" kernel_out.write_text(_replace_includes(kernel_text_out), encoding="utf-8") @@ -1631,51 +1658,22 @@ def generate_testcase( kernel_call_args_host = ", ".join(kernel_call_args_host) raw_params_host = [_rewrite_host_unsupported_types(p) for p in raw_params] launch_block_count = _infer_launch_block_count(raw_kernel_for_analysis, testcase) - if is_mixed_kernel: - wrapper_call_args = ", ".join([p["name"] for p in params]) - launch_cpp = ( - INCLUDE_REPLACEMENT - + "\n" - "#if defined(__CCE_AICORE__)\n" - f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params)});\n" - f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params)});\n" - f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n" - f" {kernel_info['aic_name']}({wrapper_call_args});\n" - f" {kernel_info['aiv_name']}({wrapper_call_args});\n" - "}\n" - "#else\n" - f"AICORE void {kernel_info['aic_name']}({', '.join(raw_params_host)});\n" - f"AICORE void {kernel_info['aiv_name']}({', '.join(raw_params_host)});\n" - f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)}) {{\n" - f" {kernel_info['aic_name']}({wrapper_call_args});\n" - f" {kernel_info['aiv_name']}({wrapper_call_args});\n" - "}\n" - "#endif\n\n" - f"void {launch_name}({launch_fn_params}) {{\n" - "#if defined(__CCE_AICORE__)\n" - f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n" - "#else\n" - f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n" - "#endif\n" - f"}}\n" - ) - else: - launch_cpp = ( - INCLUDE_REPLACEMENT - + "\n" - "#if 
defined(__CCE_AICORE__)\n" - f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n" - "#else\n" - f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n" - "#endif\n\n" - f"void {launch_name}({launch_fn_params}) {{\n" - "#if defined(__CCE_AICORE__)\n" - f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n" - "#else\n" - f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n" - "#endif\n" - f"}}\n" - ) + launch_cpp = ( + INCLUDE_REPLACEMENT + + "\n" + "#if defined(__CCE_AICORE__)\n" + f"__global__ AICORE void {kernel_name}({', '.join(raw_params)});\n" + "#else\n" + f"__global__ AICORE void {kernel_name}({', '.join(raw_params_host)});\n" + "#endif\n\n" + f"void {launch_name}({launch_fn_params}) {{\n" + "#if defined(__CCE_AICORE__)\n" + f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_device});\n" + "#else\n" + f" {kernel_name}<<<{launch_block_count}, nullptr, stream>>>({kernel_call_args_host});\n" + "#endif\n" + f"}}\n" + ) (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8") # pto-isa selects instruction implementations based on MEMORY_BASE vs From 941c760bb1d7c5e8afd88669a3f48ff85321f31b Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 9 Apr 2026 09:23:04 +0800 Subject: [PATCH 11/16] test: restore full qwen3 tilelet PTO set --- test/samples/Qwen3Tilelet/README.md | 10 +- .../qwen3_decode_layer_incore_0.pto | 23 ++++ .../qwen3_decode_layer_incore_11.pto | 118 ++++++++++++++++++ .../qwen3_decode_layer_incore_12.pto | 31 +++++ .../qwen3_decode_layer_incore_15.pto | 47 +++++++ .../qwen3_decode_layer_incore_16.pto | 49 ++++++++ .../qwen3_decode_layer_incore_17.pto | 104 +++++++++++++++ .../qwen3_decode_layer_incore_18.pto | 75 +++++++++++ .../qwen3_decode_layer_incore_19.pto | 36 ++++++ .../qwen3_decode_layer_incore_3.pto | 45 +++++++ .../qwen3_decode_layer_incore_4.pto | 46 +++++++ 
.../qwen3_decode_layer_incore_5.pto | 46 +++++++ .../qwen3_decode_layer_incore_6.pto | 88 +++++++++++++ .../qwen3_decode_layer_incore_7.pto | 92 ++++++++++++++ .../qwen3_decode_layer_incore_8.pto | 30 +++++ .../qwen3_decode_layer_incore_9.pto | 49 ++++++++ 16 files changed, 882 insertions(+), 7 deletions(-) create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto create mode 100644 test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md index 010e75623..b713c4e28 100644 --- a/test/samples/Qwen3Tilelet/README.md +++ b/test/samples/Qwen3Tilelet/README.md @@ -5,10 +5,6 @@ Scope: - A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS` Notes: -- The source PyPTO program lowers to a full orchestration file plus 5 ptoas-facing mixed-kernel `.pto` inputs: - `qwen3_decode_layer_incore_1`, 
`qwen3_decode_layer_incore_2`, - `qwen3_decode_layer_incore_10`, `qwen3_decode_layer_incore_13`, - `qwen3_decode_layer_incore_14`. -- This sample directory vendors only those direct `ptoas` regression inputs, regenerated from the tilelet source with `BATCH_TILE=16`. -- `test/npu_validation/scripts/generate_testcase.py` now wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation. -- Custom golden assets follow the normal sample convention and live beside the `.pto` files as `_golden.py`. +- The source PyPTO program lowers to 20 `qwen3_decode_layer_incore_*.pto` fragments; this directory vendors the full emitted `.pto` set regenerated from the tilelet source with `BATCH_TILE=16`. +- `test/npu_validation/scripts/generate_testcase.py` wraps the paired `_aic`/`_aiv` entrypoints into a standalone mixed-kernel launch wrapper for board validation when the lowered fragment contains split cube/vector entrypoints. +- Custom golden assets currently exist only for the board-validation cases that need them and live beside the `.pto` files as `_golden.py`. 
diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto new file mode 100644 index 000000000..856f60659 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_0.pto @@ -0,0 +1,23 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_0(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %attn_out__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_proj__iter_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %zero_q__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%zero_q__tile : !pto.tile_buf) + %zero_attn__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + pto.tcvt ins(%zero_q__tile{rmode = #pto} : !pto.tile_buf) outs(%zero_attn__tile : !pto.tile_buf) + %q_proj__iter_v1_pview = pto.partition_view %q_proj__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%zero_q__tile : !pto.tile_buf) outs(%q_proj__iter_v1_pview : !pto.partition_tensor_view<16x64xf32>) + %attn_out__iter_v1_pview = pto.partition_view %attn_out__iter_v1_view, offsets = [%c0, %arg2], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%zero_attn__tile : !pto.tile_buf) outs(%attn_out__iter_v1_pview : !pto.partition_tensor_view<16x64xbf16>) + return + } +} diff --git 
a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto new file mode 100644 index 000000000..9a8a29a01 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_11.pto @@ -0,0 +1,118 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_11(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c32 = arith.constant 32 : i64 + %c64 = arith.constant 64 : i64 + %c96 = arith.constant 96 : i64 + %c128 = arith.constant 128 : i64 + %c4224 = arith.constant 4224 : i64 + %c8320 = arith.constant 8320 : i64 + %c12416 = arith.constant 12416 : i64 + %c12448 = arith.constant 12448 : i64 + %c12480 = arith.constant 12480 : i64 + %c12512 = arith.constant 12512 : i64 + %c12544 = arith.constant 12544 : i64 + %c12576 = arith.constant 12576 : i64 + %c12608 = arith.constant 12608 : i64 + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], 
strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c12416 : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c12448 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = 
%c128 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf + %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %arg9, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8320 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c12416 : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c12448 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c12480 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c12480 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c12480 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c12512 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c12512 : 
!pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c12512 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c12512 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c12480 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c12512 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c12544 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c12608 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c12608 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c12608 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, 
!pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c12576 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto new file mode 100644 index 000000000..a9c4f9bee --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_12.pto @@ -0,0 +1,31 @@ +module attributes {pto.target_arch = "a5"} { + func.func 
@qwen3_decode_layer_incore_12(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c32 = arith.constant 32 : i64 + %c4128 = arith.constant 4128 : i64 + %c1 = arith.constant 1 : index + %c8192 = arith.constant 8192 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf) + %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf, !pto.tile_buf) outs(%ctx__tile : !pto.tile_buf) + %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf + %ctx_flat_bf16__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf + pto.tcvt ins(%ctx_flat__tile{rmode = #pto} : !pto.tile_buf) outs(%ctx_flat_bf16__tile : !pto.tile_buf) + %0 = arith.muli %arg3, %c128 : index + %attn_row__iter_v1_pview = 
pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view -> !pto.partition_tensor_view<1x1024xbf16> + pto.tstore ins(%ctx_flat_bf16__tile : !pto.tile_buf) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xbf16>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto new file mode 100644 index 000000000..a45c9a509 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_15.pto @@ -0,0 +1,47 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_15(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8256 = arith.constant 8256 : i64 + %c16448 = arith.constant 16448 : i64 + %c16512 = arith.constant 16512 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %5 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %cst_1 = arith.constant 1.220703e-04 : f32 + %cst_2 = arith.constant 1.000000e-06 : f32 + %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout}: !pto.tensor_view + %sq_sum__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%sq_sum__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %5 step %c1 { + %6 = arith.muli %kb__idx_v0, %c128 : index + %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %6], sizes = [%c16, %c128] : !pto.tensor_view -> 
!pto.partition_tensor_view<16x128xf32> + pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmul ins(%x_chunk__tile, %x_chunk__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %tmp_tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf + %0 = pto.alloc_tile addr = %c16448 : !pto.tile_buf + pto.trowsum ins(%t__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16448 : !pto.tile_buf + %2 = pto.alloc_tile addr = %c16512 : !pto.tile_buf + pto.tadd ins(%sq_sum__tile, %1 : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %sq_sum__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%sq_sum__tile_mv : !pto.tile_buf) + } + %3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmuls ins(%sq_sum__tile, %cst_1 : !pto.tile_buf, f32) outs(%3 : !pto.tile_buf) + %4 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tadds ins(%3, %cst_2 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) + %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.trsqrt ins(%4 : !pto.tile_buf) outs(%inv_rms__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + pto.tstore ins(%inv_rms__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<1x16xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto new file mode 100644 index 000000000..f9fa660d1 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_16.pto @@ -0,0 +1,49 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_16(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: 
!pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8256 = arith.constant 8256 : i64 + %c8768 = arith.constant 8768 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %2 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %cst = arith.constant 0.000000e+00 : f32 + %down_proj_tile__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_norm_tile__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %post_rms_weight__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg4, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %inv_rms__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %inv_rms__ssa_v0_pview = pto.partition_view %inv_rms__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + pto.tload ins(%inv_rms__ssa_v0_pview : !pto.partition_tensor_view<1x16xf32>) outs(%inv_rms__tile : !pto.tile_buf) + scf.for %zi__idx_v0 = %c0 to %2 step %c1 { + %3 = arith.muli %zi__idx_v0, %c128 : index + %down_zero_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%down_zero_chunk__tile : !pto.tile_buf) + %down_proj_tile__iter_v1_pview = pto.partition_view %down_proj_tile__ssa_v0_view, offsets = [%c0, %3], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore 
ins(%down_zero_chunk__tile : !pto.tile_buf) outs(%down_proj_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) + } + scf.for %kb__idx_v0 = %c0 to %2 step %c1 { + %4 = arith.muli %kb__idx_v0, %c128 : index + %x_chunk__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%x_chunk__tile : !pto.tile_buf) + %gamma__tile = pto.alloc_tile addr = %c8256 : !pto.tile_buf + %post_rms_weight__ssa_v0_pview = pto.partition_view %post_rms_weight__ssa_v0_view, offsets = [%c0, %4], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%post_rms_weight__ssa_v0_pview : !pto.partition_tensor_view<1x128xf32>) outs(%gamma__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %0 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.trowexpandmul ins(%x_chunk__tile, %t__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %normed__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tcolexpandmul ins(%0, %gamma__tile : !pto.tile_buf, !pto.tile_buf) outs(%normed__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c8768 : !pto.tile_buf + pto.tcvt ins(%normed__tile{rmode = #pto} : !pto.tile_buf) outs(%1 : !pto.tile_buf) + %post_norm_tile__iter_v1_pview = pto.partition_view %post_norm_tile__ssa_v0_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tstore ins(%1 : !pto.tile_buf) outs(%post_norm_tile__iter_v1_pview : !pto.partition_tensor_view<16x128xbf16>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto new file mode 100644 index 000000000..ae6570c56 --- 
/dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_17.pto @@ -0,0 +1,104 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_17_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c20480 = arith.constant 20480 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c25600 = arith.constant 25600 : index + %c64 = arith.constant 64 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_17_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_17_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %post_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %post_norm_tile__rv_v2_pview = pto.partition_view %post_norm_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + 
pto.tload ins(%post_norm_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%post_chunk__tile : !pto.tile_buf) + %wg__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %w_gate__ssa_v0_pview = pto.partition_view %w_gate__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_gate__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wg__tile : !pto.tile_buf) + %wu__tile = pto.alloc_tile addr = %c20480 : !pto.tile_buf + %w_up__ssa_v0_pview = pto.partition_view %w_up__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%w_up__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%wu__tile : !pto.tile_buf) + %post_chunk__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%post_chunk__tile : !pto.tile_buf) outs(%post_chunk__tile_Left : !pto.tile_buf) + %wg__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wg__tile : !pto.tile_buf) outs(%wg__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wg__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + %wu__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%wu__tile : !pto.tile_buf) outs(%wu__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%post_chunk__tile_Left, %wu__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tpush_to_aiv(%0 : !pto.tile_buf) {split = 0} + } + return + } + func.func @qwen3_decode_layer_incore_17_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c32768 = arith.constant 32768 : i64 + %c36864 = arith.constant 36864 : i64 + %c45056 = 
arith.constant 45056 : i64 + %c49152 = arith.constant 49152 : i64 + %c40960 = arith.constant 40960 : i64 + %c53248 = arith.constant 53248 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c25600 = arith.constant 25600 : index + %c64 = arith.constant 64 : index + %c0_i32 = arith.constant 0 : i32 + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %cst_1 = arith.constant 1.000000e+00 : f32 + %post_norm_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_gate__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_up__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c25600], strides = [%c25600, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg3, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_17_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_17_c2v_slot_buffer", size = 32768, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 4096} (c2v_consumer_buf = %qwen3_decode_layer_incore_17_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + %gate_acc__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%gate_acc__tile : !pto.tile_buf) + %up_acc__tile = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%up_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c0 to %c64 step %c1 { + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %0 = pto.alloc_tile addr = %c45056 : !pto.tile_buf + pto.tadd ins(%gate_acc__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %1 = 
pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %2 = pto.alloc_tile addr = %c49152 : !pto.tile_buf + pto.tadd ins(%up_acc__tile, %1 : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %gate_acc__tile_mv = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmov ins(%0 : !pto.tile_buf) outs(%gate_acc__tile_mv : !pto.tile_buf) + %up_acc__tile_mv = pto.alloc_tile addr = %c36864 : !pto.tile_buf + pto.tmov ins(%2 : !pto.tile_buf) outs(%up_acc__tile_mv : !pto.tile_buf) + } + %t__tile = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tneg ins(%gate_acc__tile : !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.texp ins(%t__tile : !pto.tile_buf) outs(%3 : !pto.tile_buf) + %4 = pto.alloc_tile addr = %c40960 : !pto.tile_buf + pto.tadds ins(%3, %cst_1 : !pto.tile_buf, f32) outs(%4 : !pto.tile_buf) + %sigmoid__tile = pto.alloc_tile addr = %c45056 : !pto.tile_buf + pto.trecip ins(%4 : !pto.tile_buf) outs(%sigmoid__tile : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmul ins(%gate_acc__tile, %sigmoid__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %mlp_chunk__tile = pto.alloc_tile addr = %c32768 : !pto.tile_buf + pto.tmul ins(%5, %up_acc__tile : !pto.tile_buf, !pto.tile_buf) outs(%mlp_chunk__tile : !pto.tile_buf) + %mlp_chunk_bf16__tile = pto.alloc_tile addr = %c53248 : !pto.tile_buf + pto.tcvt ins(%mlp_chunk__tile{rmode = #pto} : !pto.tile_buf) outs(%mlp_chunk_bf16__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tstore ins(%mlp_chunk_bf16__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xbf16>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto new file mode 100644 index 
000000000..3228a9f80 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_18.pto @@ -0,0 +1,75 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_18_aic(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_18_c2v_slot_buffer_import = pto.import_reserved_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", peer_func = @qwen3_decode_layer_incore_18_aiv} -> i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer_import : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c128 : index + %w_down_chunk__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %w_down__ssa_v0_pview = pto.partition_view %w_down__ssa_v0_view, offsets = [%arg4, %4], sizes = [%c64, %c128] : !pto.tensor_view -> 
!pto.partition_tensor_view<64x128xbf16> + pto.tload ins(%w_down__ssa_v0_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%w_down_chunk__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %mlp_chunk_bf16__ssa_v0_pview = pto.partition_view %mlp_chunk_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xbf16> + pto.tload ins(%mlp_chunk_bf16__ssa_v0_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %w_down_chunk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%w_down_chunk__tile : !pto.tile_buf) outs(%w_down_chunk__tile_Right : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %w_down_chunk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + pto.tpush_to_aiv(%t__tile : !pto.tile_buf) {split = 0} + } + return + } + func.func @qwen3_decode_layer_incore_18_aiv(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c65536 = arith.constant 65536 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c25600 = arith.constant 25600 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__co_l0_iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %mlp_chunk_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %w_down__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c25600, 
%c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %qwen3_decode_layer_incore_18_c2v_slot_buffer = pto.reserve_buffer {name = "qwen3_decode_layer_incore_18_c2v_slot_buffer", size = 65536, location = #pto.address_space, auto = false, base = 0} -> i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 8192} (c2v_consumer_buf = %qwen3_decode_layer_incore_18_c2v_slot_buffer : i32, v2c_consumer_buf = %c0_i32 : i32) + scf.for %dob__ci_idx_v0 = %c0 to %c4 step %c1 { + %0 = arith.muli %arg3, %c4 : index + %1 = arith.addi %0, %dob__ci_idx_v0 : index + %2 = arith.muli %1, %c1 : index + %3 = arith.addi %c0, %2 : index + %4 = arith.muli %3, %c128 : index + %down_prev__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf + %down_proj_tile__co_l1_iter_v6_pview = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%down_proj_tile__co_l1_iter_v6_pview : !pto.partition_tensor_view<16x128xf32>) outs(%down_prev__tile : !pto.tile_buf) + %t__tile_Vec = pto.tpop_from_aic {split = 0} -> !pto.tile_buf + %down_next__tile = pto.alloc_tile addr = %c65536 : !pto.tile_buf + pto.tadd ins(%down_prev__tile, %t__tile_Vec : !pto.tile_buf, !pto.tile_buf) outs(%down_next__tile : !pto.tile_buf) + pto.tfree_from_aic {split = 0} + %5 = pto.partition_view %down_proj_tile__co_l0_iter_v6_view, offsets = [%c0, %4], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%down_next__tile : !pto.tile_buf) outs(%5 : !pto.partition_tensor_view<16x128xf32>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto new file mode 100644 index 000000000..776c7aed2 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_19.pto @@ -0,0 +1,36 @@ +module attributes {pto.target_arch = "a5"} { + func.func 
@qwen3_decode_layer_incore_19(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c8192 = arith.constant 8192 : i64 + %c16384 = arith.constant 16384 : i64 + %c16 = arith.constant 16 : index + %2 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %down_proj_tile__rv_v5_view = pto.make_tensor_view %arg0, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view + %out__co_l0_iter_v3_view = pto.make_tensor_view %arg1, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view + %resid1_tile__co_l0_rv_v4_view = pto.make_tensor_view %arg2, shape = [%c16, %2], strides = [%2, %c1] {layout = #pto.layout}: !pto.tensor_view + scf.for %ob__ci_idx_v0 = %c0 to %c4 step %c1 { + %3 = arith.muli %arg4, %c4 : index + %4 = arith.addi %3, %ob__ci_idx_v0 : index + %5 = arith.muli %4, %c1 : index + %6 = arith.addi %c0, %5 : index + %7 = arith.muli %6, %c128 : index + %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %down_proj_tile__rv_v5_pview = pto.partition_view %down_proj_tile__rv_v5_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%down_proj_tile__rv_v5_pview : !pto.partition_tensor_view<16x128xf32>) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c8192 : !pto.tile_buf + %resid1_tile__co_l0_rv_v4_pview = pto.partition_view %resid1_tile__co_l0_rv_v4_view, offsets = [%c0, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%resid1_tile__co_l0_rv_v4_pview : !pto.partition_tensor_view<16x128xf32>) outs(%0 : !pto.tile_buf) + %down_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tadd ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%down_acc__tile : 
!pto.tile_buf) + %1 = pto.alloc_tile addr = %c16384 : !pto.tile_buf + pto.tcvt ins(%down_acc__tile{rmode = #pto} : !pto.tile_buf) outs(%1 : !pto.tile_buf) + %out__co_l1_iter_v3_pview = pto.partition_view %out__co_l0_iter_v3_view, offsets = [%arg3, %7], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tstore ins(%1 : !pto.tile_buf) outs(%out__co_l1_iter_v3_pview : !pto.partition_tensor_view<16x128xbf16>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto new file mode 100644 index 000000000..f8bccc6ac --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_3.pto @@ -0,0 +1,45 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_3(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %wq__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) 
outs(%tile_a__tile : !pto.tile_buf) + %tile_b__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wq__ssa_v0_pview = pto.partition_view %wq__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wq__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b__tile : !pto.tile_buf) + %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) + %tile_b__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_b__tile : !pto.tile_buf) outs(%tile_b__tile_Right : !pto.tile_buf) + %q_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%tile_a__tile_Left, %tile_b__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%q_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) + %tile_b_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %3 = pto.partition_view %wq__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_b_i__tile : !pto.tile_buf) + %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) + %tile_b_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_b_i__tile : !pto.tile_buf) outs(%tile_b_i__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul.acc ins(%0, 
%tile_a_i__tile_Left, %tile_b_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + } + %q_proj__iter_v6_pview = pto.partition_view %q_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%q_acc__tile : !pto.tile_buf) outs(%q_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto new file mode 100644 index 000000000..9a2756f1c --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_4.pto @@ -0,0 +1,46 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_4(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c1024 = arith.constant 1024 : index + %c1 = arith.constant 1 : index + %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %k_proj__iter_v6_view = pto.make_tensor_view %arg0, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %normed_tile__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %wk__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%normed_tile__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) 
outs(%tile_a__tile : !pto.tile_buf) + %tile_wk__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wk__ssa_v0_pview = pto.partition_view %wk__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wk__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk__tile : !pto.tile_buf) + %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) + %tile_wk__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_wk__tile : !pto.tile_buf) outs(%tile_wk__tile_Right : !pto.tile_buf) + %k_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%tile_a__tile_Left, %tile_wk__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%k_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) + %tile_wk_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %3 = pto.partition_view %wk__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wk_i__tile : !pto.tile_buf) + %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) + %tile_wk_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_wk_i__tile : !pto.tile_buf) outs(%tile_wk_i__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul.acc 
ins(%0, %tile_a_i__tile_Left, %tile_wk_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + } + %k_proj__iter_v6_pview = pto.partition_view %k_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%k_acc__tile : !pto.tile_buf) outs(%k_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto new file mode 100644 index 000000000..db88c9a68 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_5.pto @@ -0,0 +1,46 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_5(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c4096 = arith.constant 4096 : i64 + %c16 = arith.constant 16 : index + %c8192 = arith.constant 8192 : index + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %normed_tile__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__iter_v6_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %wv__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c8192, %c1024], strides = [%c1024, %c1] {layout = #pto.layout}: !pto.tensor_view + %tile_a__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %normed_tile__rv_v2_pview = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%normed_tile__rv_v2_pview : 
!pto.partition_tensor_view<16x128xbf16>) outs(%tile_a__tile : !pto.tile_buf) + %tile_wv__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %wv__ssa_v0_pview = pto.partition_view %wv__ssa_v0_view, offsets = [%c0, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%wv__ssa_v0_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv__tile : !pto.tile_buf) + %tile_a__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a__tile : !pto.tile_buf) outs(%tile_a__tile_Left : !pto.tile_buf) + %tile_wv__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_wv__tile : !pto.tile_buf) outs(%tile_wv__tile_Right : !pto.tile_buf) + %v_acc__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%tile_a__tile_Left, %tile_wv__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%v_acc__tile : !pto.tile_buf) + scf.for %kb__idx_v0 = %c1 to %c64 step %c1 { + %1 = arith.muli %kb__idx_v0, %c128 : index + %tile_a_i__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %2 = pto.partition_view %normed_tile__rv_v2_view, offsets = [%c0, %1], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%2 : !pto.partition_tensor_view<16x128xbf16>) outs(%tile_a_i__tile : !pto.tile_buf) + %tile_wv_i__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %3 = pto.partition_view %wv__ssa_v0_view, offsets = [%1, %arg4], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%3 : !pto.partition_tensor_view<128x64xbf16>) outs(%tile_wv_i__tile : !pto.tile_buf) + %tile_a_i__tile_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_a_i__tile : !pto.tile_buf) outs(%tile_a_i__tile_Left : !pto.tile_buf) + %tile_wv_i__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%tile_wv_i__tile : !pto.tile_buf) outs(%tile_wv_i__tile_Right : !pto.tile_buf) + %0 = pto.alloc_tile addr 
= %c0i : !pto.tile_buf + pto.tmatmul.acc ins(%0, %tile_a_i__tile_Left, %tile_wv_i__tile_Right : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + } + %v_proj__iter_v6_pview = pto.partition_view %v_proj__iter_v6_view, offsets = [%arg3, %arg4], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%v_acc__tile : !pto.tile_buf) outs(%v_proj__iter_v6_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto new file mode 100644 index 000000000..4443956bc --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_6.pto @@ -0,0 +1,88 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_6(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: index, %arg9: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c768 = arith.constant 768 : i64 + %c1024 = arith.constant 1024 : i64 + %c1280 = arith.constant 1280 : i64 + %c1536 = arith.constant 1536 : i64 + %c1792 = arith.constant 1792 : i64 + %c2048 = arith.constant 2048 : i64 + %c2176 = arith.constant 2176 : i64 + %c2688 = arith.constant 2688 : i64 + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c524288 = arith.constant 524288 : index + %c128 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %7 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c4096 = arith.constant 4096 : index + %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, 
%c1] {layout = #pto.layout}: !pto.tensor_view + %k_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_cache__iter_v1_view = pto.make_tensor_view %arg6, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %v_proj__rv_v5_view = pto.make_tensor_view %arg7, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) + %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) + %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf + %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) + %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf + %sin_lo__ssa_v0_pview = 
pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) + scf.for %ki__idx_v0 = %c0 to %c8 step %c1 { + %8 = arith.muli %ki__idx_v0, %c128 : index + %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + %k_proj__rv_v5_pview = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %8], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%k_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%k_lo__tile : !pto.tile_buf) + %k_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf + %10 = arith.addi %8, %c64 : index + %9 = pto.partition_view %k_proj__rv_v5_view, offsets = [%arg8, %10], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%9 : !pto.partition_tensor_view<1x64xf32>) outs(%k_hi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf + pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %rot_lo__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%rot_lo__tile : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c1280 : !pto.tile_buf + pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tadd ins(%1, %2 : !pto.tile_buf, !pto.tile_buf) outs(%rot_hi__tile : !pto.tile_buf) + %11 = 
arith.muli %arg8, %c8 : index + %12 = arith.muli %11, %c4096 : index + %13 = arith.muli %ki__idx_v0, %c4096 : index + %14 = arith.addi %12, %13 : index + %15 = arith.addi %14, %arg9 : index + %3 = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%rot_lo__tile{rmode = #pto} : !pto.tile_buf) outs(%3 : !pto.tile_buf) + %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%3 : !pto.tile_buf) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x64xbf16>) + %4 = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%rot_hi__tile{rmode = #pto} : !pto.tile_buf) outs(%4 : !pto.tile_buf) + %k_cache__tile_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%15, %c64], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%4 : !pto.tile_buf) outs(%k_cache__tile_pview : !pto.partition_tensor_view<1x64xbf16>) + %5 = pto.alloc_tile addr = %c2176 : !pto.tile_buf + %17 = arith.muli %ki__idx_v0, %c128 : index + %v_proj__rv_v5_pview = pto.partition_view %v_proj__rv_v5_view, offsets = [%arg8, %17], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xf32> + pto.tload ins(%v_proj__rv_v5_pview : !pto.partition_tensor_view<1x128xf32>) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c2688 : !pto.tile_buf + pto.tcvt ins(%5{rmode = #pto} : !pto.tile_buf) outs(%6 : !pto.tile_buf) + %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%15, %c0], sizes = [%c1, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<1x128xbf16> + pto.tstore ins(%6 : !pto.tile_buf) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>) + } + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto new file mode 100644 index 000000000..2f80eb162 
--- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_7.pto @@ -0,0 +1,92 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_7(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: index, %arg10: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c768 = arith.constant 768 : i64 + %c1024 = arith.constant 1024 : i64 + %c1280 = arith.constant 1280 : i64 + %c1536 = arith.constant 1536 : i64 + %c1792 = arith.constant 1792 : i64 + %c2048 = arith.constant 2048 : i64 + %c2176 = arith.constant 2176 : i64 + %c2304 = arith.constant 2304 : i64 + %c6400 = arith.constant 6400 : i64 + %c6432 = arith.constant 6432 : i64 + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + %c8192 = arith.constant 8192 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_padded__ssa_v0_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %q_proj__rv_v5_view = pto.make_tensor_view %arg3, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout}: 
!pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg6, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg7, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret2__out_view = pto.make_tensor_view %arg8, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf) + %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf) + %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf + %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf) + %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf + %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf) + scf.for %qi__idx_v0 = %c0 to %c8 step %c1 { + %5 = arith.addi %arg10, %qi__idx_v0 : index + %6 = arith.muli %5, %c128 : index + %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf + %q_proj__rv_v5_pview = 
pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %6], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%q_proj__rv_v5_pview : !pto.partition_tensor_view<1x64xf32>) outs(%q_lo__tile : !pto.tile_buf) + %q_hi__tile = pto.alloc_tile addr = %c1280 : !pto.tile_buf + %8 = arith.addi %6, %c64 : index + %7 = pto.partition_view %q_proj__rv_v5_view, offsets = [%arg9, %8], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xf32> + pto.tload ins(%7 : !pto.partition_tensor_view<1x64xf32>) outs(%q_hi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c1792 : !pto.tile_buf + pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf, !pto.tile_buf) outs(%0 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c1536 : !pto.tile_buf + pto.tsub ins(%t__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%1 : !pto.tile_buf) + %rot_lo_bf16__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%1{rmode = #pto} : !pto.tile_buf) outs(%rot_lo_bf16__tile : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c1280 : !pto.tile_buf + pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%2 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf, !pto.tile_buf) outs(%3 : !pto.tile_buf) + %4 = pto.alloc_tile addr = %c1024 : !pto.tile_buf + pto.tadd ins(%2, %3 : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %rot_hi_bf16__tile = pto.alloc_tile addr = %c2176 : !pto.tile_buf + pto.tcvt ins(%4{rmode = #pto} : !pto.tile_buf) outs(%rot_hi_bf16__tile : !pto.tile_buf) + %q_padded__iter_v1_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c64] : !pto.tensor_view -> 
!pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%rot_lo_bf16__tile : !pto.tile_buf) outs(%q_padded__iter_v1_pview : !pto.partition_tensor_view<1x64xbf16>) + %q_padded__tile_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%qi__idx_v0, %c64], sizes = [%c1, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%rot_hi_bf16__tile : !pto.tile_buf) outs(%q_padded__tile_pview : !pto.partition_tensor_view<1x64xbf16>) + } + %oi__tile = pto.alloc_tile addr = %c2304 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf) + %li_flat__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c6400 : !pto.tile_buf + %mi_flat__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf + pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c6432 : !pto.tile_buf + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<8x128xf32> + pto.tstore ins(%oi__tile : !pto.tile_buf) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto new file mode 100644 index 000000000..53988ea99 --- /dev/null +++ 
b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_8.pto @@ -0,0 +1,30 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_8(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c16384 = arith.constant 16384 : i64 + %c524288 = arith.constant 524288 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout}: !pto.tensor_view + %q_padded__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<128x64xbf16> + pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf) + %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf + %q_padded__rv_v2_pview = pto.partition_view %q_padded__rv_v2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xbf16> + pto.tload ins(%q_padded__rv_v2_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf) + %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%lhs_mat : !pto.tile_buf) outs(%lhs_mat_Left : !pto.tile_buf) + %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%k_tile__tile : !pto.tile_buf) outs(%k_tile__tile_Right : !pto.tile_buf) + 
%raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf, !pto.tile_buf) outs(%raw_scores_pad__tile : !pto.tile_buf) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<16x64xf32> + pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>) + return + } +} diff --git a/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto new file mode 100644 index 000000000..eb677daf6 --- /dev/null +++ b/test/samples/Qwen3Tilelet/qwen3_decode_layer_incore_9.pto @@ -0,0 +1,49 @@ +module attributes {pto.target_arch = "a5"} { + func.func @qwen3_decode_layer_incore_9(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c2048 = arith.constant 2048 : i64 + %c4096 = arith.constant 4096 : i64 + %c8192 = arith.constant 8192 : i64 + %c8224 = arith.constant 8224 : i64 + %c9248 = arith.constant 9248 : i64 + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 8.838835e-02 : f32 + %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout}: !pto.tensor_view + %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %ret1__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout}: !pto.tensor_view + %scores_valid__tile = pto.alloc_tile 
addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf + %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf) + pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf + %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf) outs(%scores_padded__tile : !pto.tile_buf) + %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf, f32) outs(%scores__tile : !pto.tile_buf) + %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf + pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf, !pto.tile_buf) outs(%cur_mi__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.texp ins(%t__tile : !pto.tile_buf) outs(%exp_scores__tile : !pto.tile_buf) + %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf + pto.tcvt ins(%exp_scores__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_bf16__tile : !pto.tile_buf) + %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf + pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto} : !pto.tile_buf) outs(%exp_scores_fp32__tile : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf + pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf, !pto.tile_buf) outs(%cur_li__tile : !pto.tile_buf) + %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, 
offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xbf16> + pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) + %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_li__tile : !pto.tile_buf) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>) + %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<8x1xf32> + pto.tstore ins(%cur_mi__tile : !pto.tile_buf) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>) + return + } +} From 53776e53ce0395905c632d3834d557d4477aeda0 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 9 Apr 2026 12:00:48 +0800 Subject: [PATCH 12/16] test: harden mixed-kernel wrapper generation --- .../scripts/generate_testcase.py | 230 +++++++++++++++++- 1 file changed, 221 insertions(+), 9 deletions(-) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index bf5dedf79..f1afba770 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -152,6 +152,66 @@ def _find_matching_brace(text: str, open_brace_index: int) -> Optional[int]: return None +def _extract_function_body(function_text: str) -> str: + brace_index = function_text.find("{") + if brace_index < 0: + return "" + end_index = _find_matching_brace(function_text, brace_index) + if end_index is None: + return "" + body = function_text[brace_index + 1:end_index].strip() + body = re.sub(r"\breturn\s*;\s*$", "", body, flags=re.S).rstrip() + return body + + +def _strip_ptoas_auto_sync_tail(body: str) -> tuple[str, bool]: + pattern = re.compile( + r"\n?\s*ptoas_auto_sync_tail\s*\([^;]*\)\s*;\s*$", + re.S, + ) + 
updated = pattern.sub("", body.rstrip()) + return updated.rstrip(), updated != body.rstrip() + + +def _indent_block(text: str, spaces: int = 4) -> str: + prefix = " " * spaces + return "\n".join((prefix + line) if line else "" for line in text.splitlines()) + + +def _split_cpp_args(text: str): + text = text.strip() + if not text: + return [] + parts = [] + depth_angle = 0 + depth_paren = 0 + depth_brace = 0 + depth_bracket = 0 + start = 0 + for idx, ch in enumerate(text): + if ch == "<": + depth_angle += 1 + elif ch == ">": + depth_angle = max(depth_angle - 1, 0) + elif ch == "(": + depth_paren += 1 + elif ch == ")": + depth_paren = max(depth_paren - 1, 0) + elif ch == "{": + depth_brace += 1 + elif ch == "}": + depth_brace = max(depth_brace - 1, 0) + elif ch == "[": + depth_bracket += 1 + elif ch == "]": + depth_bracket = max(depth_bracket - 1, 0) + elif ch == "," and depth_angle == 0 and depth_paren == 0 and depth_brace == 0 and depth_bracket == 0: + parts.append(text[start:idx].strip()) + start = idx + 1 + parts.append(text[start:].strip()) + return [part for part in parts if part] + + def _extract_aicore_functions(text: str): pattern = re.compile( r"(?P__global__\s+)?AICORE\s+void\s+(?P\w+)\s*\((?P[^)]*)\)\s*\{", @@ -211,8 +271,8 @@ def _describe_kernel_source(text: str): "raw_params": params, "analysis_texts": [group["aic"]["text"], group["aiv"]["text"]], "writer_texts": [group["aiv"]["text"]], - "aic_name": group["aic"]["name"], - "aiv_name": group["aiv"]["name"], + "aic_text": group["aic"]["text"], + "aiv_text": group["aiv"]["text"], "call_text": group["aiv"]["text"], } @@ -230,15 +290,167 @@ def _append_mixed_kernel_wrapper( kernel_text: str, kernel_name: str, raw_params: list[str], - aic_name: str, - aiv_name: str, + aic_text: str, + aiv_text: str, ) -> str: - wrapper_call_args = ", ".join(_extract_cpp_name(param) for param in raw_params) + pipe_decl_pattern = re.compile( + r"^(?P\s*)auto\s+(?P\w+)\s*=\s*(?PTPipe<[^;=]+>)\s*\((?P[^;]*)\)\s*;\s*$", + re.M, 
+ ) + param_names = {_extract_cpp_name(param) for param in raw_params} + safe_identifiers = {"nullptr", "NULL", "true", "false"} + + def _find_decl_init(prefix: str, name: str): + pattern = re.compile( + rf"^\s*(?P[^=\n;]+?)\s+{re.escape(name)}\s*=\s*(?P[^;]+);\s*$", + re.M, + ) + match = None + for current in pattern.finditer(prefix): + match = current + if match is None: + return None, None, None + return match.group("type").strip(), match.group("init").strip(), match.start() + + def _render_pointer_init(type_text: str, init_text: str) -> str: + expr = init_text.strip() + if "*" not in type_text: + return expr + if expr.startswith("(") or expr.startswith("reinterpret_cast") or expr.startswith("static_cast"): + return expr + return f"({type_text}){expr}" + + def _resolve_ctor_arg(arg_text: str, prefix: str, depth: int = 0): + arg_text = arg_text.strip() + if not arg_text: + return None + if depth > 8: + return None + if not re.fullmatch(r"[A-Za-z_]\w*", arg_text): + return arg_text + if arg_text in safe_identifiers: + return arg_text + if arg_text in param_names: + return arg_text + type_text, init_text, decl_start = _find_decl_init(prefix, arg_text) + if type_text is None or init_text is None: + return None + resolved_init = init_text + if ( + re.fullmatch(r"[A-Za-z_]\w*", init_text) + and init_text not in param_names + and init_text not in safe_identifiers + ): + resolved_init = _resolve_ctor_arg(init_text, prefix[:decl_start], depth + 1) + if resolved_init is None: + return None + return _render_pointer_init(type_text, resolved_init) + + def _extract_pipe_decls(body: str): + decls = [] + for match in pipe_decl_pattern.finditer(body): + ctor_args = _split_cpp_args(match.group("args")) + prefix = body[:match.start()] + resolved_args = [] + for arg in ctor_args: + resolved = _resolve_ctor_arg(arg, prefix) + if resolved is None: + break + resolved_args.append(resolved) + else: + decls.append( + { + "name": match.group("name"), + "type_text": 
match.group("type").strip(), + "ctor_args": tuple(resolved_args), + "span": match.span(), + } + ) + return decls + + def _rewrite_body(body: str, replacements): + rewritten = body + for replacement in sorted(replacements, key=lambda item: item["span"][0], reverse=True): + start, end = replacement["span"] + rewritten = rewritten[:start] + rewritten[end:] + for replacement in replacements: + rewritten = re.sub( + rf"\b{re.escape(replacement['old_name'])}\b", + replacement["new_name"], + rewritten, + ) + return rewritten.strip() + + def _next_shared_name(seed: int, texts: list[str]) -> str: + index = seed + while True: + name = f"__ptoas_shared_pipe{index}" + if all(name not in text for text in texts): + return name + index += 1 + + aic_body = _extract_function_body(aic_text) + aiv_body = _extract_function_body(aiv_text) + aic_body, aic_has_tail = _strip_ptoas_auto_sync_tail(aic_body) + aiv_body, aiv_has_tail = _strip_ptoas_auto_sync_tail(aiv_body) + aic_decls = _extract_pipe_decls(aic_body) + aiv_decls = _extract_pipe_decls(aiv_body) + + shared_pairs = [] + aiv_by_key = {} + for decl in aiv_decls: + key = (decl["type_text"], decl["ctor_args"]) + aiv_by_key.setdefault(key, []).append(decl) + for decl in aic_decls: + key = (decl["type_text"], decl["ctor_args"]) + bucket = aiv_by_key.get(key) + if not bucket: + continue + shared_pairs.append((decl, bucket.pop(0))) + + shared_decls = [] + aic_replacements = [] + aiv_replacements = [] + shared_seed = 0 + texts_for_name_check = [kernel_text, aic_body, aiv_body] + for aic_decl, aiv_decl in shared_pairs: + shared_name = _next_shared_name(shared_seed, texts_for_name_check) + shared_seed += 1 + texts_for_name_check.append(shared_name) + shared_decls.append( + f" auto {shared_name} = {aic_decl['type_text']}({', '.join(aic_decl['ctor_args'])});" + ) + aic_replacements.append( + { + "old_name": aic_decl["name"], + "new_name": shared_name, + "span": aic_decl["span"], + } + ) + aiv_replacements.append( + { + "old_name": 
aiv_decl["name"], + "new_name": shared_name, + "span": aiv_decl["span"], + } + ) + + wrapper_blocks = [] + for body in (_rewrite_body(aic_body, aic_replacements), _rewrite_body(aiv_body, aiv_replacements)): + if not body: + continue + wrapper_blocks.append(" {\n" + _indent_block(body) + "\n }") + + if not wrapper_blocks: + return kernel_text + wrapper = ( "\n\n" f"__global__ AICORE void {kernel_name}({', '.join(raw_params)}) {{\n" - f" {aic_name}({wrapper_call_args});\n" - f" {aiv_name}({wrapper_call_args});\n" + + ("\n".join(shared_decls) + ("\n\n" if shared_decls else "")) + + "\n".join(wrapper_blocks) + + ("\n ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);" if (aic_has_tail or aiv_has_tail) else "") + + "\n" "}\n" ) return kernel_text.rstrip() + wrapper @@ -1636,8 +1848,8 @@ def generate_testcase( kernel_text_out, kernel_name, raw_params, - kernel_info["aic_name"], - kernel_info["aiv_name"], + kernel_info["aic_text"], + kernel_info["aiv_text"], ) kernel_out = output_dir / f"{testcase}_kernel.cpp" From df70b4ed49a4768aa468b91a4bfaca818257f3e6 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 9 Apr 2026 14:32:59 +0800 Subject: [PATCH 13/16] fix(emitc): guard a5 nosplit vector pipe consumers --- lib/PTO/Transforms/PTOToEmitC.cpp | 82 ++++++++++++++++++- .../basic/tpush_tpop_frontend_lowering_a5.pto | 2 + 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index 43438a364..6c08d60d6 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -406,6 +406,7 @@ static Value emitCCast(ConversionPatternRewriter &rewriter, Location loc, static Value castSignlessIntToUnsignedSameWidth(ConversionPatternRewriter &rewriter, Location loc, Value v, unsigned bitWidth); +static bool needsA5NoSplitVectorGuard(Operation *op); static FailureOr getTileSplitToken(int64_t split) { switch (split) { @@ -2517,6 +2518,9 @@ struct FuncToEmitC : public 
OpConversionPattern { emitcFunc.setSpecifiersAttr(rewriter.getStrArrayAttr({"AICORE"})); } + std::optional kernelKindMacro = getKernelKindMacro(op); + bool needsNoSplitGuard = needsA5NoSplitVectorGuard(op.getOperation()); + // Inline the original body, then convert region/block argument types to // match the converted signature (also covers CFG blocks introduced by // pre-lowering, e.g. scf.while -> cf.br/cf.cond_br). @@ -2531,8 +2535,6 @@ struct FuncToEmitC : public OpConversionPattern { *getTypeConverter(), &entryConv))) return failure(); - std::optional kernelKindMacro = getKernelKindMacro(op); - // Preserve the existing function prologue shape. `kernel_kind` functions are // emitted with the same macro guard/reset sequence that used to come from // early pto.section wrapping, but only after SCF pre-lowering has finished. @@ -2547,6 +2549,9 @@ struct FuncToEmitC : public OpConversionPattern { rewriter.create(op.getLoc(), "set_mask_norm();"); rewriter.create(op.getLoc(), "set_vector_mask(-1, -1);"); + if (needsNoSplitGuard) + rewriter.create( + op.getLoc(), "if (get_subblockid() == 0) {"); } } } @@ -2554,6 +2559,8 @@ struct FuncToEmitC : public OpConversionPattern { if (kernelKindMacro) { Block &lastBlock = emitcFunc.getBody().back(); rewriter.setInsertionPoint(lastBlock.getTerminator()); + if (*kernelKindMacro == "__DAV_VEC__" && needsNoSplitGuard) + rewriter.create(op.getLoc(), "}"); std::string endMacro = "#endif // " + kernelKindMacro->str() + "\n"; rewriter.create(op.getLoc(), endMacro); } @@ -8956,6 +8963,68 @@ class ArithCmpIToEmitC : public OpConversionPattern { //===----------------------------------------------------------------------===// // Section Op Lowering //===----------------------------------------------------------------------===// +static bool isA5NoSplitPipeOp(Operation *op) { + if (auto tpush = dyn_cast(op)) + return tpush.getSplit() == 0; + if (auto tpop = dyn_cast(op)) + return tpop.getSplit() == 0; + if (auto tfree = dyn_cast(op)) + 
return tfree.getSplit() == 0; + if (auto tpush = dyn_cast(op)) + return tpush.getSplit() == 0; + if (auto tpush = dyn_cast(op)) + return tpush.getSplit() == 0; + if (auto tpop = dyn_cast(op)) + return tpop.getSplit() == 0; + if (auto tpop = dyn_cast(op)) + return tpop.getSplit() == 0; + if (auto tfree = dyn_cast(op)) + return tfree.getSplit() == 0; + if (auto tfree = dyn_cast(op)) + return tfree.getSplit() == 0; + return false; +} + +static bool hasExplicitSubblockControl(Operation *op) { + bool hasControl = false; + op->walk([&](Operation *nested) { + if (isa(nested)) { + hasControl = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + return hasControl; +} + +static bool needsA5NoSplitVectorGuard(Operation *op) { + auto arch = getTargetArch(op); + if (arch != PTOArch::A5) + return false; + bool isVectorScope = isa(op); + if (auto func = dyn_cast(op)) { + if (auto kernelKindAttr = + func->getAttrOfType( + FunctionKernelKindAttr::name)) { + isVectorScope = + kernelKindAttr.getKernelKind() == FunctionKernelKind::Vector; + } + } + if (!isVectorScope) + return false; + if (hasExplicitSubblockControl(op)) + return false; + + bool hasNoSplitPipe = false; + op->walk([&](Operation *nested) { + if (!isA5NoSplitPipeOp(nested)) + return WalkResult::advance(); + hasNoSplitPipe = true; + return WalkResult::interrupt(); + }); + return hasNoSplitPipe; +} + template struct SectionToEmitC : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -8972,6 +9041,7 @@ struct SectionToEmitC : public OpConversionPattern { matchAndRewrite(SectionOpTy op, typename SectionOpTy::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op.getLoc(); + bool needsNoSplitGuard = needsA5NoSplitVectorGuard(op.getOperation()); std::string startMacro = "\n#if defined(" + getMacroName() + ")"; rewriter.create(loc, startMacro); @@ -8984,11 +9054,19 @@ struct SectionToEmitC : public OpConversionPattern { 
rewriter.create(loc, "set_vector_mask(-1, -1);"); } + if (needsNoSplitGuard) { + rewriter.create( + loc, "if (get_subblockid() == 0) {"); + } + Block &innerBlock = op.getBody().front(); if (!innerBlock.empty()) { rewriter.inlineBlockBefore(&innerBlock, op.getOperation(), ValueRange{}); } + if (needsNoSplitGuard) + rewriter.create(loc, "}"); + std::string endMacro = "#endif // " + getMacroName() + "\n"; rewriter.create(loc, endMacro); diff --git a/test/basic/tpush_tpop_frontend_lowering_a5.pto b/test/basic/tpush_tpop_frontend_lowering_a5.pto index 84e20b799..9787217f6 100644 --- a/test/basic/tpush_tpop_frontend_lowering_a5.pto +++ b/test/basic/tpush_tpop_frontend_lowering_a5.pto @@ -66,6 +66,7 @@ module { // A5: TFREE, TileSplitAxis::TILE_NO_SPLIT>( // A5-LABEL: AICORE void vector_kernel( +// A5: if (get_subblockid() == 0) { // A5: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_BOTH, 1024, 4>( // A5: Tile {{v[0-9]+}}; // A5: Tile {{v[0-9]+}}; @@ -75,3 +76,4 @@ module { // A5: Tile {{v[0-9]+}}; // A5: TNEG( // A5: TFREE, TileSplitAxis::TILE_NO_SPLIT>( +// A5: } From f0bb59deb2d8252322f5ef4bdf76e2d89cdf5c4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=89=A2=E5=A4=A7?= Date: Thu, 9 Apr 2026 16:59:15 +0800 Subject: [PATCH 14/16] test: allow qwen3 tilelet cases on a3 --- .github/workflows/ci.yml | 2 +- test/samples/Qwen3Tilelet/README.md | 2 +- test/samples/runop.sh | 8 ++------ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5e7b3bdf1..7e11b5fbf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -293,7 +293,7 @@ jobs: # suite (RUN_ONLY_CASES is empty), skip the non-matching variant based # on SOC_VERSION to keep the remote validation portable. 
A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync" - A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,qwen3_decode_layer_incore_1,qwen3_decode_layer_incore_2,qwen3_decode_layer_incore_10,qwen3_decode_layer_incore_13,qwen3_decode_layer_incore_14" + A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5" sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')" is_a5=0 diff --git a/test/samples/Qwen3Tilelet/README.md b/test/samples/Qwen3Tilelet/README.md index b713c4e28..f3fbcdb95 100644 --- a/test/samples/Qwen3Tilelet/README.md +++ b/test/samples/Qwen3Tilelet/README.md @@ -2,7 +2,7 @@ Qwen3 tilelet PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_ Scope: - compile-regression inputs for `ptoas` -- A5-only kernels; `runop.sh` injects `--pto-arch a5 --pto-level=level3` for this directory unless the caller already overrides `PTOAS_FLAGS` +- tilelet kernels that default to `--pto-arch a5 --pto-level=level3` in `runop.sh`, but can also be compiled on A3 when the caller overrides `PTOAS_FLAGS` Notes: - The source PyPTO program lowers to 20 `qwen3_decode_layer_incore_*.pto` fragments; this directory vendors the full emitted `.pto` set regenerated from the tilelet source with `BATCH_TILE=16`. diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 8ac6921a7..4d0d38c81 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -154,8 +154,8 @@ process_one_dir() { use_ptobc_roundtrip=1 fi # Qwen3 tilelet kernels currently serve as direct ptoas compile-regression - # coverage. They require A5/level3 lowering, but are not expected to - # roundtrip through ptobc yet. + # coverage. Default them to A5/level3 lowering when the caller does not + # provide an explicit arch, but let A3/A5 callers override PTOAS_FLAGS. 
if [[ "$A" == "Qwen3Tilelet" ]]; then use_ptobc_roundtrip=0 fi @@ -942,10 +942,6 @@ PY if [[ "$A" == "Qwen3Tilelet" ]]; then cpp="${out_subdir}/${base}-pto.cpp" fi - if [[ "$A" == "Qwen3Tilelet" && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then - echo -e "${A}(${base}.pto)\tSKIP\trequires --pto-arch=a5" - continue - fi local sample_use_ptobc_roundtrip="$use_ptobc_roundtrip" # TODO(ptobc): decode of this regression currently fails with From 4421585430bf4375949a5bfe91f961b29b409322 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Mon, 13 Apr 2026 11:49:31 +0800 Subject: [PATCH 15/16] test: mark qwen tilelet samples as a5-only --- .github/workflows/ci.yml | 23 ++++++++++++----------- test/samples/runop.sh | 14 +++++++++++++- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7e11b5fbf..e27c1c2dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -288,12 +288,15 @@ jobs: fi fi - # Some validation samples have A3 vs A5 variants due to stricter - # pto-isa static checks on Ascend950 (A5). When running the full test - # suite (RUN_ONLY_CASES is empty), skip the non-matching variant based - # on SOC_VERSION to keep the remote validation portable. + # Some validation samples are arch-specific due to stricter pto-isa + # static checks and A5-only tile layouts. Always skip the + # non-matching variant based on SOC_VERSION, even for explicit + # RUN_ONLY_CASES requests, so remote validation does not try to force + # A5-only cases through an A3 flow or vice versa. 
A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync" - A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5" + QWEN3_TILELET_A5_ONLY_CASES="$(printf 'qwen3_decode_layer_incore_%s,' {0..19})" + QWEN3_TILELET_A5_ONLY_CASES="${QWEN3_TILELET_A5_ONLY_CASES%,}" + A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,${QWEN3_TILELET_A5_ONLY_CASES}" sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')" is_a5=0 @@ -301,12 +304,10 @@ jobs: is_a5=1 fi - if [[ -z "${RUN_ONLY_CASES}" ]]; then - if [[ ${is_a5} -eq 1 ]]; then - SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A3_ONLY_CASES}" - else - SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A5_ONLY_CASES}" - fi + if [[ ${is_a5} -eq 1 ]]; then + SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A3_ONLY_CASES}" + else + SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A5_ONLY_CASES}" fi echo "STAGE=${STAGE}" >> "${GITHUB_ENV}" diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 4d0d38c81..866bcdd89 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -155,7 +155,8 @@ process_one_dir() { fi # Qwen3 tilelet kernels currently serve as direct ptoas compile-regression # coverage. Default them to A5/level3 lowering when the caller does not - # provide an explicit arch, but let A3/A5 callers override PTOAS_FLAGS. + # provide an explicit arch, and skip them entirely when the caller forces an + # A3 lowering path because the samples use A5-only matmul tile layouts. 
if [[ "$A" == "Qwen3Tilelet" ]]; then use_ptobc_roundtrip=0 fi @@ -221,6 +222,17 @@ process_one_dir() { echo -e "${A}\tSKIP\tMissing dir: $dir" return 0 fi + if [[ "$A" == "Qwen3Tilelet" && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then + local qwen_case + for qwen_case in "$dir"/*.pto; do + [[ -f "$qwen_case" ]] || continue + case "$qwen_case" in + *-pto-ir.pto) continue ;; + esac + echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires --pto-arch=a5" + done + return 0 + fi # Run every .py file in this directory (no requirement that name matches folder). local f mlir ptobc_file decoded_pto cpp base overall=0 From b5c7fc455f12ad009edddac5f4ad33d65fdca113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=89=A2=E5=A4=A7?= Date: Mon, 13 Apr 2026 15:02:00 +0800 Subject: [PATCH 16/16] test: preserve qwen tilelet level3 under arch overrides --- test/samples/runop.sh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 866bcdd89..fde9e29bb 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -180,6 +180,7 @@ process_one_dir() { local target_arch="a3" local has_pto_arch_override=0 + local has_pto_level_override=0 if ((${#ptoas_flags[@]})); then for ((idx=0; idx<${#ptoas_flags[@]}; ++idx)); do if [[ "${ptoas_flags[idx]}" == "--pto-arch" && $((idx + 1)) -lt ${#ptoas_flags[@]} ]]; then @@ -188,12 +189,21 @@ process_one_dir() { elif [[ "${ptoas_flags[idx]}" == --pto-arch=* ]]; then target_arch="${ptoas_flags[idx]#--pto-arch=}" has_pto_arch_override=1 + elif [[ "${ptoas_flags[idx]}" == "--pto-level" && $((idx + 1)) -lt ${#ptoas_flags[@]} ]]; then + has_pto_level_override=1 + elif [[ "${ptoas_flags[idx]}" == --pto-level=* ]]; then + has_pto_level_override=1 fi done fi - if [[ "$A" == "Qwen3Tilelet" && $has_pto_arch_override -eq 0 ]]; then - ptoas_flags+=(--pto-arch a5 --pto-level=level3) - target_arch="a5" + if [[ "$A" == "Qwen3Tilelet" ]]; then + if [[ 
$has_pto_arch_override -eq 0 ]]; then + ptoas_flags+=(--pto-arch a5) + target_arch="a5" + fi + if [[ $has_pto_level_override -eq 0 ]]; then + ptoas_flags+=(--pto-level=level3) + fi fi local expected_vec_barrier="pipe_barrier(PIPE_V)" local skip_vec_barrier=0