libxsmm · jopperm · May 21, 2026 · May 26, 2026 · May 26, 2026
diff --git a/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/10_3D_tensor_matrix_multiplication.py
@@ -0,0 +1,139 @@
+# ruff: noqa: E731, E741
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+def _configs():
+    return [
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 256, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 256, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=3,
+        ),
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ]
+
+
+@triton.autotune(configs=_configs(), key=["M", "N", "K"])
+@triton.jit
+def _matmul_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
+
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):
+        a_tile = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_tile = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_tile, b_tile)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        batch, m, k = A.shape
+        _, l = B.shape
+
+        a = A.to(torch.bfloat16).contiguous()
+        b = B.to(torch.bfloat16).contiguous()
+
+        a_flat = a.reshape(batch * m, k)
+        total_m = batch * m
+
+        c_flat = torch.empty((total_m, l), device=a.device, dtype=torch.bfloat16)
+
+        def grid(META):
+            return (
+                triton.cdiv(total_m, META["BLOCK_M"]) * triton.cdiv(l, META["BLOCK_N"]),
+            )
+
+        _matmul_kernel[grid](
+            a_flat,
+            b,
+            c_flat,
+            total_m,
+            l,
+            k,
+            a_flat.stride(0),
+            a_flat.stride(1),
+            b.stride(0),
+            b.stride(1),
+            c_flat.stride(0),
+            c_flat.stride(1),
+        )
+
+        return c_flat.reshape(batch, m, l)
diff --git a/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py b/backends/triton/cpu/KernelBench/level1/11_4D_tensor_matrix_multiplication.py
@@ -0,0 +1,158 @@
+# ruff: noqa: E731, E741
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def swizzle_tile(
+    tile_id,
+    M,
+    N,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    grid_m = tl.cdiv(M, BLOCK_M)
+    grid_n = tl.cdiv(N, BLOCK_N)
+    width = GROUP_SIZE_M * grid_n
+    group_id = tile_id // width
+    group_size = tl.minimum(GROUP_SIZE_M, grid_m - group_id * GROUP_SIZE_M)
+    pid_m = group_id * GROUP_SIZE_M + ((tile_id % width) % group_size)
+    pid_n = (tile_id % width) // group_size
+    return pid_m, pid_n
+
+
+def get_autotune_configs():
+    return [
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 256, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 256, "BLOCK_K": 32, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
+            num_warps=32,
+            num_stages=2,
+        ),
+    ]
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["M", "N", "K"],
+)
+@triton.jit
+def _gemm_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    K,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    pid_m, pid_n = swizzle_tile(pid, M, N, BLOCK_M, BLOCK_N, GROUP_SIZE_M)
+
+    a_desc = tl.make_tensor_descriptor(
+        base=a_ptr,
+        shape=(M, K),
+        strides=(stride_am, stride_ak),
+        block_shape=(BLOCK_M, BLOCK_K),
+    )
+    b_desc = tl.make_tensor_descriptor(
+        base=b_ptr,
+        shape=(K, N),
+        strides=(stride_bk, stride_bn),
+        block_shape=(BLOCK_K, BLOCK_N),
+    )
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for off_k in range(0, K, BLOCK_K):
+        a_block = a_desc.load([pid_m * BLOCK_M, off_k])
+        b_block = b_desc.load([off_k, pid_n * BLOCK_N])
+        acc += tl.dot(a_block, b_block)
+    c_desc = tl.make_tensor_descriptor(
+        base=c_ptr,
+        shape=(M, N),
+        strides=(stride_cm, stride_cn),
+        block_shape=(BLOCK_M, BLOCK_N),
+    )
+    c_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], acc.to(c_ptr.type.element_ty))
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        b_dim, i_dim, j_dim, l_dim = A.shape
+        k_dim = B.shape[1]
+
+        A_flat = A.contiguous().view(-1, l_dim)
+        if A_flat.dtype != torch.bfloat16:
+            A_flat = A_flat.to(torch.bfloat16)
+        B_fp16 = B.contiguous()
+        if B_fp16.dtype != torch.bfloat16:
+            B_fp16 = B_fp16.to(torch.bfloat16)
+
+        M = A_flat.shape[0]
+        N = k_dim
+        K = l_dim
+
+        C_2d = torch.empty((M, N), device=A.device, dtype=torch.bfloat16)
+
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
+        )
+
+        _gemm_kernel[grid](
+            A_flat,
+            B_fp16,
+            C_2d,
+            M,
+            N,
+            K,
+            A_flat.stride(0),
+            A_flat.stride(1),
+            B_fp16.stride(0),
+            B_fp16.stride(1),
+            C_2d.stride(0),
+            C_2d.stride(1),
+        )
+
+        result = C_2d.view(b_dim, i_dim, j_dim, k_dim)
+        if A.dtype != torch.bfloat16:
+            result = result.to(A.dtype)
+        return result
diff --git a/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py b/backends/triton/cpu/KernelBench/level1/12_Matmul_with_diagonal_matrices_.py
@@ -0,0 +1,85 @@
+# ruff: noqa: E731
+# AUTOGENERATED KERNEL (LLM)
+# Source: LLM-generated candidate implementation
+# Status: Experimental / uncurated
+# Expectation: Correctness-first, performance not representative
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_M": 256, "BLOCK_N": 256}, num_warps=32, num_stages=2),
+        triton.Config({"BLOCK_M": 256, "BLOCK_N": 256}, num_warps=32, num_stages=3),
+        triton.Config({"BLOCK_M": 256, "BLOCK_N": 128}, num_warps=32, num_stages=2),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=32, num_stages=2),
+        triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=32, num_stages=2),
+    ],
+    key=["N", "M"],
+)
+@triton.jit
+def _diag_matmul_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    N,
+    M,
+    stride_bn,
+    stride_bm,
+    stride_cn,
+    stride_cm,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_n = tl.program_id(0)
+    pid_m = tl.program_id(1)
+
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+
+    mask_n = offs_n < N
+    mask_m = offs_m < M
+
+    a_vals = tl.load(a_ptr + offs_n, mask=mask_n, other=0.0)
+
+    b_ptrs = b_ptr + offs_n[:, None] * stride_bn + offs_m[None, :] * stride_bm
+    mask = mask_n[:, None] & mask_m[None, :]
+    b_vals = tl.load(b_ptrs, mask=mask, other=0.0)
+
+    c_vals = a_vals[:, None].to(tl.float32) * b_vals.to(tl.float32)
+
+    c_ptrs = c_ptr + offs_n[:, None] * stride_cn + offs_m[None, :] * stride_cm
+    tl.store(c_ptrs, c_vals.to(tl.bfloat16), mask=mask)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A, B):
+        N = A.shape[0]
+        M = B.shape[1]
+
+        C = torch.empty((N, M), device=A.device, dtype=A.dtype)
+
+        grid = lambda META: (
+            triton.cdiv(N, META["BLOCK_N"]),
+            triton.cdiv(M, META["BLOCK_M"]),
+        )
+
+        _diag_matmul_kernel[grid](
+            A,
+            B,
+            C,
+            N,
+            M,
+            B.stride(0),
+            B.stride(1),
+            C.stride(0),
+            C.stride(1),
+        )
+
+        return C