From 9fe074395fc75018daa6a2cf3e4d1c15a77ec498 Mon Sep 17 00:00:00 2001
From: Renato Golin <rengolin@systemcall.eu>
Date: Tue, 19 May 2026 10:36:34 +0100
Subject: [PATCH] [KB] Add smoke test to check coverage

Track compilation / execution coverage in the YAML files by means of
`warning` entries. No performance checks are added so far (still, only the
first two are fast).
---
 examples/end-to-end/KernelBench/level1.yaml   | 43 ++++++-------------
 examples/end-to-end/KernelBench/level2.yaml   | 12 ++++++
 .../KernelBench/test_kernel_bench.py          | 27 +++++++++---
 3 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/examples/end-to-end/KernelBench/level1.yaml b/examples/end-to-end/KernelBench/level1.yaml
index 4a9d2e6c..4fc8c470 100644
--- a/examples/end-to-end/KernelBench/level1.yaml
+++ b/examples/end-to-end/KernelBench/level1.yaml
@@ -64,9 +64,7 @@
   initializations: [rnd, rnd]
   output_shape: 1024x1024
   dtypes: [f32, bf16]
-  # gflops: (1024 * 32 * 1024 * 2) / 1e9
-  # pipeline: matmul
-  warning: "Optimized pipeline error: too many tiles provided, expected at most 3 found 4"
+  gflops: (1024 * 32 * 1024 * 2) / 1e9
 
 - kernel: level1/10_3D_tensor_matrix_multiplication.py
   input_shapes: [16x1024x2048, 2048x768]
@@ -88,8 +86,7 @@
   output_shape: 4096x4096
   dtypes: [f32, bf16]
   warning: '''ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError:
-              Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics:
-              python exception: Failure while executing pass pipeline'''
+              error: failed to legalize operation torch.operator that was explicitly marked illegal'''
 
 - kernel: level1/13_Matmul_for_symmetric_matrices.py
   input_shapes: [4096x4096, 4096x4096]
@@ -107,7 +104,6 @@
   gflops: (4096 * 4096 * 4096 * 2) / 1e9
   pipeline: matmul
 
-
 - kernel: level1/15_Matmul_for_lower_triangular_matrices.py
   input_shapes: [4096x4096, 4096x4096]
   initializations: [rnd, rnd]
@@ -116,7 +112,6 @@
   gflops: (4096 * 4096 * 4096 * 2) / 1e9
   pipeline: matmul
 
-
 - kernel: level1/16_Matmul_with_transposed_A.py
   input_shapes: [8192x2048, 8192x4096]
   initializations: [rnd, rnd]
@@ -148,7 +143,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/20_LeakyReLU.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -156,7 +150,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/21_Sigmoid.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -164,7 +157,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/22_Tanh.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -172,7 +164,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/23_Softmax.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -180,7 +171,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/24_LogSoftmax.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -188,7 +178,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/25_Swish.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -196,14 +185,14 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/26_GELU_.py
   input_shapes: [4096x393216]
   initializations: [rnd]
   output_shape: 4096x393216
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
-
+  warning: '''error: cannot be converted to LLVM IR:
+              missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf'''
 
 - kernel: level1/27_SELU_.py
   input_shapes: [4096x393216]
@@ -212,7 +201,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/28_HardSigmoid.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -220,7 +208,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/29_Softplus.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -228,7 +215,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/30_Softsign.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -236,7 +222,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/31_ELU.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -244,7 +229,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/32_HardTanh.py
   input_shapes: [4096x393216]
   initializations: [rnd]
@@ -252,7 +236,6 @@
   dtypes: [f32, bf16]
   gflops: (4096 * 393216) / 1e9
 
-
 - kernel: level1/33_BatchNorm.py
   input_shapes: [64x64x512x512]
   initializations: [rnd]
@@ -260,7 +243,6 @@
   dtypes: [f32, bf16]
   gflops: (64 * 64 * 512 * 512) / 1e9
 
-
 - kernel: level1/34_InstanceNorm.py
   input_shapes: [112x64x512x512]
   initializations: [rnd]
@@ -268,7 +250,6 @@
   dtypes: [f32, bf16]
   gflops: (112 * 64 * 512 * 512) / 1e9
 
-
 - kernel: level1/35_GroupNorm_.py
   input_shapes: [112x64x512x512]
   initializations: [rnd]
@@ -276,7 +257,6 @@
   dtypes: [f32, bf16]
   gflops: (112 * 64 * 512 * 512) / 1e9
 
-
 - kernel: level1/36_RMSNorm_.py
   input_shapes: [112x64x512x512]
   initializations: [rnd]
@@ -284,7 +264,6 @@
   dtypes: [f32, bf16]
   gflops: (112 * 64 * 512 * 512) / 1e9
 
-
 - kernel: level1/37_FrobeniusNorm_.py
   input_shapes: [112x64x512x512]
   initializations: [rnd]
@@ -292,7 +271,6 @@
   dtypes: [f32, bf16]
   gflops: (112 * 64 * 512 * 512) / 1e9
 
-
 - kernel: level1/38_L1Norm_.py
   input_shapes: [32768x65535]
   initializations: [rnd]
@@ -300,7 +278,6 @@
   dtypes: [f32, bf16]
   gflops: (32768 * 65535) / 1e9
 
-
 - kernel: level1/39_L2Norm_.py
   input_shapes: [32768x65535]
   initializations: [rnd]
@@ -308,7 +285,6 @@
   dtypes: [f32, bf16]
   gflops: (32768 * 65535) / 1e9
 
-
 - kernel: level1/40_LayerNorm.py
   input_shapes: [16x64x256x256]
   initializations: [rnd]
@@ -316,7 +292,6 @@
   dtypes: [f32, bf16]
   gflops: (16 * 64 * 256 * 256) / 1e9
 
-
 - kernel: level1/41_Max_Pooling_1D.py
   input_shapes: [64x192x65536]
   initializations: [rnd]
@@ -624,6 +599,7 @@
   output_shape: 32768x32768
   dtypes: [f32, bf16]
   gflops: (32768 * 32768) / 1e9
+  warning: "error: Dialect `tm_tensor' not found for custom op 'tm_tensor.scan'"
 
 - kernel: level1/90_cumprod.py
   input_shapes: [32768x32768]
@@ -631,6 +607,7 @@
   output_shape: 32768x32768
   dtypes: [f32, bf16]
   gflops: (32768 * 32768) / 1e9
+  warning: "error: Dialect `tm_tensor' not found for custom op 'tm_tensor.scan'"
 
 - kernel: level1/91_cumsum_reverse.py
   input_shapes: [32768x32768]
@@ -638,6 +615,7 @@
   output_shape: 32768x32768
   dtypes: [f32, bf16]
   gflops: (32768 * 32768) / 1e9
+  warning: "error: Dialect `tm_tensor' not found for custom op 'tm_tensor.scan'"
 
 - kernel: level1/92_cumsum_exclusive.py
   input_shapes: [32768x32768]
@@ -645,6 +623,7 @@
   output_shape: 32767x32769
   dtypes: [f32, bf16]
   gflops: (32768 * 32768) / 1e9
+  warning: "error: Dialect `tm_tensor' not found for custom op 'tm_tensor.scan'"
 
 - kernel: level1/93_masked_cumsum.py
   input_shapes: [32768x32768, 32768x32768]
@@ -652,6 +631,7 @@
   output_shape: 32768x32768
   dtypes: [f32, bf16]
   gflops: (32768 * 32768) / 1e9
+  warning: "error: Dialect `tm_tensor' not found for custom op 'tm_tensor.scan'"
 
 - kernel: level1/94_MSELoss.py
   input_shapes: [32768x32768, 32768x32768]
@@ -666,6 +646,7 @@
   output_shape: "1"
   dtypes: [f32, bf16]
   gflops: (32768 * 4096) / 1e9
+  warning: "RuntimeError: gather(): Expected dtype int32/int64 for index, but got torch.float32"
 
 - kernel: level1/96_HuberLoss.py
   input_shapes: [32768x32768, 32768x32768]
@@ -673,6 +654,8 @@
   output_shape: "1"
   dtypes: [f32, bf16]
   gflops: (32768 * 32768) / 1e9
+  warning: '''ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError:
+              error: failed to legalize operation torch.operator that was explicitly marked illegal'''
 
 - kernel: level1/97_ScaledDotProductAttention.py
   input_shapes: [32x32x512x1024, 32x32x512x1024, 32x32x512x1024]
@@ -694,6 +677,8 @@
   output_shape: "1"
   dtypes: [f32, bf16]
   gflops: (32768 * 8192) / 1e9
+  warning: '''ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError:
+              error: failed to legalize operation torch.operator that was explicitly marked illegal'''
 
 - kernel: level1/100_HingeLoss.py
   input_shapes: [32768x32768, 32768]
diff --git a/examples/end-to-end/KernelBench/level2.yaml b/examples/end-to-end/KernelBench/level2.yaml
index 1a455195..d48e137f 100644
--- a/examples/end-to-end/KernelBench/level2.yaml
+++ b/examples/end-to-end/KernelBench/level2.yaml
@@ -15,6 +15,7 @@
   initializations: [rnd]
   output_shape: 32x64x16x32x32
   dtypes: [f32, bf16]
+  warning: "missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf"
 
 - kernel: level2/4_Conv2d_Mish_Mish.py
   input_shapes: [64x64x256x256]
@@ -39,6 +40,7 @@
   initializations: [rnd]
   output_shape: 64x32x30x62x62
   dtypes: [f32, bf16]
+  warning: "missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf"
 
 - kernel: level2/8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum.py
   input_shapes: [128x8x16x64x64]
@@ -112,6 +114,7 @@
   initializations: [rnd]
   output_shape: 128x64x258x258
   dtypes: [f32, bf16]
+  warning: "missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf"
 
 - kernel: level2/20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd.py
   input_shapes: [16x32x16x32x32]
@@ -203,6 +206,7 @@
   initializations: [rnd]
   output_shape: 32x64x32x64x64
   dtypes: [f32, bf16]
+  warning: "missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf"
 
 - kernel: level2/35_Conv2d_Subtract_HardSwish_MaxPool_Mish.py
   input_shapes: [128x64x128x128]
@@ -215,6 +219,7 @@
   initializations: [rnd]
   output_shape: 16x1x1x256
   dtypes: [f32, bf16]
+  warning: "missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf"
 
 - kernel: level2/37_Matmul_Swish_Sum_GroupNorm.py
   input_shapes: [32768x1024]
@@ -245,18 +250,21 @@
   initializations: [rnd]
   output_shape: 16384x4096
   dtypes: [f32, bf16]
+  warning: "missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf"
 
 - kernel: level2/42_ConvTranspose2d_GlobalAvgPool_BiasAdd_LogSumExp_Sum_Multiply.py
   input_shapes: [16x64x512x512]
   initializations: [rnd]
   output_shape: 16x1
   dtypes: [f32, bf16]
+  warning: "error: failed to legalize operation torch.constant.bool"
 
 - kernel: level2/43_Conv3d_Max_LogSumExp_ReLU.py
   input_shapes: [4x32x32x128x128]
   initializations: [rnd]
   output_shape: 4x1x16x64x64
   dtypes: [f32, bf16]
+  warning: "error: failed to legalize operation torch.constant.bool"
 
 - kernel: level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py
   input_shapes: [16x64x128x128]
@@ -313,6 +321,8 @@
   initializations: [rnd]
   output_shape: 64x128x126x126
   dtypes: [f32, bf16]
+  warning: '''ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError:
+              error: failed to legalize operation torch.operator that was explicitly marked illegal'''
 
 - kernel: level2/53_Gemm_Scaling_Hardtanh_GELU.py
   input_shapes: [2048x8192]
@@ -325,6 +335,7 @@
   initializations: [rnd]
   output_shape: 64x64x254x254
   dtypes: [f32, bf16]
+  warning: "missing `LLVMTranslationDialectInterface` registration for dialect for op: math.erf"
 
 - kernel: level2/55_Matmul_MaxPool_Sum_Scale.py
   input_shapes: [128x32768]
@@ -351,6 +362,7 @@
   initializations: [rnd]
   output_shape: 128x1x31x63x63
   dtypes: [f32, bf16]
+  warning: "'ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError: failed to legalize operation torch.constant.bool'"
 
 - kernel: level2/59_Matmul_Swish_Scaling.py
   input_shapes: [128x32768]
diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py
index e9aadf0b..c12e8483 100755
--- a/examples/end-to-end/KernelBench/test_kernel_bench.py
+++ b/examples/end-to-end/KernelBench/test_kernel_bench.py
@@ -58,6 +58,12 @@ def get_tests(args: argparse.Namespace) -> list[dict]:
         # If a specific test is specified, only include that test
         if args.test and not test["kernel"].startswith(args.test):
             continue
+        # CI mode runs fewer tests for faster feedback
+        if args.ci and len(test_list) >= 5:
+            break
+        # Smoke tests run on the simplest lowering
+        if args.smoke_test:
+            test["pipeline"] = str(kb_default_pipeline)
         for dtype in test["dtypes"]:
             if not args.bf16 and dtype == "bf16":
                 continue
@@ -78,9 +84,6 @@ def get_tests(args: argparse.Namespace) -> list[dict]:
                     "warning": test.get("warning", None),
                 }
             )
-            # CI mode runs fewer tests for faster feedback
-            if args.ci and len(test_list) >= 5:
-                return test_list
     return test_list
 
 
@@ -110,7 +113,7 @@ def get_flops_per_second(stdout: str, gflops: float) -> float:
     Parser.add_argument(
         "--ci",
         action=argparse.BooleanOptionalAction,
-        help="Enable CI mode (faster run, fewer kernels).",
+        help="Enable CI mode (faster run, fewer kernels). Incompatible with --smoke-test.",
     )
     Parser.add_argument(
         "--test",
@@ -122,7 +125,17 @@ def get_flops_per_second(stdout: str, gflops: float) -> float:
         action=argparse.BooleanOptionalAction,
         help="Whether to print the MLIR module after all stages. Default is False.",
     )
+    Parser.add_argument(
+        "--smoke-test",
+        action=argparse.BooleanOptionalAction,
+        help="Runs every kernel with loops lowering to pipe clean.",
+    )
     args = Parser.parse_args()
+    if args.smoke_test and args.ci:
+        print("\nERROR: Smoke test and CI mode are incompatible.\n")
+        Parser.print_help()
+        exit(1)
+
     tests = get_tests(args)
     if len(tests) == 0:
         if args.test:
@@ -181,7 +194,11 @@ def get_flops_per_second(stdout: str, gflops: float) -> float:
             print(result.stderr)
 
         print(f"Return code: {result.returncode}")
-        assert result.returncode == 0, "Execution failed"
+
+        # Only stop on failure on normal runs.
+        # Smoke tests try to run as much as possible.
+        if not args.smoke_test:
+            assert result.returncode == 0, "Execution failed"
 
 # CHECK: 1_Square_matrix_multiplication_.mlir
 # CHECK: 0.3745{{.*}} 0.9507{{.*}} 0.7319{{.*}} ... 0.2973{{.*}} 0.9243{{.*}} 0.9710{{.*}}